/**
 * Returns the original text of this object by delegating to the wrapped
 * {@code label}.
 *
 * @return whatever {@code label.originalText()} returns
 */
@Override
public String originalText() {
  final String delegated = label.originalText();
  return delegated;
}
private void prepareHeidelTimeInput(PrintWriter stream, CoreMap document) { // We really should use the full text annotation because our cleanxml can be useless. for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) { for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { String text = token.originalText(); stream.append(TRANSLATE.getOrDefault(text, text)); // HACK: will not handle contractions like "del = de + el" properly -- will be deel. // stream.append(token.after().length() > 0 ? " " : ""); // HACK: will not handle things like 12-abr-2011 which are chunked up properly into 12 - abr-2011. stream.append(" "); } stream.append("\n"); } }
protected static void findHeadChinese(List<CoreLabel> sent, Mention m) { int headPos = m.endIndex - 1; // Skip trailing punctuations while (headPos > m.startIndex && sent.get(headPos).tag().equals("PU")) { headPos--; } // If we got right to the end without finding non punctuation, reset to end again if (headPos == m.startIndex && sent.get(headPos).tag().equals("PU")) { headPos = m.endIndex - 1; } if (sent.get(headPos).originalText().equals("自己") && m.endIndex != m.startIndex && headPos > m.startIndex) { if (!sent.get(headPos-1).tag().equals("PU")) headPos--; } m.headIndex = headPos; m.headWord = sent.get(headPos); m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class); }
if (tokens.size() > 0) { CoreLabel token = tokens.get(0); if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); } last = tokens.get(0); if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); } last = token;
/**
 * Converts a tree to the Morfette training format.
 * <p>
 * Emits one line per leaf of the form {@code value lemma morph}. The morph column is
 * the leaf's original text, or the preterminal's value when that is null or empty;
 * the lemma column is the leaf's lemma, or the leaf's value when that is null or empty.
 *
 * @param tree tree whose leaves and preterminals carry {@code CoreLabel}s
 * @return the formatted lines, each ended with the platform line separator
 */
private static String treeToMorfette(Tree tree) {
  StringBuilder out = new StringBuilder();
  List<Label> words = tree.yield();
  List<Label> tags = tree.preTerminalYield();
  assert words.size() == tags.size();

  final int numWords = words.size();
  int idx = 0;
  while (idx < numWords) {
    CoreLabel wordLabel = (CoreLabel) words.get(idx);
    CoreLabel tagLabel = (CoreLabel) tags.get(idx);

    // Morphological analysis, falling back to the POS tag
    String morph = wordLabel.originalText();
    if (morph == null || morph.isEmpty()) {
      morph = tagLabel.value();
    }

    // Lemma, falling back to the surface form
    String lemma = wordLabel.lemma();
    if (lemma == null || lemma.isEmpty()) {
      lemma = wordLabel.value();
    }

    out.append(wordLabel.value())
       .append(" ")
       .append(lemma)
       .append(" ")
       .append(morph)
       .append(System.lineSeparator());
    idx++;
  }
  return out.toString();
}
/** helper method for creating version of document text without xml. **/ public static String xmlFreeText(String documentText, Annotation annotation) { int firstTokenCharIndex = annotation.get(CoreAnnotations.TokensAnnotation.class).get(0).get( CoreAnnotations.CharacterOffsetBeginAnnotation.class); // add white space for all text before first token String cleanedText = documentText.substring(0,firstTokenCharIndex).replaceAll("\\S", " "); int tokenIndex = 0; List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); for (CoreLabel token : tokens) { // add the current token's text cleanedText += token.originalText(); // add whitespace for non-tokens and xml in between these tokens tokenIndex += 1; if (tokenIndex < tokens.size()) { CoreLabel nextToken = tokens.get(tokenIndex); int inBetweenStart = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); int inBetweenEnd = nextToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); String inBetweenTokenText = documentText.substring(inBetweenStart, inBetweenEnd); inBetweenTokenText = inBetweenTokenText.replaceAll("\\S", " "); cleanedText += inBetweenTokenText; } } // add white space for all non-token content after last token cleanedText += documentText.substring( cleanedText.length(), documentText.length()).replaceAll("\\S", " "); return cleanedText; }
sb.append(label.word()); sb.append("\t"); sb.append(label.originalText()); sb.append("\t"); if (outputLemmas) {
if (text.length() == 0) break; tokenEndIdx++; String matchStr = token.originalText().trim();
for(Label label : leaves) { if(label instanceof CoreLabel) morphAnalyses.add(((CoreLabel) label).originalText());
CoreLabel rawToken = (CoreLabel) yield.get(i); String word = rawToken.value(); String morphStr = rawToken.originalText(); Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr); String lemma = lemmaMorph.first();
private static void replacePOSTags(Tree tree) { List<Label> yield = tree.yield(); List<Label> preYield = tree.preTerminalYield(); assert yield.size() == preYield.size(); MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification(); for(int i = 0; i < yield.size(); i++) { // Morphological Analysis String morphStr = ((CoreLabel) yield.get(i)).originalText(); if (morphStr == null || morphStr.equals("")) { morphStr = preYield.get(i).value(); // POS subcategory String subCat = ((CoreLabel) yield.get(i)).category(); if (subCat != null && subCat != "") { morphStr += "-" + subCat + "--"; } else { morphStr += "---"; } } MorphoFeatures feats = spec.strToFeatures(morphStr); if(feats.getAltTag() != null && !feats.getAltTag().equals("")) { CoreLabel cl = (CoreLabel) preYield.get(i); cl.setValue(feats.getAltTag()); cl.setTag(feats.getAltTag()); } } }
String morphStr = coreLabel.originalText(); if(morphStr == null || morphStr.equals("")) { morphStr = MorphoFeatureSpecification.NO_ANALYSIS;
String featureStr = ((CoreLabel) yield.get(i)).originalText(); Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureStr); String lemma = lemmaMorph.first();
@Override public Tree transformTree(Tree t, Tree root) { String baseCat = t.value(); StringBuilder newCategory = new StringBuilder(); //Add manual state splits for (Pair<TregexPattern,Function<TregexMatcher,String>> e : activeAnnotations) { TregexMatcher m = e.first().matcher(root); if (m.matchesAt(t)) newCategory.append(e.second().apply(m)); } // WSGDEBUG //Add morphosyntactic features if this is a POS tag if(t.isPreTerminal() && tagSpec != null) { if( !(t.firstChild().label() instanceof CoreLabel) || ((CoreLabel) t.firstChild().label()).originalText() == null ) throw new RuntimeException(String.format("%s: Term lacks morpho analysis: %s",this.getClass().getName(),t.toString())); String morphoStr = ((CoreLabel) t.firstChild().label()).originalText(); MorphoFeatures feats = tagSpec.strToFeatures(morphoStr); baseCat = feats.getTag(baseCat); } //Update the label(s) String newCat = baseCat + newCategory.toString(); t.setValue(newCat); if (t.isPreTerminal() && t.label() instanceof HasTag) ((HasTag) t.label()).setTag(newCat); return t; }
@Override public Tree transformTree(Tree t, Tree root) { // Perform tregex-powered annotations t = super.transformTree(t, root); String cat = t.value(); //Add morphosyntactic features if this is a POS tag if(t.isPreTerminal() && tagSpec != null) { if( !(t.firstChild().label() instanceof CoreLabel) || ((CoreLabel) t.firstChild().label()).originalText() == null ) throw new RuntimeException(String.format("%s: Term lacks morpho analysis: %s",this.getClass().getName(),t.toString())); String morphoStr = ((CoreLabel) t.firstChild().label()).originalText(); Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString("", morphoStr); MorphoFeatures feats = tagSpec.strToFeatures(lemmaMorph.second()); cat = feats.getTag(cat); } //Update the label(s) t.setValue(cat); if (t.isPreTerminal() && t.label() instanceof HasTag) ((HasTag) t.label()).setTag(cat); return t; }
s += word.originalText()+word.after(); prev.set(CoreAnnotations.AfterAnnotation.class, s); prevString += word.before() + word.originalText(); s += word.originalText()+word.after(); prev.set(CoreAnnotations.AfterAnnotation.class, s); prevString += word.before() + word.originalText(); prevString += word.before() + word.originalText(); } else if (word.word().equalsIgnoreCase("</doc>")) { String s = prev.get(CoreAnnotations.AfterAnnotation.class); s += word.originalText(); prev.set(CoreAnnotations.AfterAnnotation.class, s); } else { if (prev != null) { String s = prev.get(CoreAnnotations.AfterAnnotation.class); s += word.originalText() + word.after(); prev.set(CoreAnnotations.AfterAnnotation.class, s); prevString += word.before() + word.originalText(); CoreLabel wi = new CoreLabel(); wi.setWord(word.word()); wi.set(CoreAnnotations.OriginalTextAnnotation.class, word.originalText()); wi.set(CoreAnnotations.BeforeAnnotation.class, prevString+word.before()); wi.set(CoreAnnotations.AfterAnnotation.class, word.after());
String contextStr = getCoreLabel(start).originalText(); float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr); if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
String morphStr = childLabel.originalText(); if (morphStr == null || morphStr.equals("")) { morphStr = label.value();
String contextStr = getCoreLabel(start).originalText(); float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr); if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
String featureStr = ((CoreLabel) yield.get(i)).originalText(); Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureStr); String lemma = lemmaMorph.first();