/** Returns the character offset at which the wrapped label begins (delegates to {@code label}). */
@Override public int beginPosition() { return label.beginPosition(); }
/**
 * Converts the token-indexed span of {@code item} into character offsets
 * using the stored 1-best segmentation, mutating and returning the same item.
 *
 * @param item the item whose {@code start}/{@code end} token indices are rewritten in place
 * @return the same item, with {@code start}/{@code end} now character offsets
 * @throws RuntimeException if no 1-best segmentation is available
 */
@Override
public Item convertItemSpan(Item item) {
  boolean missingSegmentation = bestSegmentationB == null || bestSegmentationB.isEmpty();
  if (missingSegmentation) {
    throw new RuntimeException(this.getClass().getName() + ": No 1best segmentation available");
  }
  // start token's begin offset; (end - 1) because item.end is an exclusive token index.
  item.start = bestSegmentationB.get(item.start).beginPosition();
  item.end = bestSegmentationB.get(item.end - 1).endPosition();
  return item;
}
private static Optional<CoreMap> overlapsWithMention(CoreMap needle, List<CoreMap> haystack) { List<CoreLabel> tokens = needle.get(CoreAnnotations.TokensAnnotation.class); int charBegin = tokens.get(0).beginPosition(); int charEnd = tokens.get(tokens.size()-1).endPosition(); return (haystack.stream().filter(mention_ -> { List<CoreLabel> tokens_ = mention_.get(CoreAnnotations.TokensAnnotation.class); int charBegin_ = tokens_.get(0).beginPosition(); int charEnd_ = tokens_.get(tokens_.size()-1).endPosition(); // Check overlap return !(charBegin_ > charEnd || charEnd_ < charBegin); }).findFirst()); }
/**
 * Renders tokens as space-separated {@code word{begin, end}} entries, where
 * begin/end are each token's character offsets.
 *
 * @param tokens tokens to render; may be empty
 * @return the formatted string, or "" for an empty list
 */
public static String tokensToString(List<CoreLabel> tokens) {
  // Fixed: use StringBuilder instead of the legacy synchronized StringBuffer,
  // and chained appends instead of building an intermediate concatenated String.
  StringBuilder os = new StringBuilder();
  boolean first = true;
  for (CoreLabel t : tokens) {
    if (!first) {
      os.append(' ');
    }
    os.append(t.word())
      .append('{')
      .append(t.beginPosition())
      .append(", ")
      .append(t.endPosition())
      .append('}');
    first = false;
  }
  return os.toString();
}
/**
 * Selects the tokens whose character spans touch the range
 * [{@code charBegin}, {@code charEnd}); tokens strictly before or after the
 * range are dropped.
 *
 * @param tokens the full token list to filter
 * @param charBegin inclusive character start of the range (must be &gt;= 0)
 * @param charEnd exclusive character end of the range
 * @return a new list of the tokens intersecting the range, in input order
 */
private static List<CoreLabel> tokensForCharacters(List<CoreLabel> tokens, int charBegin, int charEnd) {
  assert charBegin >= 0;
  List<CoreLabel> segment = Generics.newArrayList();
  for (CoreLabel token : tokens) {
    boolean endsBeforeRange = token.endPosition() < charBegin;
    boolean startsAfterRange = token.beginPosition() >= charEnd;
    if (!endsBeforeRange && !startsAfterRange) {
      segment.add(token);
    }
  }
  return segment;
}
/**
 * Joins all the tokens together (more or less) according to their original whitespace.
 * It assumes all whitespace was " ".
 *
 * @param tokens list of tokens which implement {@link HasOffset} and {@link HasWord}
 * @return a string of the tokens with the appropriate amount of spacing
 */
public static String joinWithOriginalWhiteSpace(List<CoreLabel> tokens) {
  if (tokens.isEmpty()) {
    return "";
  }
  CoreLabel previous = tokens.get(0);
  StringBuilder buffer = new StringBuilder(previous.word());
  for (int i = 1; i < tokens.size(); i++) {
    CoreLabel current = tokens.get(i);
    // Gap between consecutive tokens' character offsets; clamp negatives
    // (overlapping offsets) to zero spaces.
    int gap = Math.max(current.beginPosition() - previous.endPosition(), 0);
    buffer.append(repeat(' ', gap)).append(current.word());
    previous = current;
  }
  return buffer.toString();
}
/**
 * Tests whether {@code charIndex} falls inside the character span covered by
 * the given token range (inclusive on both ends).
 *
 * @param tokenRange pair of token indices (first, last) into the document's token list
 * @param charIndex the character offset to test
 * @return true iff the offset lies within [firstToken.begin, lastToken.end]
 */
public boolean rangeContainsCharIndex(Pair<Integer, Integer> tokenRange, int charIndex) {
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  int rangeBegin = tokens.get(tokenRange.first()).beginPosition();
  int rangeEnd = tokens.get(tokenRange.second()).endPosition();
  return rangeBegin <= charIndex && charIndex <= rangeEnd;
}
// Split a compound token into two pieces at secondOffset (relative to the
// token's begin offset). NOTE(review): secondOffset/secondLength are
// presumably computed by the enclosing method — confirm against full source.
int secondStart = cl.beginPosition() + secondOffset;
int secondEnd = secondStart + secondLength;
// Queue the second piece for later emission; return the first piece, which
// spans from the token's begin up to the split point.
compoundBuffer.add(copyCoreLabel(cl, second, secondStart, secondEnd));
return copyCoreLabel(cl, first, cl.beginPosition(), secondStart);
/**
 * Splits a compound marked by the lexer into its parts, assigning each part
 * character offsets laid out end-to-end from the original token's begin
 * offset. All parts are queued in {@code compoundBuffer}; the first part is
 * removed from the buffer and returned.
 */
private CoreLabel processCompound(CoreLabel cl) {
  cl.remove(ParentAnnotation.class);
  // Pad dashes with spaces so they become standalone parts, then split on whitespace.
  String[] parts = pSpace.split(pDash.matcher(cl.word()).replaceAll(" - "));
  int offset = 0;
  for (String part : parts) {
    int begin = cl.beginPosition() + offset;
    int end = begin + part.length();
    CoreLabel pieceLabel = new CoreLabel(cl);
    pieceLabel.setWord(part);
    pieceLabel.setValue(part);
    pieceLabel.setBeginPosition(begin);
    pieceLabel.setEndPosition(end);
    pieceLabel.set(OriginalTextAnnotation.class, part);
    compoundBuffer.add(pieceLabel);
    offset += part.length();
  }
  return compoundBuffer.remove(0);
}
// Emit the label's character offsets as tab-separated columns: begin <TAB> end.
sb.append(label.beginPosition());
sb.append("\t");
sb.append(label.endPosition());
/** * set isNewline() */ private static void setNewlineStatus(List<CoreLabel> tokensList) { // label newlines for (CoreLabel token : tokensList) { if (token.word().equals(AbstractTokenizer.NEWLINE_TOKEN) && (token.endPosition() - token.beginPosition() == 1)) token.set(CoreAnnotations.IsNewlineAnnotation.class, true); else token.set(CoreAnnotations.IsNewlineAnnotation.class, false); } }
// Split a compound token into two pieces at secondOffset (relative to the
// token's begin offset). NOTE(review): duplicate of an earlier fragment in
// this file; secondOffset/secondLength come from the enclosing method.
int secondStart = cl.beginPosition() + secondOffset;
int secondEnd = secondStart + secondLength;
// Queue the second piece; return the first piece spanning up to the split point.
compoundBuffer.add(copyCoreLabel(cl, second, secondStart, secondEnd));
return copyCoreLabel(cl, first, cl.beginPosition(), secondStart);
public String tokenRangeToString(Pair<Integer, Integer> tokenRange) { List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class); // see if the token range matches an entity mention List<CoreMap> entityMentionsInDoc = doc.get(CoreAnnotations.MentionsAnnotation.class); Integer potentialMatchingEntityMentionIndex = tokens.get(tokenRange.first).get(CoreAnnotations.EntityMentionIndexAnnotation.class); CoreMap potentialMatchingEntityMention = null; if (entityMentionsInDoc != null && potentialMatchingEntityMentionIndex != null) { potentialMatchingEntityMention = entityMentionsInDoc.get(potentialMatchingEntityMentionIndex); } // if there is a matching entity mention, return it's text (which has been processed to remove // things like newlines and xml)...if there isn't return the full substring of the document text if (potentialMatchingEntityMention != null && potentialMatchingEntityMention.get( CoreAnnotations.CharacterOffsetBeginAnnotation.class) == tokens.get(tokenRange.first).beginPosition() && potentialMatchingEntityMention.get( CoreAnnotations.CharacterOffsetEndAnnotation.class) == tokens.get(tokenRange.second).endPosition()) { return potentialMatchingEntityMention.get(CoreAnnotations.TextAnnotation.class); } else { return doc.get(CoreAnnotations.TextAnnotation.class).substring( tokens.get(tokenRange.first).beginPosition(), tokens.get(tokenRange.second).endPosition()); } }
text.append(token.before());
assert last != null;
// Whitespace implied by the character offsets that token.before() did not
// account for: the gap between the previous token's end and this token's
// begin, minus the before-text already appended. Pad with plain spaces.
// NOTE(review): the loop body is cut off here — the decrement presumably
// follows on the next (unseen) line.
int missingWhitespace = (token.beginPosition() - last.endPosition()) - token.before().length();
while (missingWhitespace > 0) {
  text.append(' ');
System.out.println(p.name + " " + weight + " name");
} else if (mention.type.equals(PRONOUN)) {
  // Pronoun mention: resolve the speaker via coreference, keyed by the
  // character offset at which the mention's first token begins.
  int charBeginKey = doc.get(CoreAnnotations.TokensAnnotation.class).get(mention.begin).beginPosition();
  Person p = doCoreference(charBeginKey, quote);
  if (p != null) {
// NOTE(review): this fragment appears corrupted in extraction — the stray
// argument list "offsetEnd, token.endPosition()-offsetEnd));" has no call
// site and the "} else {" has no visible matching "if"; the branch structure
// that splits a token around chunk character offsets must be reconstructed
// from the upstream source before any edit. Code left byte-identical.
CoreLabel token = tokens.get(i); for (IntPair offsets:chunkCharOffsets) { assert(token.beginPosition() >= 0); assert(token.endPosition() >= 0); int offsetBegin = offsets.getSource(); output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,offsetEnd), offsetBegin, offsetEnd-offsetBegin)); offsetEnd, token.endPosition()-offsetEnd)); } else { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,token.endPosition()), offsetBegin, token.endPosition()-offsetBegin)); output.add(tokenFactory.makeToken(text.substring(token.beginPosition(),offsetEnd), token.beginPosition(), offsetEnd-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd));
// The tokens matched by capture group 1 (the person portion of a title+person pattern).
List<CoreMap> personWithinMatch = titlePersonMatcher.groupNodes(1);
// Only accept the match when the pattern consumed the whole coref mention.
if (overallMatch.size() == corefMentionTokens.size()) {
  int personBeginOffset = ((CoreLabel) personWithinMatch.get(0)).beginPosition();
  int personEndOffset = ((CoreLabel) personWithinMatch.get(personWithinMatch.size() - 1)).endPosition();
  // Fixed: raw-type "new Pair(...)" produced an unchecked warning; use the
  // diamond operator so the pair is properly typed.
  Pair<Integer, Integer> personOffsets = new Pair<>(personBeginOffset, personEndOffset);
// Print the mention with its character span as "[begin, end)". The -1 / -2
// adjustments suggest startIndex/endIndex are 1-based with an exclusive end —
// NOTE(review): confirm the indexing convention against the Mention class.
out.println(" " + m + ":[" + tokens.get(m.startIndex - 1).beginPosition() + ", " + tokens.get(m.endIndex - 2).endPosition() + ')');
/** * Handles verbs with attached suffixes, marked by the lexer: * * Escribamosela => Escribamo + se + la => escribamos + se + la * Sentaos => senta + os => sentad + os * Damelo => da + me + lo * */ private CoreLabel processVerb(CoreLabel cl) { cl.remove(ParentAnnotation.class); SpanishVerbStripper.StrippedVerb stripped = verbStripper.separatePronouns(cl.word()); if (stripped == null) { return cl; } // Split the CoreLabel into separate labels, tracking changing begin + end // positions. int stemEnd = cl.beginPosition() + stripped.getOriginalStem().length(); int lengthRemoved = 0; for (String pronoun : stripped.getPronouns()) { int beginOffset = stemEnd + lengthRemoved; compoundBuffer.add(copyCoreLabel(cl, pronoun, beginOffset)); lengthRemoved += pronoun.length(); } CoreLabel stem = copyCoreLabel(cl, stripped.getStem(), cl.beginPosition(), stemEnd); stem.setOriginalText(stripped.getOriginalStem()); return stem; }
/**
 * Segments a raw input line into tokens. The line is first labeled with IOB
 * tags; each resulting token span is then materialized as a CoreLabel
 * carrying the segmented text, the original substring, and character offsets.
 *
 * @param line the raw input string to segment
 * @return the segmented tokens, in order
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
  List<CoreLabel> tokenList = CollectionUtils.makeList();
  List<CoreLabel> labeledSequence = segmentStringToIOB(line);
  for (IntPair span : IOBUtils.TokenSpansForIOB(labeledSequence)) {
    String text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker,
        span.getSource(), span.getTarget());
    // Character span of the token in the original line (end index exclusive
    // token -> last label is at getTarget() - 1).
    int start = labeledSequence.get(span.getSource()).beginPosition();
    int end = labeledSequence.get(span.getTarget() - 1).endPosition();
    CoreLabel token = new CoreLabel();
    token.setWord(text);
    token.setValue(text);
    token.set(CoreAnnotations.TextAnnotation.class, text);
    token.set(CoreAnnotations.ArabicSegAnnotation.class, "1");
    token.setOriginalText(line.substring(start, end));
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, start);
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    tokenList.add(token);
  }
  return tokenList;
}