/** Returns the character offset one past the end of the wrapped {@code label}. */
@Override public int endPosition() { return label.endPosition(); }
/**
 * Converts the item's span from token indices (over the 1-best segmentation)
 * to character offsets, mutating and returning the same {@code Item}.
 *
 * @param item the item whose {@code start}/{@code end} are token indices (end exclusive)
 * @return the same item with {@code start}/{@code end} rewritten as character offsets
 * @throws IllegalStateException if no 1-best segmentation is available
 */
@Override public Item convertItemSpan(Item item) {
  if (bestSegmentationB == null || bestSegmentationB.isEmpty()) {
    // IllegalStateException is the idiomatic RuntimeException for a missing precondition;
    // callers catching RuntimeException still catch it.
    throw new IllegalStateException(this.getClass().getName() + ": No 1best segmentation available");
  }
  // item.end is exclusive over tokens, so the last covered token is at item.end - 1
  item.start = bestSegmentationB.get(item.start).beginPosition();
  item.end = bestSegmentationB.get(item.end - 1).endPosition();
  return item;
}
private static Optional<CoreMap> overlapsWithMention(CoreMap needle, List<CoreMap> haystack) { List<CoreLabel> tokens = needle.get(CoreAnnotations.TokensAnnotation.class); int charBegin = tokens.get(0).beginPosition(); int charEnd = tokens.get(tokens.size()-1).endPosition(); return (haystack.stream().filter(mention_ -> { List<CoreLabel> tokens_ = mention_.get(CoreAnnotations.TokensAnnotation.class); int charBegin_ = tokens_.get(0).beginPosition(); int charEnd_ = tokens_.get(tokens_.size()-1).endPosition(); // Check overlap return !(charBegin_ > charEnd || charEnd_ < charBegin); }).findFirst()); }
/**
 * Renders tokens as space-separated {@code word{begin, end}} entries showing
 * each token's character offsets.
 *
 * @param tokens the tokens to render (may be empty, yielding "")
 * @return a single-line description of the tokens and their offsets
 */
public static String tokensToString(List<CoreLabel> tokens) {
  // StringBuilder: no need for StringBuffer's synchronization on a method-local buffer
  StringBuilder os = new StringBuilder();
  boolean first = true;
  for (CoreLabel t : tokens) {
    if (!first) {
      os.append(' ');
    }
    // Chain appends instead of building an intermediate concatenated String
    os.append(t.word()).append('{').append(t.beginPosition())
      .append(", ").append(t.endPosition()).append('}');
    first = false;
  }
  return os.toString();
}
/**
 * Returns the tokens whose character spans intersect the range
 * [{@code charBegin}, {@code charEnd}).
 *
 * @param tokens the candidate tokens, in order
 * @param charBegin inclusive start character offset (must be non-negative)
 * @param charEnd exclusive end character offset
 * @return the overlapping tokens, in their original order
 */
private static List<CoreLabel> tokensForCharacters(List<CoreLabel> tokens, int charBegin, int charEnd) {
  assert charBegin >= 0;
  List<CoreLabel> segment = Generics.newArrayList();
  for (CoreLabel token : tokens) {
    // Positive form of the original skip test: keep tokens that overlap the range
    if (token.endPosition() >= charBegin && token.beginPosition() < charEnd) {
      segment.add(token);
    }
  }
  return segment;
}
/**
 * Joins all the tokens together (more or less) according to their original whitespace.
 * It assumes all whitespace was " ".
 *
 * @param tokens list of tokens which implement {@link HasOffset} and {@link HasWord}
 * @return a string of the tokens with the appropriate amount of spacing
 */
public static String joinWithOriginalWhiteSpace(List<CoreLabel> tokens) {
  if (tokens.isEmpty()) {
    return "";
  }
  CoreLabel previous = tokens.get(0);
  StringBuilder text = new StringBuilder(previous.word());
  for (int idx = 1; idx < tokens.size(); idx++) {
    CoreLabel current = tokens.get(idx);
    // Gap between adjacent tokens in the original text; clamp at zero for overlapping offsets
    int gap = Math.max(0, current.beginPosition() - previous.endPosition());
    text.append(repeat(' ', gap)).append(current.word());
    previous = current;
  }
  return text.toString();
}
/**
 * Reports whether {@code charIndex} falls within (inclusive on both ends) the
 * character span covered by the document tokens at positions {@code tokenRange}.
 *
 * @param tokenRange first/last token indices into the document's token list
 * @param charIndex the character offset to test
 * @return true if the offset lies inside the tokens' character span
 */
public boolean rangeContainsCharIndex(Pair<Integer, Integer> tokenRange, int charIndex) {
  List<CoreLabel> docTokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  int spanBegin = docTokens.get(tokenRange.first()).beginPosition();
  int spanEnd = docTokens.get(tokenRange.second()).endPosition();
  return spanBegin <= charIndex && charIndex <= spanEnd;
}
// Emit the label's character offsets as tab-separated fields, terminated by a newline
sb.append(label.beginPosition()); sb.append("\t"); sb.append(label.endPosition()); sb.append("\n");
/** * set isNewline() */ private static void setNewlineStatus(List<CoreLabel> tokensList) { // label newlines for (CoreLabel token : tokensList) { if (token.word().equals(AbstractTokenizer.NEWLINE_TOKEN) && (token.endPosition() - token.beginPosition() == 1)) token.set(CoreAnnotations.IsNewlineAnnotation.class, true); else token.set(CoreAnnotations.IsNewlineAnnotation.class, false); } }
/**
 * Renders a token range as display text. If the range's character span exactly
 * matches an entity mention's span, the mention's cleaned text is returned;
 * otherwise the raw substring of the document text over that span is returned.
 */
public String tokenRangeToString(Pair<Integer, Integer> tokenRange) {
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  // see if the token range matches an entity mention
  List<CoreMap> entityMentionsInDoc = doc.get(CoreAnnotations.MentionsAnnotation.class);
  Integer potentialMatchingEntityMentionIndex =
      tokens.get(tokenRange.first).get(CoreAnnotations.EntityMentionIndexAnnotation.class);
  CoreMap potentialMatchingEntityMention = null;
  if (entityMentionsInDoc != null && potentialMatchingEntityMentionIndex != null) {
    potentialMatchingEntityMention = entityMentionsInDoc.get(potentialMatchingEntityMentionIndex);
  }
  // if there is a matching entity mention, return its text (which has been processed to remove
  // things like newlines and xml)...if there isn't return the full substring of the document text
  // NOTE: the boxed Integer annotations are compared with == against int offsets; the int
  // operand forces unboxing, so these are value comparisons, not identity comparisons.
  if (potentialMatchingEntityMention != null && potentialMatchingEntityMention.get(
      CoreAnnotations.CharacterOffsetBeginAnnotation.class) == tokens.get(tokenRange.first).beginPosition()
      && potentialMatchingEntityMention.get(
      CoreAnnotations.CharacterOffsetEndAnnotation.class) == tokens.get(tokenRange.second).endPosition()) {
    return potentialMatchingEntityMention.get(CoreAnnotations.TextAnnotation.class);
  } else {
    return doc.get(CoreAnnotations.TextAnnotation.class).substring(
        tokens.get(tokenRange.first).beginPosition(),
        tokens.get(tokenRange.second).endPosition());
  }
}
text.append(token.before()); assert last != null; int missingWhitespace = (token.beginPosition() - last.endPosition()) - token.before().length(); while (missingWhitespace > 0) { text.append(' ');
if (overallMatch.size() == corefMentionTokens.size()) { int personBeginOffset = ((CoreLabel) personWithinMatch.get(0)).beginPosition(); int personEndOffset = ((CoreLabel) personWithinMatch.get(personWithinMatch.size()-1)).endPosition(); Pair<Integer,Integer> personOffsets = new Pair(personBeginOffset, personEndOffset); kbpMentionFound = kbpMentions.get(personOffsets);
for (IntPair offsets:chunkCharOffsets) { assert(token.beginPosition() >= 0); assert(token.endPosition() >= 0); int offsetBegin = offsets.getSource(); int offsetEnd = offsets.getTarget(); output.add(tokenFactory.makeToken(text.substring(offsetBegin,offsetEnd), offsetBegin, offsetEnd-offsetBegin)); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,token.endPosition()), offsetBegin, token.endPosition()-offsetBegin)); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else {
// Record the completed name, then begin tracking the current token as the next candidate name
names.incrementCount(prevName, 1); prevName = token.word(); prevEnd = token.endPosition();
tokens.get(m.endIndex - 2).endPosition() + ')');
origWord = tokenLabel.word(); } else { origWord = origText.substring(cl.beginPosition(), cl.endPosition()); iobList.get(iobList.size() - 1).setEndPosition(cl.endPosition());
if (endRelCharOffset > annoText.length()) { endRelCharOffset = annoText.length(); } if (allowPartialTokens) { while (i < annoTokens.size() && annoTokens.get(i).endPosition() <= beginCharOffset) { i++; while (j < annoTokens.size() && annoTokens.get(j).endPosition() <= endCharOffset) { assert(annoTokens.get(j).beginPosition() >= beginCharOffset); j++;
// Collect every token that ends at or before the quote's end character offset
while (i < tokens.size() && tokens.get(i).endPosition() <= end) { quoteTokens.add(tokens.get(i)); i++;
", " + tokens.get(m.endIndex - 2).endPosition() + ')');
/**
 * Segments an input line into tokens by reassembling the IOB-labeled character
 * sequence into {@code CoreLabel} tokens carrying word/value/text annotations
 * and character offsets into the original line.
 *
 * @param line the raw input string to segment
 * @return the segmented tokens, in order of appearance
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
  List<CoreLabel> tokenList = CollectionUtils.makeList();
  List<CoreLabel> labeledSequence = segmentStringToIOB(line);
  for (IntPair span : IOBUtils.TokenSpansForIOB(labeledSequence)) {
    String text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker,
        span.getSource(), span.getTarget());
    // Character span of this token in the original line (span end is exclusive over labels)
    int charBegin = labeledSequence.get(span.getSource()).beginPosition();
    int charEnd = labeledSequence.get(span.getTarget() - 1).endPosition();
    CoreLabel token = new CoreLabel();
    token.setWord(text);
    token.setValue(text);
    token.set(CoreAnnotations.TextAnnotation.class, text);
    token.set(CoreAnnotations.ArabicSegAnnotation.class, "1");
    token.setOriginalText(line.substring(charBegin, charEnd));
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);
    tokenList.add(token);
  }
  return tokenList;
}