/**
 * Applies the given attribute name/value pairs to every chunk in the list.
 *
 * @param chunks     chunks to annotate
 * @param attributes attribute names and values to set on each chunk
 */
public static void annotateChunks(List<? extends CoreMap> chunks, Map<String,String> attributes) {
  chunks.forEach(chunk -> annotateChunk(chunk, attributes));
}
/**
 * Creates a new chunk Annotation with basic chunk information, then annotates
 * its text and its tokens.
 * <ul>
 * <li>CharacterOffsetBeginAnnotation - CharacterOffsetBeginAnnotation of first token in chunk</li>
 * <li>CharacterOffsetEndAnnotation - CharacterOffsetEndAnnotation of last token in chunk</li>
 * <li>TokensAnnotation - tokens in this chunk</li>
 * <li>TokenBeginAnnotation - tokenStartIndex + totalTokenOffset (index into the original token list)</li>
 * <li>TokenEndAnnotation - tokenEndIndex + totalTokenOffset (index into the original token list)</li>
 * <li>TextAnnotation - text extracted via the tokenTextKey of each token</li>
 * </ul>
 *
 * @param tokens           tokens to build the chunk from
 * @param tokenStartIndex  index (relative to {@code tokens}) at which this chunk starts
 * @param tokenEndIndex    index (relative to {@code tokens}) at which this chunk ends (exclusive)
 * @param totalTokenOffset offset to add to the token indices
 * @param tokenChunkKey    if not null, each token is annotated with the chunk using this key
 * @param tokenTextKey     key used to find each token's text
 * @param tokenLabelKey    if not null, each token is annotated with the chunk's text using this key
 * @return Annotation representing the new chunk
 */
public static Annotation getAnnotatedChunk(List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex,
                                           int totalTokenOffset,
                                           Class tokenChunkKey, Class tokenTextKey, Class tokenLabelKey) {
  final Annotation chunk = getAnnotatedChunk(tokens, tokenStartIndex, tokenEndIndex, totalTokenOffset);
  annotateChunkText(chunk, tokenTextKey);
  annotateChunkTokens(chunk, tokenChunkKey, tokenLabelKey);
  return chunk;
}
/**
 * Annotates the chunks covered by the given match group with the supplied
 * attributes. Does nothing when the group has no match (start &lt; 0).
 *
 * @param group      match group index
 * @param attributes attribute names and values to set on the matched chunks
 */
public void annotateGroup(int group, Map<String,String> attributes) {
  final int groupStart = start(group);
  if (groupStart < 0) {
    return;  // group did not participate in the match
  }
  ChunkAnnotationUtils.annotateChunks(elements, groupStart, end(group), attributes);
}
/**
 * Creates a new chunk Annotation with basic chunk information (see the
 * three-argument overload), then annotates the chunk's tokens.
 *
 * @param annotation      annotation from which to extract the text for this chunk
 * @param tokenStartIndex index (relative to the annotation's tokens) at which this chunk starts
 * @param tokenEndIndex   index (relative to the annotation's tokens) at which this chunk ends (exclusive)
 * @param tokenChunkKey   if not null, each token is annotated with the chunk using this key
 * @param tokenLabelKey   if not null, each token is annotated with the chunk's text using this key
 * @return Annotation representing the new chunk
 */
public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex,
                                           Class tokenChunkKey, Class tokenLabelKey) {
  final Annotation chunk = getAnnotatedChunk(annotation, tokenStartIndex, tokenEndIndex);
  annotateChunkTokens(chunk, tokenChunkKey, tokenLabelKey);
  return chunk;
}
/** * Create a new chunk Annotation with basic chunk information * CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk * CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk * TokensAnnotation - List of tokens in this chunk * TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens) * tokenStartIndex + annotation's TokenBeginAnnotation * TokenEndAnnotation - Index of last token in chunk (index in original list of tokens) * tokenEndIndex + annotation's TokenBeginAnnotation * TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk * @param annotation - Annotation from which to extract the text for this chunk * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive) * @return Annotation representing new chunk */ public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex) { Integer annoTokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); if (annoTokenBegin == null) { annoTokenBegin = 0; } List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); Annotation chunk = getAnnotatedChunk(tokens, tokenStartIndex, tokenEndIndex, annoTokenBegin); boolean annotatedTextFromCharOffsets = annotateChunkText(chunk, annotation); if (!annotatedTextFromCharOffsets) { // Use tokens to get text annotation annotateChunkText(chunk, CoreAnnotations.TextAnnotation.class); } return chunk; }
sentence = sentences.get(i); Integer firstNonWsCharOffset = getFirstNonWsCharOffset(sentence, false); if (firstNonWsCharOffset != null && firstNonWsCharOffset >= offsetEnd) { mergeChunks(sentences, text, startSentIndex, i+1); if (entityAtSentEnd) { CoreMap nextSentence = sentences.get(i+1); Character c = getFirstNonWsChar(nextSentence); if (c != null) { doMerge = !Character.isUpperCase(c); mergeChunks(sentences, text, i, i+2); CoreMap sentence = sentences.get(i); CoreMap nextSentence = sentences.get(i+1); String sentTrimmedText = getTrimmedText(sentence); String nextSentTrimmedText = getTrimmedText(nextSentence); if (sentTrimmedText.length() <= 1 || nextSentTrimmedText.length() <= 1) { Character c = getFirstNonWsChar(nextSentence); mergeChunks(sentences, text, i, i+2); } else { i++;
/**
 * Extracts the CoreMap for this expression from the given source annotation.
 * Side effects: assigns this object's {@code annotation}, {@code chunkOffsets},
 * {@code charOffsets}, {@code tokenOffsets}, {@code text}, and {@code temporal}
 * fields, and annotates the extracted chunk with its children, this expression,
 * and (when available) its text.
 *
 * @param sourceAnnotation annotation to extract the chunk from; expected to
 *                         carry NumerizedTokensAnnotation (and, for the
 *                         char-offset path, CharacterOffsetBeginAnnotation)
 * @return the extracted chunk annotation (also stored in {@code annotation})
 */
public CoreMap extractAnnotation(CoreMap sourceAnnotation) {
  if (chunkOffsets != null) {
    // Token offsets are already known: merge the numerized tokens over the chunk span.
    annotation = ChunkAnnotationUtils.getMergedChunk(sourceAnnotation.get(CoreAnnotations.NumerizedTokensAnnotation.class),
        chunkOffsets.getBegin(), chunkOffsets.getEnd(),
        CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS );
    if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
      ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation);
    }
    // Derive character and token offset intervals from the merged chunk.
    charOffsets = Interval.toInterval(annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
        annotation.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class),
        annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
  } else {
    // Only character offsets are known: first convert them to token offsets.
    // charOffsets here appear to be relative to the source annotation's begin
    // offset (hence the baseCharOffset shift) — NOTE(review): confirm against callers.
    Integer baseCharOffset = sourceAnnotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    if (baseCharOffset == null) { baseCharOffset = 0; };
    chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets(sourceAnnotation.get(CoreAnnotations.NumerizedTokensAnnotation.class),
        charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset);
    CoreMap annotation2 = ChunkAnnotationUtils.getMergedChunk(sourceAnnotation.get(CoreAnnotations.NumerizedTokensAnnotation.class),
        chunkOffsets.getBegin(), chunkOffsets.getEnd(),
        CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS );
    annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd());
    tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class),
        annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
    // Carry the merged numerized tokens over to the char-offset-based chunk.
    annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, annotation2.get(CoreAnnotations.NumerizedTokensAnnotation.class));
  }
  // Record the chunk's numerized tokens as its children, and link it back to this expression.
  annotation.set(TimeExpression.ChildrenAnnotation.class,
      annotation.get(CoreAnnotations.NumerizedTokensAnnotation.class));
  annotation.set(Annotation.class, this);
  text = annotation.get(CoreAnnotations.TextAnnotation.class);
  temporal = temporalFunc.apply(annotation);
  return annotation;
}
chunkOffsets.getBegin(), chunkOffsets.getEnd()); if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) { ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation); chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset); CoreMap annotation2 = aggregator.merge((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), chunkOffsets.getBegin(), chunkOffsets.getEnd()); annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd()); tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class), annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
int tokenEnd = i; if (tokenBegin >= 0 && tokenEnd > tokenBegin) { CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey); chunk.set(labelKey, prevTagType.type); CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenBegin, tokens.size(), totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey); chunk.set(labelKey, prevTagType.type);
/**
 * Convenience overload: creates annotated chunks for the given sorted character
 * offsets using default options (true, null, null, true — see the full overload
 * for the meaning of each flag).
 *
 * @param annotation  annotation to extract chunks from
 * @param charOffsets sorted character-offset pairs identifying the chunks
 * @return list of chunk annotations
 */
public static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(
    CoreMap annotation, List<IntPair> charOffsets) {
  return getAnnotatedChunksUsingSortedCharOffsets(annotation, charOffsets, true, null, null, true);
}
/**
 * Given a list of character offsets for chunks, fixes the sentence splitting
 * so that sentence boundaries do not break any chunk.
 *
 * @param docAnnotation    document with sentences to fix
 * @param chunkCharOffsets ordered character-offset pairs of chunks that should
 *                         each appear within a single sentence
 * @return true if the fix was okay (chunks are in all sentences), false otherwise
 */
public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, List<IntPair> chunkCharOffsets) {
  // Delegate with all optional behaviors disabled.
  return fixChunkSentenceBoundaries(docAnnotation, chunkCharOffsets, false, false, false);
}
chunk.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin + annoTokenBegin); chunk.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd + annoTokenBegin); annotateChunkTokens(chunk, tokenChunkKey, tokenLabelKey); chunks.add(chunk); if (j >= annoTokens.size()) break;
if (origText != null) { ChunkAnnotationUtils.annotateChunkText(cm, annotation); text = cm.get(CoreAnnotations.TextAnnotation.class);
tokenEnd = t.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; } else { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); int tokenIndex = 0; if (tokens == null) { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
sentence = sentences.get(i); Integer firstNonWsCharOffset = getFirstNonWsCharOffset(sentence, false); if (firstNonWsCharOffset != null && firstNonWsCharOffset >= offsetEnd) { mergeChunks(sentences, text, startSentIndex, i+1); if (entityAtSentEnd) { CoreMap nextSentence = sentences.get(i+1); Character c = getFirstNonWsChar(nextSentence); if (c != null) { doMerge = !Character.isUpperCase(c); mergeChunks(sentences, text, i, i+2); CoreMap sentence = sentences.get(i); CoreMap nextSentence = sentences.get(i+1); String sentTrimmedText = getTrimmedText(sentence); String nextSentTrimmedText = getTrimmedText(nextSentence); if (sentTrimmedText.length() <= 1 || nextSentTrimmedText.length() <= 1) { Character c = getFirstNonWsChar(nextSentence); mergeChunks(sentences, text, i, i+2); } else { i++;
annotation = ChunkAnnotationUtils.getMergedChunk((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators ); if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) { ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation); chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset); CoreMap annotation2 = ChunkAnnotationUtils.getMergedChunk((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators ); annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd()); tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class), annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
chunkOffsets.getBegin(), chunkOffsets.getEnd()); if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) { ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation); chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset); CoreMap annotation2 = aggregator.merge((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey), chunkOffsets.getBegin(), chunkOffsets.getEnd()); annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd()); tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class), annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
/**
 * Creates a new chunk Annotation with basic chunk information (see the
 * three-argument overload), then annotates the chunk's tokens.
 *
 * @param annotation      annotation from which to extract the text for this chunk
 * @param tokenStartIndex index (relative to the annotation's tokens) at which this chunk starts
 * @param tokenEndIndex   index (relative to the annotation's tokens) at which this chunk ends (exclusive)
 * @param tokenChunkKey   if not null, each token is annotated with the chunk using this key
 * @param tokenLabelKey   if not null, each token is annotated with the chunk's text using this key
 * @return Annotation representing the new chunk
 */
public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex,
                                           Class tokenChunkKey, Class tokenLabelKey) {
  final Annotation chunk = getAnnotatedChunk(annotation, tokenStartIndex, tokenEndIndex);
  annotateChunkTokens(chunk, tokenChunkKey, tokenLabelKey);
  return chunk;
}
/** * Create a new chunk Annotation with basic chunk information * CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk * CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk * TokensAnnotation - List of tokens in this chunk * TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens) * tokenStartIndex + annotation's TokenBeginAnnotation * TokenEndAnnotation - Index of last token in chunk (index in original list of tokens) * tokenEndIndex + annotation's TokenBeginAnnotation * TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk * @param annotation - Annotation from which to extract the text for this chunk * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive) * @return Annotation representing new chunk */ public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex) { Integer annoTokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); if (annoTokenBegin == null) { annoTokenBegin = 0; } List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); Annotation chunk = getAnnotatedChunk(tokens, tokenStartIndex, tokenEndIndex, annoTokenBegin); boolean annotatedTextFromCharOffsets = annotateChunkText(chunk, annotation); if (!annotatedTextFromCharOffsets) { // Use tokens to get text annotation annotateChunkText(chunk, CoreAnnotations.TextAnnotation.class); } return chunk; }
CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null); chunk.set(CoreAnnotations.NamedEntityTagAnnotation.class,"ORGANIZATION");