@Override protected void doExecute(ExecutionContext ctx) { StringFieldValue input = (StringFieldValue)ctx.getValue(); SpanList spanList = input.setSpanTree(new SpanTree(SpanTrees.LINGUISTICS)).spanList(); int lastPosition = 0; for (Iterator<GramSplitter.Gram> it = linguistics.getGramSplitter().split(input.getString(), gramSize); it.hasNext();) { GramSplitter.Gram gram = it.next(); // if there is a gap before this gram, then annotate the gram as punctuation // (technically it may be of various types, but it does not matter - we just // need to annotate it somehow (as a non-term) to make sure it is added to the summary) if (lastPosition < gram.getStart()) { typedSpan(lastPosition, gram.getStart() - lastPosition, TokenType.PUNCTUATION, spanList); } // annotate gram as a word term String gramString = gram.extractFrom(input.getString()); typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList). annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); lastPosition = gram.getStart() + gram.getLength(); } // handle punctuation at the end if (lastPosition < input.toString().length()) { typedSpan(lastPosition, input.toString().length() - lastPosition, TokenType.PUNCTUATION, spanList); } }
/** * Annotates the given string with the appropriate linguistics annotations. * * @param text the text to annotate * @return whether or not anything was annotated */ public boolean annotate(StringFieldValue text) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. Tokenizer tokenizer = factory.getTokenizer(); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) { addAnnotationSpan(text.getString(), tree.spanList(), tokenizer, token, config.getStemMode(), termOccurrences); } if (tree.numAnnotations() == 0) return false; text.setSpanTree(tree); return true; }