@Override public FieldValue getFieldValue(StructuredFieldValue doc) { StringFieldValue sfv = (StringFieldValue) super.getFieldValue(doc); Map<String, SpanTree> trees = extractSpanTrees.get(doc); if (trees != null) { for (SpanTree tree : trees.values()) { sfv.setSpanTree(tree); } } return sfv; }
@Override protected void doExecute(ExecutionContext ctx) { StringFieldValue input = (StringFieldValue)ctx.getValue(); if (input.getString().isEmpty()) { return; } StringFieldValue output = input.clone(); ctx.setValue(output); String prev = output.getString(); String next = toLowerCase(prev); SpanList root = new SpanList(); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS, root); SpanNode node = new Span(0, prev.length()); tree.annotate(node, new Annotation(AnnotationTypes.TERM, next.equals(prev) ? null : new StringFieldValue(next))); tree.annotate(node, new Annotation(AnnotationTypes.TOKEN_TYPE, new IntegerFieldValue(TokenType.ALPHABETIC.getValue()))); root.add(node); output.setSpanTree(tree); }
@Override protected void doExecute(ExecutionContext ctx) { StringFieldValue input = (StringFieldValue)ctx.getValue(); SpanList spanList = input.setSpanTree(new SpanTree(SpanTrees.LINGUISTICS)).spanList(); int lastPosition = 0; for (Iterator<GramSplitter.Gram> it = linguistics.getGramSplitter().split(input.getString(), gramSize); it.hasNext();) { GramSplitter.Gram gram = it.next(); // if there is a gap before this gram, then annotate the gram as punctuation // (technically it may be of various types, but it does not matter - we just // need to annotate it somehow (as a non-term) to make sure it is added to the summary) if (lastPosition < gram.getStart()) { typedSpan(lastPosition, gram.getStart() - lastPosition, TokenType.PUNCTUATION, spanList); } // annotate gram as a word term String gramString = gram.extractFrom(input.getString()); typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList). annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); lastPosition = gram.getStart() + gram.getLength(); } // handle punctuation at the end if (lastPosition < input.toString().length()) { typedSpan(lastPosition, input.toString().length() - lastPosition, TokenType.PUNCTUATION, spanList); } }
/** * Annotates the given string with the appropriate linguistics annotations. * * @param text the text to annotate * @return whether or not anything was annotated */ public boolean annotate(StringFieldValue text) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. Tokenizer tokenizer = factory.getTokenizer(); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) { addAnnotationSpan(text.getString(), tree.spanList(), tokenizer, token, config.getStemMode(), termOccurrences); } if (tree.numAnnotations() == 0) return false; text.setSpanTree(tree); return true; }
treeName.deserialize(this); tree.setName(treeName.getString()); value.setSpanTree(tree); readSpanTree(tree, false);
treeName.deserialize(this); tree.setName(treeName.getString()); value.setSpanTree(tree); readSpanTree(tree, false);