Refine search
protected static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); tree.indexLeaves(); SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); TregexPattern tgrepPattern = npOrPrpMentionPattern; TregexMatcher matcher = tgrepPattern.matcher(tree); while (matcher.find()) { Tree t = matcher.getMatch(); List<Tree> mLeaves = t.getLeaves(); int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) { int dummyMentionId = -1; Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t); mentions.add(m); mentionSpanSet.add(mSpan); } } } /** Extract enumerations (A, B, and C) */
/**
 * Segments a raw line of text into a list of {@link CoreLabel} tokens using the
 * IOB labeling produced by {@code segmentStringToIOB}, one token per IOB span.
 * Each token carries its segmented text, an Arabic-segmentation marker, its
 * original surface text, and character offsets into {@code line}.
 *
 * @param line the raw input string to segment
 * @return the segmented tokens, in order
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
  List<CoreLabel> result = CollectionUtils.makeList();
  List<CoreLabel> labeled = segmentStringToIOB(line);
  for (IntPair span : IOBUtils.TokenSpansForIOB(labeled)) {
    // Render the IOB span back into a single segmented token string.
    String text = IOBUtils.IOBToString(labeled, prefixMarker, suffixMarker,
        span.getSource(), span.getTarget());
    // Character offsets of the span within the original line.
    int begin = labeled.get(span.getSource()).beginPosition();
    int end = labeled.get(span.getTarget() - 1).endPosition();

    CoreLabel token = new CoreLabel();
    token.setWord(text);
    token.setValue(text);
    token.set(CoreAnnotations.TextAnnotation.class, text);
    token.set(CoreAnnotations.ArabicSegAnnotation.class, "1");
    token.setOriginalText(line.substring(begin, end));
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    result.add(token);
  }
  return result;
}
/**
 * Extracts enumeration conjuncts ("A, B, and C") as mention candidates.
 * The Tregex pattern binds two conjunct nodes per match ("m1", "m2"); each is
 * recorded by its token span, then turned into a {@link Mention} unless the
 * span is already claimed or lies inside a named entity.
 *
 * @param s                  the sentence, carrying tokens, tree, and dependencies
 * @param mentions           output list of extracted mentions
 * @param mentionSpanSet     token spans already claimed by a mention (updated here)
 * @param namedEntitySpanSet token spans occupied by named entities (read-only here)
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree parse = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  SemanticGraph deps = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

  TregexMatcher matcher = enumerationsMentionPattern.matcher(parse);
  Map<IntPair, Tree> subTreeBySpan = Generics.newHashMap();
  while (matcher.find()) {
    matcher.getMatch();  // advance the match; only the named nodes are used
    for (Tree conjunct : new Tree[] { matcher.getNode("m1"), matcher.getNode("m2") }) {
      List<Tree> leaves = conjunct.getLeaves();
      // Leaf IndexAnnotation is 1-based; convert to a 0-based [start, end) token span.
      int start = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      int end = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
      subTreeBySpan.put(new IntPair(start, end), conjunct);
    }
  }
  for (Map.Entry<IntPair, Tree> entry : subTreeBySpan.entrySet()) {
    IntPair span = entry.getKey();
    if (mentionSpanSet.contains(span) || insideNE(span, namedEntitySpanSet)) {
      continue;  // already covered, or inside a named entity
    }
    int dummyMentionId = -1;
    Mention mention = new Mention(dummyMentionId, span.get(0), span.get(1), deps,
        new ArrayList<>(tokens.subList(span.get(0), span.get(1))), entry.getValue());
    mentions.add(mention);
    mentionSpanSet.add(span);
  }
}
/**
 * Tests whether {@code child} is reachable from {@code parent} in {@code g}
 * within a bounded number of levels.  {@code depths} packs the bound:
 * element 0 is the maximum depth ({@code Integer.MAX_VALUE} = unlimited) and
 * element 1 is the running depth counter, mutated in place as the search descends.
 *
 * NOTE(review): the counter in depths.get(1) is shared across sibling branches
 * and never decremented on backtrack, so it acts as a total descent budget
 * rather than a per-path depth — presumably intentional; confirm with callers.
 *
 * @param g      the dependency graph to search
 * @param parent the node to search from
 * @param child  the node being looked for
 * @param depths mutable (maxDepth, currentDepth) pair
 * @return true if child is found within the depth budget
 */
private boolean checkIfSatisfiedMaxDepth(SemanticGraph g, IndexedWord parent, IndexedWord child, IntPair depths) {
  // Unlimited depth: trivially satisfied.
  if (depths.get(0) == Integer.MAX_VALUE) return true;
  if (parent.equals(child)) return true;
  // Direct children are reachable within any positive budget.
  for (IndexedWord c : g.getChildren(parent)) {
    if (c.equals(child)) {
      return true;
    }
  }
  // Consume one level of the depth budget before descending.
  depths.set(1, depths.get(1) + 1);
  if (depths.get(1) >= depths.get(0)) return false;
  for (IndexedWord c : g.getChildren(parent)) {
    // Fixed idiom: was `foundInMaxDepth == true` with a dead initial assignment.
    if (checkIfSatisfiedMaxDepth(g, c, child, depths)) {
      return true;
    }
  }
  return false;
}
List<CoreLabel> sentenceAnno = sentence.get(CoreAnnotations.TokensAnnotation.class); Tree sentenceTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); Map<Pair<Integer,Integer>,String> sentenceInfo = Generics.newHashMap(); Set<Tree> sentenceSubTrees = sentenceTree.subTrees(); sentenceTree.setSpans(); Map<Pair<Integer,Integer>,Tree> treeSpanMap = Generics.newHashMap(); Map<Pair<Integer,Integer>,List<Tree>> wordSpanMap = Generics.newHashMap(); IntPair span = ctree.getSpan(); if (span != null) { treeSpanMap.put(Pair.makePair(span.getSource(), span.getTarget()), ctree); wordSpanMap.put(Pair.makePair(span.getSource(), span.getTarget()), ctree.getLeaves()); for (CoreLabel newAnno : sentenceAnno) { index += 1; String word = newAnno.word(); String tag = newAnno.tag(); String cat = newAnno.ner(); String coref = newAnno.get(CorefCoreAnnotations.CorefAnnotation.class); finalSentence[index] = new String[4];
Set<Class<?>> keySet = coreLabel.keySetNotNull(); keysToSerialize.remove(SectionEndAnnotation.class); if (coreLabel.word() != null) builder.setWord(coreLabel.word()); if (keySet.contains(PartOfSpeechAnnotation.class)) { builder.setPos(coreLabel.tag()); keysToSerialize.remove(PartOfSpeechAnnotation.class); } CoreMap sectionAnnotations = coreLabel.get(SectionStartAnnotation.class); if (sectionAnnotations.get(SectionAnnotation.class) != null) builder.setSectionName(sectionAnnotations.get(SectionAnnotation.class)); if (sectionAnnotations.get(AuthorAnnotation.class) != null) builder.setSectionAuthor(sectionAnnotations.get(AuthorAnnotation.class)); if (keySet.contains(SpanAnnotation.class)) { IntPair span = getAndRegister(coreLabel, keysToSerialize, SpanAnnotation.class); builder.setSpan(CoreNLPProtos.Span.newBuilder().setBegin(span.getSource()).setEnd(span.getTarget()).build()); if (keySet.contains(CoNLLUTokenSpanAnnotation.class)) { IntPair span = getAndRegister(coreLabel, keysToSerialize, CoNLLUTokenSpanAnnotation.class); builder.setConllUTokenSpan(CoreNLPProtos.Span.newBuilder().setBegin(span.getSource()).setEnd(span.getTarget()).build());
private static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); tree.indexLeaves(); SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); if (enhancedDependency == null) { while (matcher.find()) { Tree t = matcher.getMatch(); List<Tree> mLeaves = t.getLeaves(); int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet) || t.value().startsWith("PRP")) ) {
/**
 * Scans the document token-by-token for quotation regions, using each token's
 * UtteranceAnnotation (non-zero = inside an utterance), and invokes
 * {@code findQuotationSpeaker} for every region found.  A quotation still open
 * at the end of the document is closed at the last token.
 *
 * @param doc  the document whose sentences are scanned; quotation speakers are recorded into it
 * @param dict dictionaries passed through to the speaker finder
 */
private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
  List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  IntPair quoteStart = null;
  boolean inQuote = false;
  int utterNum = -1;

  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
    for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
      int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
      if (!inQuote && utterIndex != 0) {
        // Quotation opens at this token.
        utterNum = utterIndex;
        inQuote = true;
        quoteStart = new IntPair(sentIdx, tokIdx);
      } else if (inQuote && utterIndex == 0) {
        // Quotation closes just before this token.
        inQuote = false;
        IntPair quoteEnd = new IntPair(sentIdx, tokIdx);
        findQuotationSpeaker(doc, utterNum, sentences, quoteStart, quoteEnd, dict);
      }
    }
  }

  // A quotation left open runs to the document's final token.
  if (inQuote) {
    int lastSent = sentences.size() - 1;
    int lastTok = sentences.get(lastSent).get(CoreAnnotations.TokensAnnotation.class).size() - 1;
    findQuotationSpeaker(doc, utterNum, sentences, quoteStart, new IntPair(lastSent, lastTok), dict);
  }
}
Collection<CoreMap> mentions = doc.corefChainMap.get(id); for (CoreMap m:mentions) { CoreMap sent = sentences.get(m.get(CoreAnnotations.SentenceIndexAnnotation.class)); Tree root = sent.get(TreeCoreAnnotations.TreeAnnotation.class); Tree t = m.get(TreeCoreAnnotations.TreeAnnotation.class); Tree npt = t; Tree npt2 = t; if (npt.isPreTerminal()) { npt = npt.parent(root); mentionTokenLengthCounter.incrementCount(length); IntPair span = t.getSpan(); if (span != null) { if (span.getSource() == tokenStart && span.getTarget() == tokenEnd - 1) { mentionExactTreeSpan++; } else { if (nptSpan.getSource() == tokenStart && nptSpan.getTarget() == tokenEnd - 1) { nonPretermSpanMatches++; npt2 = npt; CoreMap mention = ((CoreLabel) tlabel).get(CorefMentionAnnotation.class); if (((CoreLabel) tlabel).containsKey(NamedEntityAnnotation.class)) { if (((CoreLabel) plabel).containsKey(NamedEntityAnnotation.class)) {
String text = docAnnotation.get(CoreAnnotations.TextAnnotation.class); List<CoreLabel> tokens = docAnnotation.get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> output = new ArrayList<>(tokens.size()); int i = 0; CoreLabel token = tokens.get(i); for (IntPair offsets:chunkCharOffsets) { assert(token.beginPosition() >= 0); assert(token.endPosition() >= 0); int offsetBegin = offsets.getSource(); int offsetEnd = offsets.getTarget(); while (offsetBegin < token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) || offsetBegin >= token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(token); output.add(token); docAnnotation.set(CoreAnnotations.TokensAnnotation.class, output); return true;
for (int i = 0; i < sentences.size(); i++) { CoreMap sentence = sentences.get(i); Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); tree.setSpans(); List<String[]> sentWords = document.sentenceWordLists.get(i); String label = nerSpan.third(); CoreMap nerChunk = ChunkAnnotationUtils.getAnnotatedChunk(sentence, startToken, endToken+1); nerChunk.set(CoreAnnotations.NamedEntityTagAnnotation.class, label); nerChunk.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class)); nerChunks.add(nerChunk); Tree t = getTreeNonTerminal(tree, startToken, endToken, true); if (t.getSpan().getSource() == startToken && t.getSpan().getTarget() == endToken) { nerChunk.set(TreeCoreAnnotations.TreeAnnotation.class, t); if (options.annotateTreeNer) { Label tlabel = t.label(); if (tlabel instanceof CoreLabel) { ((CoreLabel) tlabel).set(NamedEntityAnnotation.class, nerChunk); Label tlabel = t.label(); if (tlabel instanceof CoreLabel) { ((CoreLabel) tlabel).set(CorefMentionAnnotation.class, mention);
protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); if (enhancedDependency == null) { enhancedDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); int beginIndex = -1; for(CoreLabel w : sent) { String nerString = w.ner(); if(!nerString.equals(preNE)) { int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1; if(!preNE.matches("O")){ if(w.get(CoreAnnotations.TextAnnotation.class).equals("'s") && w.tag().equals("POS")) { endIndex++; IntPair mSpan = new IntPair(beginIndex, endIndex); IntPair mSpan = new IntPair(beginIndex, sent.size()); if(!mentionSpanSet.contains(mSpan)) { int dummyMentionId = -1;
private void extractMentionForHeadword(IndexedWord headword, SemanticGraph dep, CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); SemanticGraph basic = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); SemanticGraph enhanced = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); if (enhanced == null) { enhanced = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); if(headword.tag().startsWith("PRP")) { extractPronounForHeadword(headword, dep, s, mentions, mentionSpanSet, namedEntitySpanSet); return; int beginIdx = npSpan.get(0); int endIdx = npSpan.get(1)+1; if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , if ("IN".equals(sent.get(beginIdx).tag())) { beginIdx++; } // try to remove first IN. addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced); IndexedWord conjChild = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT); for(IndexedWord c : conjChildren) { if(c.index() < conjChild.index()) conjChild = c; for(int endIdxFirstElement = left.index()-1 ; endIdxFirstElement > beginIdx ; endIdxFirstElement--) { if(!sent.get(endIdxFirstElement-1).tag().matches("CC|,")) { if(headword.index()-1 < endIdxFirstElement) { addMention(beginIdx, endIdxFirstElement, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);
int beginIdx = headword.index()-1; int endIdx = headword.index(); if(sent.size() > headword.index() && sent.get(headword.index()).word().matches("all|both")) { IndexedWord c = dep.getNodeByIndex(headword.index()+1); SemanticGraphEdge edge = dep.getEdge(headword, c); IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet)) ) { int dummyMentionId = -1; m.headIndex = headword.index()-1; m.headWord = sent.get(m.headIndex); m.headString = m.headWord.word().toLowerCase(Locale.ENGLISH); mentions.add(m); mentionSpanSet.add(mSpan); if(conjChildren.size() > 0) { IntPair npSpan = getNPSpan(headword, dep, sent); beginIdx = npSpan.get(0); endIdx = npSpan.get(1)+1; if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);
/** * Create a ParseTree proto from a Tree. If the Tree is a scored tree, the scores will * be preserved. * @param parseTree The parse tree to convert. * @return A protocol buffer message corresponding to this tree. */ public CoreNLPProtos.ParseTree toProto(Tree parseTree) { CoreNLPProtos.ParseTree.Builder builder = CoreNLPProtos.ParseTree.newBuilder(); // Required fields for (Tree child : parseTree.children()) { builder.addChild(toProto(child)); } // Optional fields IntPair span = parseTree.getSpan(); if (span != null) { builder.setYieldBeginIndex(span.getSource()); builder.setYieldEndIndex(span.getTarget()); } if (parseTree.label() != null) { builder.setValue(parseTree.label().value()); } if (!Double.isNaN(parseTree.score())) { builder.setScore(parseTree.score()); } Integer sentiment; if (parseTree.label() instanceof CoreMap && (sentiment = ((CoreMap) parseTree.label()).get(RNNCoreAnnotations.PredictedClass.class)) != null) { builder.setSentiment(CoreNLPProtos.Sentiment.forNumber(sentiment)); } // Return return builder.build(); }
List<Mention> mentions = predictedMentions.get(i); CoreMap sent = sentences.get(i); List<CoreLabel> tokens = sent.get(TokensAnnotation.class); Set<IntPair> mentionSpanSet = mentionSpanSetList.get(i); sb.append(tokens.get(k+j).word()).append(" "); if( endIndex < tokens.size() && tokens.get(endIndex).word().equals("'s") && tokens.get(endIndex).tag().equals("POS")) { Tree tree = sent.get(TreeAnnotation.class); Tree sToken = tree.getLeaves().get(beginIndex); Tree eToken = tree.getLeaves().get(endIndex); Tree join = tree.joinNode(sToken, eToken); Tree sJoin = join.getLeaves().get(0); Tree eJoin = join.getLeaves().get(join.getLeaves().size()-1); if( beginIndex > 0 && tokens.get(beginIndex-1).tag().equals("DT")) { Tree tree = sent.get(TreeAnnotation.class); Tree sToken = tree.getLeaves().get(beginIndex-1); Tree eToken = tree.getLeaves().get(endIndex-1); IntPair span = new IntPair(beginIndex, endIndex); if(phrase.equalsIgnoreCase(ne) && !mentionSpanSet.contains(span)) { int dummyMentionId = -1;
CoreMap annotation, List<IntPair> charOffsets, boolean charOffsetIsRelative, Class tokenChunkKey, Class tokenLabelKey, boolean allowPartialTokens) { String annoText = annotation.get(CoreAnnotations.TextAnnotation.class); List<CoreMap> chunks = new ArrayList<>(charOffsets.size()); List<CoreLabel> annoTokens = annotation.get(CoreAnnotations.TokensAnnotation.class); Integer annoCharBegin = annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); if (annoCharBegin == null) { annoCharBegin = 0; } Integer annoTokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class); int i = 0; for (IntPair p:charOffsets) { int beginRelCharOffset = charOffsetIsRelative? p.getSource():p.getSource()-annoCharBegin; int endRelCharOffset = charOffsetIsRelative? p.getTarget():p.getTarget()-annoCharBegin; int beginCharOffset = beginRelCharOffset + annoCharBegin; int endCharOffset = endRelCharOffset + annoCharBegin; if (endRelCharOffset > annoText.length()) { endRelCharOffset = annoText.length(); } if (allowPartialTokens) { while (i < annoTokens.size() && annoTokens.get(i).endPosition() <= beginCharOffset) { i++; while (i < annoTokens.size() && annoTokens.get(i).beginPosition() < beginCharOffset) { i++; int j = i; if (allowPartialTokens) { while (j < annoTokens.size() && annoTokens.get(j).beginPosition() < endCharOffset) { j++;
if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) { IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class); if (tokenSpan.getSource() == token.index()) { String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget()); sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
/**
 * Collects the surface strings of all named-entity spans in the sentence into
 * {@code neStrings}.  Tokens in each span are joined with single spaces, and a
 * trailing possessive marker (" 's") is stripped so possessive and plain forms
 * collapse to the same string.
 *
 * @param s                  the sentence whose tokens are read
 * @param neStrings          output set of named-entity surface strings (updated here)
 * @param namedEntitySpanSet 0-based [start, end) token spans of named entities
 */
protected static void addNamedEntityStrings(CoreMap s, Set<String> neStrings, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(TokensAnnotation.class);
  for (IntPair span : namedEntitySpanSet) {
    StringBuilder buf = new StringBuilder();
    for (int i = span.get(0); i < span.get(1); i++) {
      buf.append(tokens.get(i).word()).append(' ');
    }
    String ne = buf.toString().trim();
    // Strip a trailing possessive so "IBM 's" and "IBM" map to one entry.
    if (ne.endsWith(" 's")) {
      ne = ne.substring(0, ne.length() - 3);
    }
    neStrings.add(ne);
  }
}