/**
 * Maps a token to its document-wide token offset.
 *
 * @param token a token carrying SentenceIndexAnnotation and IndexAnnotation
 * @return the 0-based position of the token within the whole document
 */
public int tokenToLocation(CoreLabel token) {
  int sentenceIdx = token.get(CoreAnnotations.SentenceIndexAnnotation.class);
  CoreMap containingSentence =
      doc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceIdx);
  int sentenceStart = containingSentence.get(CoreAnnotations.TokenBeginAnnotation.class);
  // IndexAnnotation is 1-based within its sentence, hence the -1.
  return sentenceStart + token.get(CoreAnnotations.IndexAnnotation.class) - 1;
}
/**
 * Assigns a running, document-wide 0-based index to every token in the
 * document, stored under {@code TokenBeginAnnotation}.
 *
 * @param doc the document whose tokens are indexed; mutated in place
 */
private static void setTokenIndices(Document doc) {
  // Renamed from snake_case token_index to camelCase per Java convention.
  int tokenIndex = 0;
  for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel token : sent.get(TokensAnnotation.class)) {
      token.set(TokenBeginAnnotation.class, tokenIndex++);
    }
  }
}
/**
 * Splits a compound marked by the lexer.
 *
 * The compound's word is split around dashes (each dash becomes its own
 * token) and on whitespace; the resulting pieces are queued in
 * {@code compoundBuffer} with character offsets derived from the original
 * token's begin position. The first piece is returned immediately.
 *
 * @param cl the compound token to split
 * @return the first split token; remaining pieces stay in compoundBuffer
 */
private CoreLabel processCompound(CoreLabel cl) {
  cl.remove(ParentAnnotation.class);
  // Surround each dash with spaces, then split on whitespace.
  String spaced = pDash.matcher(cl.word()).replaceAll(" - ");
  String[] pieces = pSpace.split(spaced);
  int offset = 0;
  for (String piece : pieces) {
    CoreLabel splitToken = new CoreLabel(cl);
    splitToken.setWord(piece);
    splitToken.setValue(piece);
    int start = cl.beginPosition() + offset;
    splitToken.setBeginPosition(start);
    splitToken.setEndPosition(start + piece.length());
    splitToken.set(OriginalTextAnnotation.class, piece);
    compoundBuffer.add(splitToken);
    offset += piece.length();
  }
  return compoundBuffer.remove(0);
}
/** * set isNewline() */ private static void setNewlineStatus(List<CoreLabel> tokensList) { // label newlines for (CoreLabel token : tokensList) { if (token.word().equals(AbstractTokenizer.NEWLINE_TOKEN) && (token.endPosition() - token.beginPosition() == 1)) token.set(CoreAnnotations.IsNewlineAnnotation.class, true); else token.set(CoreAnnotations.IsNewlineAnnotation.class, false); } }
private static void runPipeline(StanfordCoreNLP pipeline, String text, PrintWriter out) { Annotation annotation = new Annotation(text); pipeline.annotate(annotation); // An Annotation is a Map and you can get and use the various analyses individually. out.println(); // The toString() method on an Annotation just prints the text of the Annotation // But you can see what is in it with other methods like toShorterString() out.println("The top level annotation"); out.println(annotation.toShorterString()); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { // Print out token annotations for (CoreLabel token:sentence.get(CoreAnnotations.TokensAnnotation.class)) { // Print out words, lemma, ne, and normalized ne String word = token.get(CoreAnnotations.TextAnnotation.class); String lemma = token.get(CoreAnnotations.LemmaAnnotation.class); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); String normalized = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class); out.println("token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne + ", normalized=" + normalized); } } out.flush(); }
// NOTE(review): fragment — this chunk starts and ends mid-method; `props`,
// `annotation`, `out`, `graph`, `edgeList`, `tokens` and `m` are declared
// outside the visible region and some braces are unbalanced here.
// Code left byte-identical; comments only.
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Annotate the contents of the file given as the first argument...
annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
} else {
// ...otherwise fall back to a small built-in English demo text.
annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply.");
pipeline.annotate(annotation);
pipeline.prettyPrint(annotation, out);
out.println(annotation.toShorterString());
out.println();
out.println(graph.toString(SemanticGraph.OutputFormat.LIST));
IndexedWord node = graph.getNodeByIndexSafe(5);
out.println("Printing dependencies around \"" + node.word() + "\" index " + node.index());
assert edgeList.size() == 1;
int head = edgeList.get(0).getGovernor().index();
// Mention token indices appear to be 1-based (start) / end-exclusive,
// while begin/endPosition are 0-based character offsets — TODO confirm
// against the Mention class.
out.println(" " + m + ", i.e., 0-based character offsets [" + tokens.get(m.startIndex - 1).beginPosition() + ", " + tokens.get(m.endIndex - 2).endPosition() + ')');
// NOTE(review): fragment — starts and ends mid-method; `props`, `document`,
// `out`, `sentence`, `sentNo` and `m` are declared outside the visible
// region. Code left byte-identical; comments only.
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Annotate the contents of the file given as the first argument...
document = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
} else {
// ...otherwise fall back to a built-in Chinese demo text.
document = new Annotation("克林顿说,华盛顿将逐步落实对韩国的经济援助。金大中对克林顿的讲话报以掌声:克林顿总统在会谈中重申,他坚定地支持韩国摆脱经济危机。");
pipeline.annotate(document);
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
// Print selected annotations for each token of the current sentence.
for (CoreMap token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
out.println(token.toShorterString("Text", "CharacterOffsetBegin", "CharacterOffsetEnd", "Index", "PartOfSpeech", "NamedEntityTag"));
out.println(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST));
sentNo++;
// m.sentNum / m.startIndex look 1-based and m.endIndex end-exclusive —
// TODO confirm against the Mention class.
List<CoreLabel> tokens = sentences.get(m.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
out.println(" " + m + ":[" + tokens.get(m.startIndex - 1).beginPosition() + ", " + tokens.get(m.endIndex - 2).endPosition() + ')');
// NOTE(review): fragment — mid-method code with unbalanced braces; `anno`,
// `replicateCoNLL`, `treeLemmatizer`, `preSpeaker`, `utterance`,
// `stanfordProcessor`, `allWords` and `allTrees` are declared outside the
// visible region. Code left byte-identical; comments only.
List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence:sentences) {
if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
// Discard any pre-existing parse so the pipeline parser runs fresh.
sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
} else {
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if (LEMMATIZE) { treeLemmatizer.transformTree(tree);
// Default missing speakers to "", then bump the utterance counter each
// time the speaker changes between consecutive tokens.
for (CoreLabel token:anno.get(CoreAnnotations.TokensAnnotation.class)) {
if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
token.set(CoreAnnotations.SpeakerAnnotation.class, "");
String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
if (!curSpeaker.equals(preSpeaker)) {
utterance++;
preSpeaker = curSpeaker;
token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
stanfordProcessor.annotate(anno);
// Collect per-sentence token lists and parse trees.
for (CoreMap sentence:anno.get(CoreAnnotations.SentencesAnnotation.class)) {
allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
// NOTE(review): fragment — appears to stitch together (at least) two
// methods: `sentences` is declared twice, and several if/for bodies close
// outside the visible region. `dataset`, `processor`, `logger`, `ent`,
// `forceGenerationOfIndexSpans` and `calculateHeadSpan` are external.
// Code left byte-identical; comments only.
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
// Only run the NLP processor when the first sentence lacks a parse tree.
if (sentences.size() > 0 && !sentences.get(0).containsKey(TreeCoreAnnotations.TreeAnnotation.class)) {
logger.info("Annotating dataset with " + processor);
processor.annotate(dataset);
} else {
logger.info("Found existing syntactic annotations. Will not use the NLP processor.");
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
logger.fine("Extracted " + sentences.size() + " sentences.");
for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
logger.fine("Processing sentence " + tokens);
// A full parse is mandatory for head assignment below.
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if(tree == null) throw new RuntimeException("ERROR: MR requires full syntactic analysis!");
CoreLabel l = (CoreLabel) tree.label();
// Generate token-index spans on the tree unless they already exist.
if(forceGenerationOfIndexSpans || (! l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && ! l.containsKey(CoreAnnotations.EndIndexAnnotation.class))){
tree.indexSpans(0);
logger.fine("Index spans were generated.");
} else {
logger.fine("Index spans were NOT generated.");
logger.fine("Parse tree using CoreLabel:\n" + tree.pennString());
logger.fine("Finding head for entity: " + ent);
int headPos = assignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
logger.fine("Syntactic head of mention \"" + ent + "\" is: " + tokens.get(headPos).word());
private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) { CoreMap lastSent = paragraph.get(paragraph.size()-1); String speaker = ""; for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { String word = w.get(CoreAnnotations.TextAnnotation.class); SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); IndexedWord t = dependency.getNodeByWordPattern(word); for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)){ if(child.first().getShortName().equals("nsubj")) { int subjectIndex = child.second().index(); // start from 1 IntTuple headPosition = new IntTuple(2); headPosition.set(0, paragraph.size()-1 + paragraphOffset); headPosition.set(1, subjectIndex-1); if(mentionheadPositions.containsKey(headPosition) && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } } } } } return speaker; }
// NOTE(review): fragment — the annotate(...) body is truncated and `bt`,
// `tree`, `t`, `str` and `model` come from code outside the visible region.
// Code left byte-identical; comments only.
@Override
public void annotate(Annotation annotation) {
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
// Sentiment works on the binarized tree produced by the parser.
Tree binarized = sentence.get(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
if (binarized == null) {
throw new AssertionError("Binarized sentences not built by parser");
IntPair p = bt.getSpan();
// Map the model's predicted class id to a sentiment label string.
int sen = RNNCoreAnnotations.getPredictedClass(bt);
String sentStr = SentimentUtils.sentimentString(model, sen);
// The code requires SpanAnnotation to be absent beforehand so it can be
// used (and removed again below) without clobbering caller data.
if (((CoreLabel) tree.label()).containsKey(CoreAnnotations.SpanAnnotation.class)) {
throw new IllegalStateException("This code assumes you don't have SpanAnnotation");
if (str != null) {
CoreLabel cl = (CoreLabel) t.label();
cl.set(SentimentCoreAnnotations.SentimentClass.class, str);
cl.remove(CoreAnnotations.SpanAnnotation.class);
// NOTE(review): fragment — a method signature with no body brace followed by
// statements from deeper inside; braces are unbalanced and `t`, `length`,
// `tokenStart`, `tokenEnd`, `nptSpan`, `npt`, `npt2`, `tlabel`, `plabel`,
// `m` and `parent` are external. Code left byte-identical; comments only.
public void process(CoNLLDocument doc)
mentionTokenLengthCounter.incrementCount(length);
IntPair span = t.getSpan();
if (span != null) {
// Count mentions whose tree span matches the token span exactly
// (span.getTarget() is inclusive, tokenEnd is exclusive, hence -1).
if (span.getSource() == tokenStart && span.getTarget() == tokenEnd - 1) {
mentionExactTreeSpan++;
} else {
// Otherwise check the nearest non-preterminal ancestor span.
if (nptSpan.getSource() == tokenStart && nptSpan.getTarget() == tokenEnd - 1) {
nonPretermSpanMatches++;
npt2 = npt;
CoreMap mention = ((CoreLabel) tlabel).get(CorefMentionAnnotation.class);
// Log coref mentions that are nested inside NER chunks.
if (((CoreLabel) tlabel).containsKey(NamedEntityAnnotation.class)) {
if (((CoreLabel) plabel).containsKey(NamedEntityAnnotation.class)) {
logger.info("NER Mention: " + m);
CoreMap parentNerChunk = ((CoreLabel) plabel).get(NamedEntityAnnotation.class);
logger.info("Nested inside NER Mention: " + parentNerChunk);
logger.info("Nested inside NER Mention parent node: " + parent);
protected static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); tree.indexLeaves(); SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); TregexPattern tgrepPattern = npOrPrpMentionPattern; TregexMatcher matcher = tgrepPattern.matcher(tree); while (matcher.find()) { Tree t = matcher.getMatch(); List<Tree> mLeaves = t.getLeaves(); int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) { int dummyMentionId = -1; Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t); mentions.add(m); mentionSpanSet.add(mSpan); } } } /** Extract enumerations (A, B, and C) */
// NOTE(review): fragment — mid-method chunk; `stanfordProcessor`, `anno`,
// `s`, `i`, `allTrees`, `allWords` and `allGoldMentions` are declared
// outside the visible region and some braces are unbalanced.
// Code left byte-identical; comments only.
stanfordProcessor.annotate(anno);
List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
// Re-index tokens and default the utterance id to 0 where missing.
for(CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)){
w.set(CoreAnnotations.IndexAnnotation.class, i++);
if(!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
// Collect the sentence's tree and tokens, then pull gold mentions.
allTrees.add(s.get(TreeCoreAnnotations.TreeAnnotation.class));
allWords.add(s.get(CoreAnnotations.TokensAnnotation.class));
EntityComparator comparator = new EntityComparator();
extractGoldMentions(s, allGoldMentions, comparator);
// NOTE(review): fragment — a truncated variant of extractNPorPRP: the
// `if (enhancedDependency == null)` body never closes, and `matcher` and
// `m` come from code outside the visible region.
// Code left byte-identical; comments only.
private static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
tree.indexLeaves();
// Prefers enhanced dependencies; presumably falls back to basic ones in the
// truncated null branch — TODO confirm against the full source.
SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
if (enhancedDependency == null) {
while (matcher.find()) {
Tree t = matcher.getMatch();
List<Tree> mLeaves = t.getLeaves();
// Leaf IndexAnnotation is 1-based; the span is 0-based, end-exclusive.
int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
IntPair mSpan = new IntPair(beginIdx, endIdx);
// Pronouns (PRP) are allowed even inside named-entity spans.
if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet) || t.value().startsWith("PRP")) ) {
// Treat the span as a named entity only if every token is tagged NNP*.
boolean isNE = true;
for(CoreLabel cl : m.originalSpan) {
if(!cl.tag().startsWith("NNP")) isNE = false;
/**
 * Walks every token of the document tracking utterance ids: a nonzero id
 * opens a quotation, a return to zero closes it, and each completed
 * quotation is handed to findQuotationSpeaker. A quotation still open at
 * the end of the document is closed at the last token.
 *
 * @param doc the document being processed
 * @param dict dictionaries passed through to findQuotationSpeaker
 */
private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
  List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  IntPair quoteStart = null;
  boolean inQuote = false;
  int utterNum = -1;
  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
    for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
      int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0 && !inQuote) {
        // A quotation opens here.
        utterNum = utterIndex;
        inQuote = true;
        quoteStart = new IntPair(sentIdx, tokIdx);
      } else if (utterIndex == 0 && inQuote) {
        // The quotation closes here; resolve its speaker.
        inQuote = false;
        findQuotationSpeaker(doc, utterNum, sentences, quoteStart, new IntPair(sentIdx, tokIdx), dict);
      }
    }
  }
  if (inQuote) {
    // Document ended while still inside a quotation: close at the last token.
    int lastSent = sentences.size() - 1;
    int lastTok = sentences.get(lastSent).get(CoreAnnotations.TokensAnnotation.class).size() - 1;
    findQuotationSpeaker(doc, utterNum, sentences, quoteStart, new IntPair(lastSent, lastTok), dict);
  }
}
private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List<CoreMap> sentences, int startIndex, int endIndex, Dictionaries dict) { List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); for(int i = startIndex ; i < endIndex ; i++) { CoreLabel cl = sent.get(i); if(cl.get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue; String lemma = cl.lemma(); String word = cl.word(); if(dict.reportVerb.contains(lemma) && cl.tag().startsWith("V")) { // find subject SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); if (dependency == null) { dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); } IndexedWord w = dependency.getNodeByWordPattern(word); if (w != null) { if(findSubject(doc, dependency, w, sentNum, utterNum)) return true; for(IndexedWord p : dependency.getPathToRoot(w)) { if(!p.tag().startsWith("V") && !p.tag().startsWith("MD")) break; if(findSubject(doc, dependency, p, sentNum, utterNum)) return true; // handling something like "was talking", "can tell" } } else { Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word); } } } return false; }
/**
 * Extracts enumeration mentions ("A, B, and C") from the sentence parse
 * tree: each Tregex match contributes its two named conjunct subtrees
 * (m1, m2) as candidate spans, which become Mentions unless already used
 * or inside a named-entity span.
 *
 * @param s the sentence (must carry tokens, a parse tree, and dependencies)
 * @param mentions output list; new mentions are appended
 * @param mentionSpanSet spans already used; updated with new spans
 * @param namedEntitySpanSet spans covered by named entities (read-only here)
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  TregexMatcher matcher = enumerationsMentionPattern.matcher(tree);
  Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
  while (matcher.find()) {
    matcher.getMatch();
    // Record a span for each of the two named conjunct nodes.
    for (String nodeName : new String[] {"m1", "m2"}) {
      Tree conjunct = matcher.getNode(nodeName);
      List<Tree> leaves = conjunct.getLeaves();
      // Leaf IndexAnnotation is 1-based; spans are 0-based, end-exclusive.
      int beginIdx = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      int endIdx = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), conjunct);
    }
  }
  for (Map.Entry<IntPair, Tree> entry : spanToMentionSubTree.entrySet()) {
    IntPair mSpan = entry.getKey();
    if (mentionSpanSet.contains(mSpan) || insideNE(mSpan, namedEntitySpanSet)) {
      continue;
    }
    int dummyMentionId = -1;
    Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency,
        new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), entry.getValue());
    mentions.add(m);
    mentionSpanSet.add(mSpan);
  }
}