/**
 * Maps a token to its document-wide token offset.
 *
 * @param token a token carrying SentenceIndexAnnotation and IndexAnnotation
 * @return the 0-based position of the token within the whole document
 */
public int tokenToLocation(CoreLabel token) {
  int sentenceIdx = token.get(CoreAnnotations.SentenceIndexAnnotation.class);
  CoreMap containingSentence =
      doc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceIdx);
  int sentenceStart = containingSentence.get(CoreAnnotations.TokenBeginAnnotation.class);
  // IndexAnnotation is 1-based within its sentence, hence the -1.
  return sentenceStart + token.get(CoreAnnotations.IndexAnnotation.class) - 1;
}
/**
 * Assigns a running, document-wide 0-based index to every token in the
 * document, stored under {@code TokenBeginAnnotation}.
 *
 * @param doc the document whose tokens are indexed; mutated in place
 */
private static void setTokenIndices(Document doc) {
  // Renamed from snake_case token_index to camelCase per Java convention.
  int tokenIndex = 0;
  for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel token : sent.get(TokensAnnotation.class)) {
      token.set(TokenBeginAnnotation.class, tokenIndex++);
    }
  }
}
/**
 * Splits a compound marked by the lexer.
 *
 * The compound's word is split around dashes (each dash becomes its own
 * token) and on whitespace; the resulting pieces are queued in
 * {@code compoundBuffer} with character offsets derived from the original
 * token's begin position. The first piece is returned immediately.
 *
 * @param cl the compound token to split
 * @return the first split token; remaining pieces stay in compoundBuffer
 */
private CoreLabel processCompound(CoreLabel cl) {
  cl.remove(ParentAnnotation.class);
  // Surround each dash with spaces, then split on whitespace.
  String spaced = pDash.matcher(cl.word()).replaceAll(" - ");
  String[] pieces = pSpace.split(spaced);
  int offset = 0;
  for (String piece : pieces) {
    CoreLabel splitToken = new CoreLabel(cl);
    splitToken.setWord(piece);
    splitToken.setValue(piece);
    int start = cl.beginPosition() + offset;
    splitToken.setBeginPosition(start);
    splitToken.setEndPosition(start + piece.length());
    splitToken.set(OriginalTextAnnotation.class, piece);
    compoundBuffer.add(splitToken);
    offset += piece.length();
  }
  return compoundBuffer.remove(0);
}
/** * set isNewline() */ private static void setNewlineStatus(List<CoreLabel> tokensList) { // label newlines for (CoreLabel token : tokensList) { if (token.word().equals(AbstractTokenizer.NEWLINE_TOKEN) && (token.endPosition() - token.beginPosition() == 1)) token.set(CoreAnnotations.IsNewlineAnnotation.class, true); else token.set(CoreAnnotations.IsNewlineAnnotation.class, false); } }
private static void runPipeline(StanfordCoreNLP pipeline, String text, PrintWriter out) { Annotation annotation = new Annotation(text); pipeline.annotate(annotation); // An Annotation is a Map and you can get and use the various analyses individually. out.println(); // The toString() method on an Annotation just prints the text of the Annotation // But you can see what is in it with other methods like toShorterString() out.println("The top level annotation"); out.println(annotation.toShorterString()); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { // Print out token annotations for (CoreLabel token:sentence.get(CoreAnnotations.TokensAnnotation.class)) { // Print out words, lemma, ne, and normalized ne String word = token.get(CoreAnnotations.TextAnnotation.class); String lemma = token.get(CoreAnnotations.LemmaAnnotation.class); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); String normalized = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class); out.println("token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne + ", normalized=" + normalized); } } out.flush(); }
// NOTE(review): fragment — this chunk starts and ends mid-method; `props`,
// `annotation`, `out`, `graph`, `edgeList`, `tokens` and `m` are declared
// outside the visible region and some braces are unbalanced here.
// Code left byte-identical; comments only.
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Annotate the contents of the file given as the first argument...
annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
} else {
// ...otherwise fall back to a small built-in English demo text.
annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply.");
pipeline.annotate(annotation);
pipeline.prettyPrint(annotation, out);
out.println(annotation.toShorterString());
out.println();
out.println(graph.toString(SemanticGraph.OutputFormat.LIST));
IndexedWord node = graph.getNodeByIndexSafe(5);
out.println("Printing dependencies around \"" + node.word() + "\" index " + node.index());
assert edgeList.size() == 1;
int head = edgeList.get(0).getGovernor().index();
// Mention token indices appear to be 1-based (start) / end-exclusive,
// while begin/endPosition are 0-based character offsets — TODO confirm
// against the Mention class.
out.println(" " + m + ", i.e., 0-based character offsets [" + tokens.get(m.startIndex - 1).beginPosition() + ", " + tokens.get(m.endIndex - 2).endPosition() + ')');
// NOTE(review): fragment — starts and ends mid-method; `props`, `document`,
// `out`, `sentence`, `sentNo` and `m` are declared outside the visible
// region. Code left byte-identical; comments only.
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Annotate the contents of the file given as the first argument...
document = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
} else {
// ...otherwise fall back to a built-in Chinese demo text.
document = new Annotation("克林顿说,华盛顿将逐步落实对韩国的经济援助。金大中对克林顿的讲话报以掌声:克林顿总统在会谈中重申,他坚定地支持韩国摆脱经济危机。");
pipeline.annotate(document);
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
// Print selected annotations for each token of the current sentence.
for (CoreMap token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
out.println(token.toShorterString("Text", "CharacterOffsetBegin", "CharacterOffsetEnd", "Index", "PartOfSpeech", "NamedEntityTag"));
out.println(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST));
sentNo++;
// m.sentNum / m.startIndex look 1-based and m.endIndex end-exclusive —
// TODO confirm against the Mention class.
List<CoreLabel> tokens = sentences.get(m.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
out.println(" " + m + ":[" + tokens.get(m.startIndex - 1).beginPosition() + ", " + tokens.get(m.endIndex - 2).endPosition() + ')');
// NOTE(review): fragment — mid-method code with unbalanced braces; `anno`,
// `replicateCoNLL`, `treeLemmatizer`, `preSpeaker`, `utterance`,
// `stanfordProcessor`, `allWords` and `allTrees` are declared outside the
// visible region. Code left byte-identical; comments only.
List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence:sentences) {
if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
// Discard any pre-existing parse so the pipeline parser runs fresh.
sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
} else {
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if (LEMMATIZE) { treeLemmatizer.transformTree(tree);
// Default missing speakers to "", then bump the utterance counter each
// time the speaker changes between consecutive tokens.
for (CoreLabel token:anno.get(CoreAnnotations.TokensAnnotation.class)) {
if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
token.set(CoreAnnotations.SpeakerAnnotation.class, "");
String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
if (!curSpeaker.equals(preSpeaker)) {
utterance++;
preSpeaker = curSpeaker;
token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
stanfordProcessor.annotate(anno);
// Collect per-sentence token lists and parse trees.
for (CoreMap sentence:anno.get(CoreAnnotations.SentencesAnnotation.class)) {
allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
// NOTE(review): fragment — appears to stitch together (at least) two
// methods: `sentences` is declared twice, and several if/for bodies close
// outside the visible region. `dataset`, `processor`, `logger`, `ent`,
// `forceGenerationOfIndexSpans` and `calculateHeadSpan` are external.
// Code left byte-identical; comments only.
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
// Only run the NLP processor when the first sentence lacks a parse tree.
if (sentences.size() > 0 && !sentences.get(0).containsKey(TreeCoreAnnotations.TreeAnnotation.class)) {
logger.info("Annotating dataset with " + processor);
processor.annotate(dataset);
} else {
logger.info("Found existing syntactic annotations. Will not use the NLP processor.");
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
logger.fine("Extracted " + sentences.size() + " sentences.");
for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
logger.fine("Processing sentence " + tokens);
// A full parse is mandatory for head assignment below.
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if(tree == null) throw new RuntimeException("ERROR: MR requires full syntactic analysis!");
CoreLabel l = (CoreLabel) tree.label();
// Generate token-index spans on the tree unless they already exist.
if(forceGenerationOfIndexSpans || (! l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && ! l.containsKey(CoreAnnotations.EndIndexAnnotation.class))){
tree.indexSpans(0);
logger.fine("Index spans were generated.");
} else {
logger.fine("Index spans were NOT generated.");
logger.fine("Parse tree using CoreLabel:\n" + tree.pennString());
logger.fine("Finding head for entity: " + ent);
int headPos = assignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
logger.fine("Syntactic head of mention \"" + ent + "\" is: " + tokens.get(headPos).word());
private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) { CoreMap lastSent = paragraph.get(paragraph.size()-1); String speaker = ""; for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { String word = w.get(CoreAnnotations.TextAnnotation.class); SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); IndexedWord t = dependency.getNodeByWordPattern(word); for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)){ if(child.first().getShortName().equals("nsubj")) { int subjectIndex = child.second().index(); // start from 1 IntTuple headPosition = new IntTuple(2); headPosition.set(0, paragraph.size()-1 + paragraphOffset); headPosition.set(1, subjectIndex-1); if(mentionheadPositions.containsKey(headPosition) && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } } } } } return speaker; }
// NOTE(review): fragment — the annotate(...) body is truncated and `bt`,
// `tree`, `t`, `str` and `model` come from code outside the visible region.
// Code left byte-identical; comments only.
@Override
public void annotate(Annotation annotation) {
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
// Sentiment works on the binarized tree produced by the parser.
Tree binarized = sentence.get(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
if (binarized == null) {
throw new AssertionError("Binarized sentences not built by parser");
IntPair p = bt.getSpan();
// Map the model's predicted class id to a sentiment label string.
int sen = RNNCoreAnnotations.getPredictedClass(bt);
String sentStr = SentimentUtils.sentimentString(model, sen);
// The code requires SpanAnnotation to be absent beforehand so it can be
// used (and removed again below) without clobbering caller data.
if (((CoreLabel) tree.label()).containsKey(CoreAnnotations.SpanAnnotation.class)) {
throw new IllegalStateException("This code assumes you don't have SpanAnnotation");
if (str != null) {
CoreLabel cl = (CoreLabel) t.label();
cl.set(SentimentCoreAnnotations.SentimentClass.class, str);
cl.remove(CoreAnnotations.SpanAnnotation.class);
// NOTE(review): fragment — a method signature with no body brace followed by
// statements from deeper inside; braces are unbalanced and `t`, `length`,
// `tokenStart`, `tokenEnd`, `nptSpan`, `npt`, `npt2`, `tlabel`, `plabel`,
// `m` and `parent` are external. Code left byte-identical; comments only.
public void process(CoNLLDocument doc)
mentionTokenLengthCounter.incrementCount(length);
IntPair span = t.getSpan();
if (span != null) {
// Count mentions whose tree span matches the token span exactly
// (span.getTarget() is inclusive, tokenEnd is exclusive, hence -1).
if (span.getSource() == tokenStart && span.getTarget() == tokenEnd - 1) {
mentionExactTreeSpan++;
} else {
// Otherwise check the nearest non-preterminal ancestor span.
if (nptSpan.getSource() == tokenStart && nptSpan.getTarget() == tokenEnd - 1) {
nonPretermSpanMatches++;
npt2 = npt;
CoreMap mention = ((CoreLabel) tlabel).get(CorefMentionAnnotation.class);
// Log coref mentions that are nested inside NER chunks.
if (((CoreLabel) tlabel).containsKey(NamedEntityAnnotation.class)) {
if (((CoreLabel) plabel).containsKey(NamedEntityAnnotation.class)) {
logger.info("NER Mention: " + m);
CoreMap parentNerChunk = ((CoreLabel) plabel).get(NamedEntityAnnotation.class);
logger.info("Nested inside NER Mention: " + parentNerChunk);
logger.info("Nested inside NER Mention parent node: " + parent);
protected static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) { List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class); Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); tree.indexLeaves(); SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); TregexPattern tgrepPattern = npOrPrpMentionPattern; TregexMatcher matcher = tgrepPattern.matcher(tree); while (matcher.find()) { Tree t = matcher.getMatch(); List<Tree> mLeaves = t.getLeaves(); int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) { int dummyMentionId = -1; Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t); mentions.add(m); mentionSpanSet.add(mSpan); } } } /** Extract enumerations (A, B, and C) */
// NOTE(review): fragment — mid-method chunk; `stanfordProcessor`, `anno`,
// `s`, `i`, `allTrees`, `allWords` and `allGoldMentions` are declared
// outside the visible region and some braces are unbalanced.
// Code left byte-identical; comments only.
stanfordProcessor.annotate(anno);
List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
// Re-index tokens and default the utterance id to 0 where missing.
for(CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)){
w.set(CoreAnnotations.IndexAnnotation.class, i++);
if(!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
// Collect the sentence's tree and tokens, then pull gold mentions.
allTrees.add(s.get(TreeCoreAnnotations.TreeAnnotation.class));
allWords.add(s.get(CoreAnnotations.TokensAnnotation.class));
EntityComparator comparator = new EntityComparator();
extractGoldMentions(s, allGoldMentions, comparator);
// NOTE(review): fragment — a truncated variant of extractNPorPRP: the
// `if (enhancedDependency == null)` body never closes, and `matcher` and
// `m` come from code outside the visible region.
// Code left byte-identical; comments only.
private static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
tree.indexLeaves();
// Prefers enhanced dependencies; presumably falls back to basic ones in the
// truncated null branch — TODO confirm against the full source.
SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
if (enhancedDependency == null) {
while (matcher.find()) {
Tree t = matcher.getMatch();
List<Tree> mLeaves = t.getLeaves();
// Leaf IndexAnnotation is 1-based; the span is 0-based, end-exclusive.
int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
IntPair mSpan = new IntPair(beginIdx, endIdx);
// Pronouns (PRP) are allowed even inside named-entity spans.
if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet) || t.value().startsWith("PRP")) ) {
// Treat the span as a named entity only if every token is tagged NNP*.
boolean isNE = true;
for(CoreLabel cl : m.originalSpan) {
if(!cl.tag().startsWith("NNP")) isNE = false;
/**
 * Walks every token of the document tracking utterance ids: a nonzero id
 * opens a quotation, a return to zero closes it, and each completed
 * quotation is handed to findQuotationSpeaker. A quotation still open at
 * the end of the document is closed at the last token.
 *
 * @param doc the document being processed
 * @param dict dictionaries passed through to findQuotationSpeaker
 */
private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
  List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  IntPair quoteStart = null;
  boolean inQuote = false;
  int utterNum = -1;
  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
    for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
      int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0 && !inQuote) {
        // A quotation opens here.
        utterNum = utterIndex;
        inQuote = true;
        quoteStart = new IntPair(sentIdx, tokIdx);
      } else if (utterIndex == 0 && inQuote) {
        // The quotation closes here; resolve its speaker.
        inQuote = false;
        findQuotationSpeaker(doc, utterNum, sentences, quoteStart, new IntPair(sentIdx, tokIdx), dict);
      }
    }
  }
  if (inQuote) {
    // Document ended while still inside a quotation: close at the last token.
    int lastSent = sentences.size() - 1;
    int lastTok = sentences.get(lastSent).get(CoreAnnotations.TokensAnnotation.class).size() - 1;
    findQuotationSpeaker(doc, utterNum, sentences, quoteStart, new IntPair(lastSent, lastTok), dict);
  }
}
private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List<CoreMap> sentences, int startIndex, int endIndex, Dictionaries dict) { List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); for(int i = startIndex ; i < endIndex ; i++) { CoreLabel cl = sent.get(i); if(cl.get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue; String lemma = cl.lemma(); String word = cl.word(); if(dict.reportVerb.contains(lemma) && cl.tag().startsWith("V")) { // find subject SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); if (dependency == null) { dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); } IndexedWord w = dependency.getNodeByWordPattern(word); if (w != null) { if(findSubject(doc, dependency, w, sentNum, utterNum)) return true; for(IndexedWord p : dependency.getPathToRoot(w)) { if(!p.tag().startsWith("V") && !p.tag().startsWith("MD")) break; if(findSubject(doc, dependency, p, sentNum, utterNum)) return true; // handling something like "was talking", "can tell" } } else { Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word); } } } return false; }
/**
 * Extracts enumeration mentions ("A, B, and C") from the sentence parse
 * tree: each Tregex match contributes its two named conjunct subtrees
 * (m1, m2) as candidate spans, which become Mentions unless already used
 * or inside a named-entity span.
 *
 * @param s the sentence (must carry tokens, a parse tree, and dependencies)
 * @param mentions output list; new mentions are appended
 * @param mentionSpanSet spans already used; updated with new spans
 * @param namedEntitySpanSet spans covered by named entities (read-only here)
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  TregexMatcher matcher = enumerationsMentionPattern.matcher(tree);
  Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
  while (matcher.find()) {
    matcher.getMatch();
    // Record a span for each of the two named conjunct nodes.
    for (String nodeName : new String[] {"m1", "m2"}) {
      Tree conjunct = matcher.getNode(nodeName);
      List<Tree> leaves = conjunct.getLeaves();
      // Leaf IndexAnnotation is 1-based; spans are 0-based, end-exclusive.
      int beginIdx = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      int endIdx = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), conjunct);
    }
  }
  for (Map.Entry<IntPair, Tree> entry : spanToMentionSubTree.entrySet()) {
    IntPair mSpan = entry.getKey();
    if (mentionSpanSet.contains(mSpan) || insideNE(mSpan, namedEntitySpanSet)) {
      continue;
    }
    int dummyMentionId = -1;
    Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency,
        new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), entry.getValue());
    mentions.add(m);
    mentionSpanSet.add(mSpan);
  }
}