edu.stanford.nlp.pipeline.QuoteAnnotator java code examples

/**
 * Reads the serialized annotation stored at {@code baseFileName + ".ser.gz"} and re-runs
 * quote annotation on it before returning it.
 *
 * @param text         passed through to {@code processCoreNLPIfDoesNotExist}
 * @param baseFileName path prefix of the serialized file; {@code ".ser.gz"} is appended
 * @param props        passed through to {@code processCoreNLPIfDoesNotExist}
 * @return the deserialized document after a fresh {@code QuoteAnnotator} pass
 * @throws IOException declared by this method; presumably raised by the file helpers it calls
 */
public static Annotation getAnnotatedFile(String text, String baseFileName, Properties props) throws IOException {
 final File serializedFile = new File(baseFileName + ".ser.gz");
 // NOTE(review): judging by its name, this creates the file only when it is missing -- confirm.
 processCoreNLPIfDoesNotExist(serializedFile, props, text);
 final Annotation annotated = ExtractQuotesUtil.readSerializedProtobufFile(serializedFile);
 // Important! Re-annotate to take into account that certain tokens are removed
 // in the serialization process.
 final QuoteAnnotator quoteAnnotator = new QuoteAnnotator(new Properties());
 quoteAnnotator.annotate(annotated);
 return annotated;
}

/**
 * Returns {@code text} after passing it through {@code asciiQuotes}.
 *
 * @param text the text to convert
 * @return the value produced by {@code asciiQuotes(text)}
 */
public static String replaceUnicode(String text) {
 final String converted = asciiQuotes(text);
 return converted;
}

text = xmlFreeText(text, annotation);
 Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> overall = getQuotes(quotesFrom);
 String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesUnicode =
   getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedUnicode = null;
 if (EXTRACT_UNCLOSED) {
  cmUnclosedUnicode = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true);
 int numUnicode = countQuotes(cmQuotesUnicode);
  quotesFrom = replaceUnicode(text);
 overall = getQuotes(quotesFrom);
 docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesAscii = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedAscii = null;
 if (EXTRACT_UNCLOSED) {
  cmUnclosedAscii = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true);
 int numAsciiSingle = countQuotes(cmQuotesAscii);
 overall = getQuotes(quotesFrom);
 docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesAsciiNoSingle =
   getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedAsciiNoSingle = null;
 if (EXTRACT_UNCLOSED) {

   DIRECTED_QUOTES.get(quote).equals(c)) {
  if (c.equals("’")) {
   if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) {
 if ((start < 0) && !matchesPrevQuote(c, prevQuote) &&
   (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) ||
    (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) {
  start = i;
    (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) ||
    (c.equals("\"") && isDoubleQuoteEnd(text, i)))) ||
    (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) ||  // latex quotes are kind of problematic
    (DIRECTED_QUOTES.containsKey(quote) &&
      DIRECTED_QUOTES.get(quote).equals(c) &&
if (!isAQuoteMapStarter(start, quotesMap) && start >= 0 && start < text.length() - 3) {
 if (EXTRACT_UNCLOSED) {
  unclosedQuotes.add(new Pair<>(start, text.length()));
 Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> embedded = recursiveQuotes(toPass, offset, null);
     recursiveQuotes(toPass, q.first() + qKind.length() + offset, qKindToPass);

 Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens,
   tokenOffset, beginSentence, endSentence, docID);
Comparator<CoreMap> quoteComparator = getQuoteComparator();
Collections.sort(cmQuotes, quoteComparator);
setQuoteIndices(cmQuotes, unclosed);
return cmQuotes;

/**
 * Fills {@code this.quotes} with one {@code CoreQuote} per quote returned by
 * {@code QuoteAnnotator.gatherQuotes} for this document's annotation.
 */
private void buildDocumentQuotesList() {
  this.quotes = QuoteAnnotator.gatherQuotes(this.annotationDocument)
      .stream()
      // wrap each raw quote together with a reference to this document
      .map(rawQuote -> new CoreQuote(this, rawQuote))
      .collect(Collectors.toList());
}

/**
 * Annotate quotes and extract them like sentences.
 *
 * @param properties full property set; only the entries under the
 *                   {@code Annotator.STANFORD_QUOTE + '.'} prefix are handed to the annotator
 * @return a new {@code QuoteAnnotator} built from the extracted properties
 */
public Annotator quote(Properties properties) {
 final String quotePrefix = Annotator.STANFORD_QUOTE + '.';
 final Properties quoteProperties =
   PropertiesUtils.extractPrefixedProperties(properties, quotePrefix);
 return new QuoteAnnotator(quoteProperties);
}

   DIRECTED_QUOTES.get(quote).equals(c)) {
  if (c.equals("’")) {
   if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) {
 if ((start < 0) && !matchesPrevQuote(c, prevQuote) &&
   (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) ||
    (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) {
  start = i;
    (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) ||
    (c.equals("\"") && isDoubleQuoteEnd(text, i)))) ||
    (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) ||  // latex quotes are kind of problematic
    (DIRECTED_QUOTES.containsKey(quote) &&
      DIRECTED_QUOTES.get(quote).equals(c) &&
if (!isAQuoteMapStarter(start, quotesMap) && start >= 0 && start < text.length() - 3) {
 if (EXTRACT_UNCLOSED) {
  unclosedQuotes.add(new Pair<>(start, text.length()));
 Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> embedded = recursiveQuotes(toPass, offset, null);
     recursiveQuotes(toPass, q.first() + qKind.length() + offset, qKindToPass);

 Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens,
   tokenOffset, beginSentence, endSentence, docID);
Comparator<CoreMap> quoteComparator = getQuoteComparator();
Collections.sort(cmQuotes, quoteComparator);
setQuoteIndices(cmQuotes, unclosed);
return cmQuotes;

/**
 * Recursively collects every quote reachable from {@code curr} through nested
 * {@code QuotationsAnnotation} lists.
 *
 * <p>For each level, the quotes gathered from inside each direct quote come first (in
 * iteration order), followed by the direct quotes themselves.
 *
 * @param curr the map whose {@code QuotationsAnnotation} is read
 * @return a new list of all gathered quotes; a new empty list when {@code curr} has no
 *         {@code QuotationsAnnotation}
 */
public static List<CoreMap> gatherQuotes(CoreMap curr) {
 final List<CoreMap> directQuotes = curr.get(CoreAnnotations.QuotationsAnnotation.class);
 if (directQuotes == null) {
  // no quotes at this level
  return Generics.newArrayList();
 }
 final List<CoreMap> gathered = Generics.newArrayList();
 for (CoreMap directQuote : directQuotes) {
  // descend first so nested quotes precede this level's quotes
  gathered.addAll(gatherQuotes(directQuote));
 }
 gathered.addAll(directQuotes);
 return gathered;
}

/**
 * Annotate quotes and extract them like sentences.
 *
 * @param properties full property set; only the entries under the
 *                   {@code Annotator.STANFORD_QUOTE + '.'} prefix are handed to the annotator
 * @return a new {@code QuoteAnnotator} built from the extracted properties
 */
public Annotator quote(Properties properties) {
 final String quotePrefix = Annotator.STANFORD_QUOTE + '.';
 final Properties quoteProperties =
   PropertiesUtils.extractPrefixedProperties(properties, quotePrefix);
 return new QuoteAnnotator(quoteProperties);
}

text = xmlFreeText(text, annotation);
 Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> overall = getQuotes(quotesFrom);
 String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesUnicode =
   getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedUnicode = null;
 if (EXTRACT_UNCLOSED) {
  cmUnclosedUnicode = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true);
 int numUnicode = countQuotes(cmQuotesUnicode);
  quotesFrom = replaceUnicode(text);
 overall = getQuotes(quotesFrom);
 docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesAscii = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedAscii = null;
 if (EXTRACT_UNCLOSED) {
  cmUnclosedAscii = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true);
 int numAsciiSingle = countQuotes(cmQuotesAscii);
 overall = getQuotes(quotesFrom);
 docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
 List<CoreMap> cmQuotesAsciiNoSingle =
   getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false);
 List<CoreMap> cmUnclosedAsciiNoSingle = null;
 if (EXTRACT_UNCLOSED) {

/**
 * Complete the wrapping process post annotation by a pipeline.
 *
 * <p>Does nothing when the document has no {@code SentencesAnnotation}. Otherwise wraps the
 * sentences, builds the document-wide entity mention list when the first sentence reports
 * entity mentions, and builds the document-wide quote list.
 */
public void wrapAnnotations() {
 // without sentences there is nothing to wrap
 if (this.annotationDocument.get(CoreAnnotations.SentencesAnnotation.class) == null) {
  return;
 }
 wrapSentences();
 // if there are entity mentions, build a document wide list
 final boolean firstSentenceHasMentions =
   !sentences.isEmpty() && sentences.get(0).entityMentions() != null;
 if (firstSentenceHasMentions) {
  buildDocumentEntityMentionsList();
 }
 // if there are quotes, build a document wide list
 // NOTE(review): gatherQuotes appears to always return a list (possibly empty), so this
 // check looks always true -- confirm before simplifying.
 if (QuoteAnnotator.gatherQuotes(this.annotationDocument) != null) {
  buildDocumentQuotesList();
 }
}

/**
 * Reads the serialized annotation stored at {@code baseFileName + ".ser.gz"} and re-runs
 * quote annotation on it before returning it.
 *
 * @param text         passed through to {@code processCoreNLPIfDoesNotExist}
 * @param baseFileName path prefix of the serialized file; {@code ".ser.gz"} is appended
 * @param props        passed through to {@code processCoreNLPIfDoesNotExist}
 * @return the deserialized document after a fresh {@code QuoteAnnotator} pass
 * @throws IOException declared by this method; presumably raised by the file helpers it calls
 */
public static Annotation getAnnotatedFile(String text, String baseFileName, Properties props) throws IOException {
 final File serializedFile = new File(baseFileName + ".ser.gz");
 // NOTE(review): judging by its name, this creates the file only when it is missing -- confirm.
 processCoreNLPIfDoesNotExist(serializedFile, props, text);
 final Annotation annotated = ExtractQuotesUtil.readSerializedProtobufFile(serializedFile);
 // Important! Re-annotate to take into account that certain tokens are removed
 // in the serialization process.
 final QuoteAnnotator quoteAnnotator = new QuoteAnnotator(new Properties());
 quoteAnnotator.annotate(annotated);
 return annotated;
}

/**
 * Returns {@code text} after passing it through {@code asciiQuotes}.
 *
 * @param text the text to convert
 * @return the value produced by {@code asciiQuotes(text)}
 */
public static String replaceUnicode(String text) {
 final String converted = asciiQuotes(text);
 return converted;
}

pw.println();
pw.println("Extracted quotes: ");
List<CoreMap> allQuotes = QuoteAnnotator.gatherQuotes(annotation);
for (CoreMap quote : allQuotes) {
 String speakerString;

for (CoreMap quote : QuoteAnnotator.gatherQuotes(annotation)) {
 Integer firstSpeakerTokenIndex = quote.get(MentionBeginAnnotation.class);
 if (firstSpeakerTokenIndex != null) {

List<CoreMap> quotes = QuoteAnnotator.gatherQuotes(doc);
l1.set("quotes", quotes.stream().map(quote -> (Consumer<Writer>) (Writer l2) -> {
 l2.set("id", quote.get(CoreAnnotations.QuotationIndexAnnotation.class));

/**
 * Fills {@code this.quotes} with one {@code CoreQuote} per quote returned by
 * {@code QuoteAnnotator.gatherQuotes} for this document's annotation.
 */
private void buildDocumentQuotesList() {
  this.quotes = QuoteAnnotator.gatherQuotes(this.annotationDocument)
      .stream()
      // wrap each raw quote together with a reference to this document
      .map(rawQuote -> new CoreQuote(this, rawQuote))
      .collect(Collectors.toList());
}

/**
 * Recursively collects every quote reachable from {@code curr} through nested
 * {@code QuotationsAnnotation} lists.
 *
 * <p>For each level, the quotes gathered from inside each direct quote come first (in
 * iteration order), followed by the direct quotes themselves.
 *
 * @param curr the map whose {@code QuotationsAnnotation} is read
 * @return a new list of all gathered quotes; a new empty list when {@code curr} has no
 *         {@code QuotationsAnnotation}
 */
public static List<CoreMap> gatherQuotes(CoreMap curr) {
 final List<CoreMap> directQuotes = curr.get(CoreAnnotations.QuotationsAnnotation.class);
 if (directQuotes == null) {
  // no quotes at this level
  return Generics.newArrayList();
 }
 final List<CoreMap> gathered = Generics.newArrayList();
 for (CoreMap directQuote : directQuotes) {
  // descend first so nested quotes precede this level's quotes
  gathered.addAll(gatherQuotes(directQuote));
 }
 gathered.addAll(directQuotes);
 return gathered;
}

Javadoc

An annotator which picks quotations out of the given text. Allows for embedded quotations so long as they are either directed unicode quotes or are of a different type of quote than the outer quotations (e.g. "'Gadzooks' is what he said to me" is legal whereas "They called me "Danger" when I was..." is illegal). Uses regular-expression-like rules to find quotes and does not depend on the tokenizer, which allows quotes like ''Tis true!' to be correctly identified.

Considers regular ascii ("", '', ``'', and `') as well as "smart" and international quotation marks as follows: “”,‘’, «», ‹›, 「」, 『』, „”, and ‚’.

Note: extracts everything within these pairs as a whole quote segment, which may or may not be the desired behaviour for texts that use different formatting styles than standard english ones.

There are a number of options that can be passed to the quote annotator to customize its behaviour:

singleQuotes: "true" or "false", indicating whether or not to consider ' tokens to be quotation marks (default=false).
maxLength: maximum character length of quotes to consider (default=-1).
asciiQuotes: "true" or "false", indicating whether or not to convert all quotes to ascii quotes before processing (can help when there are errors in quote directionality) (default=false).
allowEmbeddedSame: "true" or "false" indicating whether or not to allow smart/directed (everything except " and ') quotes of the same kind to be embedded within one another (default=false).
extractUnclosedQuotes: "true" or "false" indicating whether or not to extract unclosed quotes. If "true", an UnclosedQuotationsAnnotation that is structured exactly the same as the QuotationsAnnotation will be added to the document. Any nested unclosed quotations will be contained in nested UnclosedQuotationsAnnotation on the target unclosed quotation (default=false).

The annotator adds a QuotationsAnnotation to the Annotation which returns a List that contains the following information:

CharacterOffsetBeginAnnotation
CharacterOffsetEndAnnotation
QuotationIndexAnnotation
QuotationsAnnotation (if there are embedded quotes)
TokensAnnotation (if the tokenizer is run before the quote annotator)
TokenBeginAnnotation (if the tokenizer is run before the quote annotator)
TokenEndAnnotation (if the tokenizer is run before the quote annotator)
SentenceBeginAnnotation (if the sentence splitter has been run before the quote annotator)
SentenceEndAnnotation (if the sentence splitter has been run before the quote annotator)

Most used methods

<init>
Return a QuoteAnnotator that isolates quotes denoted by the ASCII characters " and ' as well as a va
annotate
asciiQuotes
countQuotes
gatherQuotes
getCoreMapQuotes
getQuoteComparator
getQuotes
isAQuoteMapStarter
isDoubleQuoteEnd
isSingleQuote
isSingleQuoteEnd

Popular in Java

Running tasks concurrently on multiple threads
runOnUiThread (Activity)
getExternalFilesDir (Context)
scheduleAtFixedRate (Timer)
Proxy (java.net)
This class represents proxy server settings. A created instance of Proxy stores a type and an addres
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
Arrays (java.util)
This class contains various methods for manipulating arrays (such as sorting and searching). This cl
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
IsNull (org.hamcrest.core)
Is the value null?
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
From CI to AI: The AI layer in your organization

How to use QuoteAnnotator in edu.stanford.nlp.pipeline

Best Java code snippets using edu.stanford.nlp.pipeline.QuoteAnnotator (Showing top 20 results out of 315)

How to use
QuoteAnnotator
in
edu.stanford.nlp.pipeline