public static Annotation getAnnotatedFile(String text, String baseFileName, Properties props) throws IOException{ File processedFile = new File(baseFileName + ".ser.gz"); processCoreNLPIfDoesNotExist(processedFile, props, text); Annotation doc = ExtractQuotesUtil.readSerializedProtobufFile(processedFile); new QuoteAnnotator(new Properties()).annotate(doc); //important! Re-annotate to take into account that certain tokens are removed in the serialization process. return doc; }
/**
 * Returns the result of passing {@code text} through {@code asciiQuotes}.
 *
 * <p>Judging by the names, this presumably maps Unicode quote characters to
 * their ASCII counterparts; the actual mapping lives in {@code asciiQuotes}
 * (not visible here - confirm).
 *
 * @param text the text to convert
 * @return whatever {@code asciiQuotes(text)} returns
 */
public static String replaceUnicode(String text) {
  final String converted = asciiQuotes(text);
  return converted;
}
text = xmlFreeText(text, annotation); Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> overall = getQuotes(quotesFrom); String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesUnicode = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedUnicode = null; if (EXTRACT_UNCLOSED) { cmUnclosedUnicode = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true); int numUnicode = countQuotes(cmQuotesUnicode); quotesFrom = replaceUnicode(text); overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesAscii = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedAscii = null; if (EXTRACT_UNCLOSED) { cmUnclosedAscii = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true); int numAsciiSingle = countQuotes(cmQuotesAscii); overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesAsciiNoSingle = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedAsciiNoSingle = null; if (EXTRACT_UNCLOSED) {
DIRECTED_QUOTES.get(quote).equals(c)) { if (c.equals("’")) { if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) { if ((start < 0) && !matchesPrevQuote(c, prevQuote) && (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) || (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) { start = i; (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) || (c.equals("\"") && isDoubleQuoteEnd(text, i)))) || (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) || // latex quotes are kind of problematic (DIRECTED_QUOTES.containsKey(quote) && DIRECTED_QUOTES.get(quote).equals(c) && if (!isAQuoteMapStarter(start, quotesMap) && start >= 0 && start < text.length() - 3) { if (EXTRACT_UNCLOSED) { unclosedQuotes.add(new Pair<>(start, text.length())); Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> embedded = recursiveQuotes(toPass, offset, null); recursiveQuotes(toPass, q.first() + qKind.length() + offset, qKindToPass);
Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens, tokenOffset, beginSentence, endSentence, docID); Comparator<CoreMap> quoteComparator = getQuoteComparator(); Collections.sort(cmQuotes, quoteComparator); setQuoteIndices(cmQuotes, unclosed); return cmQuotes;
/**
 * Populates {@code this.quotes} with one {@code CoreQuote} wrapper per quote
 * returned by {@code QuoteAnnotator.gatherQuotes} for this document's
 * annotation, in the order that method returns them.
 */
private void buildDocumentQuotesList() {
  this.quotes = QuoteAnnotator.gatherQuotes(this.annotationDocument)
      .stream()
      .map(rawQuote -> new CoreQuote(this, rawQuote))
      .collect(Collectors.toList());
}
/**
 * Annotate quotes and extract them like sentences.
 *
 * <p>Only the properties selected by {@code PropertiesUtils.extractPrefixedProperties}
 * for the prefix {@code Annotator.STANFORD_QUOTE + '.'} are handed to the new annotator.
 *
 * @param properties the full property set to pull the quote-specific entries from
 * @return a new {@code QuoteAnnotator} built from the extracted properties
 */
public Annotator quote(Properties properties) {
  final String quotePrefix = Annotator.STANFORD_QUOTE + '.';
  final Properties quoteProperties =
      PropertiesUtils.extractPrefixedProperties(properties, quotePrefix);
  return new QuoteAnnotator(quoteProperties);
}
DIRECTED_QUOTES.get(quote).equals(c)) { if (c.equals("’")) { if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) { if ((start < 0) && !matchesPrevQuote(c, prevQuote) && (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) || (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) { start = i; (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) || (c.equals("\"") && isDoubleQuoteEnd(text, i)))) || (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) || // latex quotes are kind of problematic (DIRECTED_QUOTES.containsKey(quote) && DIRECTED_QUOTES.get(quote).equals(c) && if (!isAQuoteMapStarter(start, quotesMap) && start >= 0 && start < text.length() - 3) { if (EXTRACT_UNCLOSED) { unclosedQuotes.add(new Pair<>(start, text.length())); Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> embedded = recursiveQuotes(toPass, offset, null); recursiveQuotes(toPass, q.first() + qKind.length() + offset, qKindToPass);
Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens, tokenOffset, beginSentence, endSentence, docID); Comparator<CoreMap> quoteComparator = getQuoteComparator(); Collections.sort(cmQuotes, quoteComparator); setQuoteIndices(cmQuotes, unclosed); return cmQuotes;
/**
 * Recursively collects every quote reachable from {@code curr} through
 * {@code CoreAnnotations.QuotationsAnnotation}.
 *
 * <p>For each directly attached quote, the quotes nested inside it are added
 * first (via recursion); the directly attached quotes themselves are appended
 * after all of their descendants.
 *
 * @param curr the map whose quotations annotation is inspected
 * @return a new list of all nested and direct quotes; empty (never {@code null})
 *         when {@code curr} carries no quotations annotation
 */
public static List<CoreMap> gatherQuotes(CoreMap curr) {
  List<CoreMap> directQuotes = curr.get(CoreAnnotations.QuotationsAnnotation.class);
  if (directQuotes == null) {
    // no quotations annotation on this node: nothing to collect
    return Generics.newArrayList();
  }

  List<CoreMap> collected = Generics.newArrayList();
  // descendants of every direct quote go in first ...
  for (CoreMap child : directQuotes) {
    collected.addAll(gatherQuotes(child));
  }
  // ... followed by the direct quotes themselves
  collected.addAll(directQuotes);
  return collected;
}
/**
 * Annotate quotes and extract them like sentences.
 *
 * <p>Only the properties selected by {@code PropertiesUtils.extractPrefixedProperties}
 * for the prefix {@code Annotator.STANFORD_QUOTE + '.'} are handed to the new annotator.
 *
 * @param properties the full property set to pull the quote-specific entries from
 * @return a new {@code QuoteAnnotator} built from the extracted properties
 */
public Annotator quote(Properties properties) {
  final String quotePrefix = Annotator.STANFORD_QUOTE + '.';
  final Properties quoteProperties =
      PropertiesUtils.extractPrefixedProperties(properties, quotePrefix);
  return new QuoteAnnotator(quoteProperties);
}
text = xmlFreeText(text, annotation); Pair<List<Pair<Integer, Integer>>, List<Pair<Integer, Integer>>> overall = getQuotes(quotesFrom); String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesUnicode = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedUnicode = null; if (EXTRACT_UNCLOSED) { cmUnclosedUnicode = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true); int numUnicode = countQuotes(cmQuotesUnicode); quotesFrom = replaceUnicode(text); overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesAscii = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedAscii = null; if (EXTRACT_UNCLOSED) { cmUnclosedAscii = getCoreMapQuotes(overall.second(), tokens, sentences, text, docID, true); int numAsciiSingle = countQuotes(cmQuotesAscii); overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List<CoreMap> cmQuotesAsciiNoSingle = getCoreMapQuotes(overall.first(), tokens, sentences, text, docID, false); List<CoreMap> cmUnclosedAsciiNoSingle = null; if (EXTRACT_UNCLOSED) {
/** complete the wrapping process post annotation by a pipeline **/ public void wrapAnnotations() { // wrap all of the sentences if (this.annotationDocument.get(CoreAnnotations.SentencesAnnotation.class) != null) { wrapSentences(); // if there are entity mentions, build a document wide list if ( ! sentences.isEmpty() && sentences.get(0).entityMentions() != null) { buildDocumentEntityMentionsList(); } // if there are quotes, build a document wide list if (QuoteAnnotator.gatherQuotes(this.annotationDocument) != null) buildDocumentQuotesList(); } }
public static Annotation getAnnotatedFile(String text, String baseFileName, Properties props) throws IOException{ File processedFile = new File(baseFileName + ".ser.gz"); processCoreNLPIfDoesNotExist(processedFile, props, text); Annotation doc = ExtractQuotesUtil.readSerializedProtobufFile(processedFile); new QuoteAnnotator(new Properties()).annotate(doc); //important! Re-annotate to take into account that certain tokens are removed in the serialization process. return doc; }
/**
 * Returns the result of passing {@code text} through {@code asciiQuotes}.
 *
 * <p>Judging by the names, this presumably maps Unicode quote characters to
 * their ASCII counterparts; the actual mapping lives in {@code asciiQuotes}
 * (not visible here - confirm).
 *
 * @param text the text to convert
 * @return whatever {@code asciiQuotes(text)} returns
 */
public static String replaceUnicode(String text) {
  final String converted = asciiQuotes(text);
  return converted;
}
pw.println(); pw.println("Extracted quotes: "); List<CoreMap> allQuotes = QuoteAnnotator.gatherQuotes(annotation); for (CoreMap quote : allQuotes) { String speakerString;
for (CoreMap quote : QuoteAnnotator.gatherQuotes(annotation)) { Integer firstSpeakerTokenIndex = quote.get(MentionBeginAnnotation.class); if (firstSpeakerTokenIndex != null) {
List<CoreMap> quotes = QuoteAnnotator.gatherQuotes(doc); l1.set("quotes", quotes.stream().map(quote -> (Consumer<Writer>) (Writer l2) -> { l2.set("id", quote.get(CoreAnnotations.QuotationIndexAnnotation.class));
/**
 * Populates {@code this.quotes} with one {@code CoreQuote} wrapper per quote
 * returned by {@code QuoteAnnotator.gatherQuotes} for this document's
 * annotation, in the order that method returns them.
 */
private void buildDocumentQuotesList() {
  this.quotes = QuoteAnnotator.gatherQuotes(this.annotationDocument)
      .stream()
      .map(rawQuote -> new CoreQuote(this, rawQuote))
      .collect(Collectors.toList());
}
/**
 * Recursively collects every quote reachable from {@code curr} through
 * {@code CoreAnnotations.QuotationsAnnotation}.
 *
 * <p>For each directly attached quote, the quotes nested inside it are added
 * first (via recursion); the directly attached quotes themselves are appended
 * after all of their descendants.
 *
 * @param curr the map whose quotations annotation is inspected
 * @return a new list of all nested and direct quotes; empty (never {@code null})
 *         when {@code curr} carries no quotations annotation
 */
public static List<CoreMap> gatherQuotes(CoreMap curr) {
  List<CoreMap> directQuotes = curr.get(CoreAnnotations.QuotationsAnnotation.class);
  if (directQuotes == null) {
    // no quotations annotation on this node: nothing to collect
    return Generics.newArrayList();
  }

  List<CoreMap> collected = Generics.newArrayList();
  // descendants of every direct quote go in first ...
  for (CoreMap child : directQuotes) {
    collected.addAll(gatherQuotes(child));
  }
  // ... followed by the direct quotes themselves
  collected.addAll(directQuotes);
  return collected;
}