Refine search
/**
 * Counts list separators inside the mention's original span.
 *
 * A comma always counts as one separator. The words "and" / "or"
 * (case-insensitive) count as one separator unless the token directly before
 * them is a comma. The first token of the span (index 0) is never inspected
 * as a separator.
 *
 * @param m The mention whose {@code originalSpan} is scanned
 * @return The number of separators found
 */
private static int numEntitiesInList(Mention m) {
  int separators = 0;
  int spanLength = m.originalSpan.size();
  for (int idx = 1; idx < spanLength; idx++) {
    String current = m.originalSpan.get(idx).word();
    if (current.equals(",")) {
      separators++;
    }
    boolean isConjunction = current.equalsIgnoreCase("and") || current.equalsIgnoreCase("or");
    if (isConjunction) {
      // A conjunction right after a comma ("a, b, and c") was already counted via the comma.
      String previous = m.originalSpan.get(idx - 1).word();
      if ( ! previous.equals(",")) {
        separators++;
      }
    }
  }
  return separators;
}
} // presumably closes the enclosing class, whose header is not visible here
/**
 * Writes one line per token containing the word, the gold answer and the
 * guessed answer, separated by tabs, followed by a single empty line.
 *
 * @param doc The tokens to print
 * @param out Where the lines are written
 */
@Override
public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
  for (CoreLabel token : doc) {
    String guessed = token.get(CoreAnnotations.AnswerAnnotation.class);
    String gold = token.get(CoreAnnotations.GoldAnswerAnnotation.class);
    StringBuilder line = new StringBuilder();
    // StringBuilder renders null as "null", exactly like string concatenation does.
    line.append(token.word()).append("\t").append(gold).append("\t").append(guessed);
    out.println(line.toString());
  }
  out.println();
}
/**
 * Collects the lower-cased words of the mention whose part-of-speech tag is
 * contained in {@code PROPERS}.
 *
 * @param m The mention; tokens {@code startIndex} (inclusive) to
 *          {@code endIndex} (exclusive) of {@code sentenceWords} are examined
 * @return The set of matching lower-cased words (possibly empty)
 */
private static Set<String> getPropers(Mention m) {
  Set<String> properWords = new HashSet<>();
  for (int pos = m.startIndex; pos < m.endIndex; pos++) {
    CoreLabel token = m.sentenceWords.get(pos);
    String tag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    String lowered = token.word().toLowerCase();
    if ( ! PROPERS.contains(tag)) {
      continue;
    }
    properWords.add(lowered);
  }
  return properWords;
}
int rSize = regex.size(); for (int start = searchStart, end = document.size() - regex.size(); start <= end; start++) { boolean failed = false; for (int i = 0; i < rSize; i++) { Pattern pattern = regex.get(i); String exact = entry.exact.get(i); CoreLabel token = document.get(start + i); String NERType = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); String currentType = token.get(CoreAnnotations.AnswerAnnotation.class); (exact != null && ! (ignoreCase ? exact.equalsIgnoreCase(token.word()) : exact.equals(token.word()))) || ! (entry.overwritableTypes.contains(NERType) || myLabels.contains(NERType)) || ! pattern.matcher(token.word()).matches() // last, as this is likely the expensive operation ) { failed = true;
/**
 * Looks at up to 3 tokens to the left of position {@code i} for a word that
 * starts with "weigh", which indicates weight rather than money.
 *
 * @param pl The list of CoreLabel
 * @param i The position to scan left from (the token at {@code i} itself is
 *          not tested)
 * @return whether a weight word is found
 */
private static boolean leftScanFindsWeightWord(List<CoreLabel> pl, int i) {
  if (DEBUG) {
    log.info("leftScan from: " + pl.get(i).word());
  }
  // Never look further left than three tokens, nor before the start of the list.
  int leftmost = Math.max(0, i - 3);
  int j = i - 1;
  while (j >= leftmost) {
    CoreLabel candidate = pl.get(j);
    boolean isWeightWord = candidate.word().startsWith("weigh");
    if (isWeightWord) {
      if (DEBUG) {
        log.info("leftScan found weight: " + candidate.word());
      }
      return true;
    }
    j--;
  }
  return false;
}
/**
 * Look along CD words and see if next thing is a money word.
 *
 * Starting at {@code i}, tokens tagged "CD" are skipped. The first token
 * with a different tag counts as a money word when its tag is "M", "NN" or
 * "NNS" and its word matches {@code CURRENCY_WORD_PATTERN}.
 *
 * @param pl The list of CoreLabel
 * @param i The position to scan right from
 * @return Whether a money word is found; {@code false} if the list ends
 *         before a non-CD token is reached
 */
private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) {
  int j = i;
  if (DEBUG) {
    log.info("rightScan from: " + pl.get(j).word());
  }
  int sz = pl.size();
  while (j < sz && pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
    j++;
  }
  if (j >= sz) {
    return false;
  }
  String tag = pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class);
  String word = pl.get(j).word();
  // Compute the answer once so that the debug output can never disagree with
  // the returned value (the log previously left out the "M" tag).
  boolean moneyWordFound = (tag.equals("M") || tag.equals("NN") || tag.equals("NNS"))
      && CURRENCY_WORD_PATTERN.matcher(word).matches();
  if (DEBUG) {
    log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString(moneyWordFound));
  }
  return moneyWordFound;
}
if (! wi.get(CoreAnnotations.AnswerAnnotation.class).equals(wi.get(CoreAnnotations.GoldAnswerAnnotation.class))) { wrong++; if (!THREE_CLASSES && wi.get(CoreAnnotations.AnswerAnnotation.class).equals("UPPER")) { sb.append(wi.word().toUpperCase()); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("LOWER")) { sb.append(wi.word().toLowerCase()); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("INIT_UPPER")) { sb.append(wi.word().substring(0,1).toUpperCase()) .append(wi.word().substring(1)); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("O")) { sb.append(wi.word()); Matcher alphaMatcher = alphabet.matcher(wi.word()); if (alphaMatcher.matches()) { sb.append("/MIX"); System.err.printf("> wrong = %d ; total = %d%n", wrong, doc.size()); out.println();
/** Write a standard CoNLL format output file. * * @param doc The document: A List of CoreLabel * @param out Where to send the answers to */ @Override @SuppressWarnings({"StringEquality", "StringContatenationInLoop"}) public void printAnswers(List<CoreLabel> doc, PrintWriter out) { // boolean tagsMerged = flags.mergeTags; // boolean useHead = flags.splitOnHead; if ( ! "iob1".equalsIgnoreCase(flags.entitySubclassification)) { deEndify(doc); } for (CoreLabel fl : doc) { String word = fl.word(); if (word == BOUNDARY) { // Using == is okay, because it is set to constant out.println(); } else { String gold = fl.getString(CoreAnnotations.GoldAnswerAnnotation.class); String guess = fl.get(CoreAnnotations.AnswerAnnotation.class); // log.info(word + "\t" + gold + "\t" + guess)); String pos = fl.getString(CoreAnnotations.PartOfSpeechAnnotation.class); String chunk = fl.getString(CoreAnnotations.ChunkAnnotation.class); out.println(fl.word() + '\t' + pos + '\t' + chunk + '\t' + gold + '\t' + guess); } } }
/**
 * Finds the positions of the pronouns "he" and "she" (case-insensitive)
 * within a token range of the document.
 *
 * @param nonQuoteRun The range to scan; both ends are inclusive, and the scan
 *                    stops early at the end of the document's token list
 * @return The token indices, in ascending order, at which a pronoun occurs
 */
protected ArrayList<Integer> scanForPronouns(Pair<Integer, Integer> nonQuoteRun) {
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  ArrayList<Integer> pronounPositions = new ArrayList<>();
  int idx = nonQuoteRun.first;
  while (idx <= nonQuoteRun.second && idx < tokens.size()) {
    String word = tokens.get(idx).word();
    boolean isPronoun = word.equalsIgnoreCase("he") || word.equalsIgnoreCase("she");
    if (isPronoun) {
      pronounPositions.add(idx);
    }
    idx++;
  }
  return pronounPositions;
}
String label = as.getKey(); boolean lastwordlabeled = lastWordLabeled.get(label); if (s.get(as.getValue()).equals(label)) { if (!lastwordlabeled) { startingLabels.add(label); for(int i = listEndedLabels.size() -1 ; i >=0; i--) str += " </" + listEndedLabels.get(i) + ">"; for(String label : startingLabels){ str += " <" + label + "> "; str += " " + s.word(); writer.write(str.trim() + " ");
private static boolean mentionIsRangren(Mention m, List<CoreLabel> sent) { if (m.spanToString().equals("人") && m.startIndex > 0) { String priorWord = sent.get(m.startIndex - 1).word(); // cdm [2016]: This test matches everything because of the 3rd clause! That can't be right! if (priorWord.endsWith("让") || priorWord.endsWith("令") || priorWord.endsWith("")) { return true; } } return false; }
/**
 * Skips over tokens tagged "CD" starting at {@code i} and checks whether the
 * token that follows is a money word like cents or pounds, i.e. it is tagged
 * "NN" or "NNS" and its word matches {@code CURRENCY_WORD_PATTERN}.
 *
 * @param pl The list of CoreLabel
 * @param i The position to scan right from
 * @return Whether a money word is found; {@code false} when the list ends
 *         before a non-CD token is reached
 */
private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) {
  if (DEBUG) {
    log.info("rightScan from: " + pl.get(i).word());
  }
  final int size = pl.size();
  int pos;
  for (pos = i; pos < size; pos++) {
    String currentTag = pl.get(pos).getString(CoreAnnotations.PartOfSpeechAnnotation.class);
    if ( ! currentTag.equals("CD")) {
      break;
    }
  }
  if (pos >= size) {
    return false;
  }
  CoreLabel candidate = pl.get(pos);
  String tag = candidate.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
  String word = candidate.word();
  boolean nounTag = tag.equals("NN") || tag.equals("NNS");
  boolean found = nounTag && CURRENCY_WORD_PATTERN.matcher(word).matches();
  if (DEBUG) {
    log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString(found));
  }
  return found;
}
for(CoreLabel l: doc){ if(l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")){ if(words.size() > 0){ num++; String docid = sentIDprefix + "-"+String.valueOf(num); words.add(l.word()); l.set(CoreAnnotations.ValueAnnotation.class, l.word()); String label = l.get(CoreAnnotations.AnswerAnnotation.class); l.set(CoreAnnotations.TextAnnotation.class, l.word()); l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word()); if(words.size() > 0){ num++; String docid = sentIDprefix + "-"+String.valueOf(num);;
/**
 * Returns the lower-cased noun words of a mention, in sentence order.
 * A token qualifies when its part-of-speech tag is NN, NNS, NNP or NNPS.
 *
 * @param m The mention; tokens {@code startIndex} (inclusive) to
 *          {@code endIndex} (exclusive) of {@code sentenceWords} are examined
 * @return The list of matching lower-cased words (possibly empty)
 */
private static List<String> getContentWords(Mention m) {
  List<String> nouns = new ArrayList<>();
  for (int pos = m.startIndex; pos < m.endIndex; pos++) {
    CoreLabel token = m.sentenceWords.get(pos);
    String tag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    switch (tag) {
      case "NN":
      case "NNS":
      case "NNP":
      case "NNPS":
        nouns.add(token.word().toLowerCase());
        break;
      default:
        break;
    }
  }
  return nouns;
}
} // presumably closes the enclosing class, whose header is not visible here
/**
 * Prints the word, gold answer and guessed answer of every token as one
 * tab-separated line each, then terminates the document with an empty line.
 *
 * @param doc The tokens to print
 * @param out Where the lines are written
 */
@Override
public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
  for (CoreLabel token : doc) {
    String guess = token.get(CoreAnnotations.AnswerAnnotation.class);
    String gold = token.get(CoreAnnotations.GoldAnswerAnnotation.class);
    String line = token.word() + '\t' + gold + '\t' + guess;
    out.println(line);
  }
  out.println();
}
/**
 * Outputs a partial CoNLL-U file with token information (form, lemma, POS)
 * but without any dependency information.
 *
 * Each token becomes one line of ten tab-separated columns: index, word,
 * lemma, coarse tag, POS tag, feature string, three "_" placeholders and the
 * misc field. Missing lemma, tags and misc values are written as "_". The
 * sentence is terminated by an empty line.
 *
 * Every line ends with "\n" rather than the platform line separator, so the
 * token lines and the terminating blank line use the same line ending on all
 * platforms.
 *
 * @param sentence The sentence whose tokens are printed
 * @return The sentence rendered in the format described above
 */
public String printPOSAnnotations(CoreMap sentence) {
  StringBuilder sb = new StringBuilder();

  for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
    String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
    String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
    String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
    String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
    String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
    sb.append(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", token.index(), token.word(),
        lemma, upos, pos, featuresString, "_", "_", "_", misc));
  }
  sb.append("\n");

  return sb.toString();
}
/**
 * Finds the positions of tokens within a range whose word is contained in
 * {@code animacySet}.
 *
 * @param span The range to scan; both ends are inclusive, and the scan stops
 *             early at the end of the document's token list
 * @return The matching token indices in ascending order
 */
public List<Integer> scanForAnimates(Pair<Integer, Integer> span) {
  List<Integer> hits = new ArrayList<>();
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  int idx = span.first;
  while (idx <= span.second && idx < tokens.size()) {
    String word = tokens.get(idx).word();
    if (animacySet.contains(word)) {
      hits.add(idx);
    }
    idx++;
  }
  return hits;
}
for(int i = 0; i < tokenSequence.size(); i ++){ CoreLabel crt = tokenSequence.get(i); CoreLabel next = (i < tokenSequence.size() - 1 ? tokenSequence.get(i + 1) : null); CoreLabel prev = (i > 0 ? tokenSequence.get(i - 1) : null); if (CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null && (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || "CD".equals(next.tag()))) { crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY"); i = changeLeftToRight(tokenSequence, i + 1, next.get(CoreAnnotations.AnswerAnnotation.class), next.tag(), "MONEY") - 1; else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() || CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches()) && prev != null && (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || "CD".equals(prev.tag())) && ! leftScanFindsWeightWord(tokenSequence, i)) { else if ((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() || PERCENT_SYMBOL_PATTERN.matcher(crt.word()).matches()) && prev != null && (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||