/** Returns this word's lemma, delegating to the wrapped label. */
@Override public String lemma() { return label.lemma(); }
/**
 * The subject of this relation triple, as a String of the subject's lemmas.
 * This method will additionally strip out punctuation as well.
 */
public String subjectLemmaGloss() {
  return StringUtils.join(
      canonicalSubject.stream()
          // Drop punctuation tokens, identified by their POS tag.
          .filter(tok -> !tok.tag().matches("[.?,:;'\"!]"))
          // Prefer the lemma; fall back to the surface word when absent.
          .map(tok -> tok.lemma() != null ? tok.lemma() : tok.word()),
      " ");
}
/**
 * The object of this relation triple, as a String of the object's lemmas.
 * This method will additionally strip out punctuation as well.
 */
public String objectLemmaGloss() {
  return StringUtils.join(
      canonicalObject.stream()
          // Drop punctuation tokens, identified by their POS tag.
          .filter(tok -> !tok.tag().matches("[.?,:;'\"!]"))
          // Prefer the lemma; fall back to the surface word when absent.
          .map(tok -> tok.lemma() != null ? tok.lemma() : tok.word()),
      " ");
}
/**
 * Maps a token to the string used for context matching: the lemma when
 * lemma-based context tokens are enabled, otherwise the surface word;
 * lower-cased when lower-case context matching is configured.
 */
@Override public String apply(CoreLabel l) {
  String s;
  if (PatternFactory.useLemmaContextTokens) {
    s = l.lemma();
    // Lemmas must have been annotated upstream when this flag is set.
    assert s != null : "Lemma is null and useLemmaContextTokens is true";
  } else
    s = l.word();
  if (ConstantsAndVariables.matchLowerCaseContext)
    s = s.toLowerCase();
  assert s != null;
  return s;
} };
/**
 * Lemmatizes the given text and writes the lemmas, space-separated and
 * newline-terminated, to the output stream as UTF-8.
 *
 * @param arg the text to lemmatize; a null argument is silently ignored
 * @param outStream the stream written to; flushed but intentionally not closed
 * @throws IOException if writing to the stream fails
 */
public void handleLemma(String arg, OutputStream outStream) throws IOException {
  if (arg == null) {
    return;
  }
  List<CoreLabel> tokens = parser.lemmatize(arg);
  // Charset constant instead of the string name "utf-8": no failure path
  // through UnsupportedEncodingException and no charset-name lookup.
  OutputStreamWriter osw =
      new OutputStreamWriter(outStream, java.nio.charset.StandardCharsets.UTF_8);
  for (int i = 0; i < tokens.size(); ++i) {
    CoreLabel word = tokens.get(i);
    if (i > 0) {
      osw.write(" ");
    }
    // Fall back to the surface form when no lemma is available, rather than
    // hitting a NullPointerException in Writer.write(String); this matches
    // the lemma-or-word fallback used elsewhere in the codebase.
    String lemma = word.lemma();
    osw.write(lemma == null ? word.word() : lemma);
  }
  osw.write("\n");
  osw.flush();
}
/** * @see #conjugateEnglish(String, boolean) */
public String conjugateEnglish(CoreLabel token, boolean negated) {
  // Use the lemma when present; otherwise fall back to the surface word.
  String lemma = token.lemma();
  String surface = token.word();
  return conjugateEnglish(lemma == null ? surface : lemma, negated);
}
/** * @see #conjugateEnglish(String, boolean) */
public String conjugateEnglish(CoreLabel token) {
  // Non-negated variant; lemma preferred, surface word as fallback.
  String lemma = token.lemma();
  String surface = token.word();
  return conjugateEnglish(lemma == null ? surface : lemma, false);
}
}
/**
 * The relation of this relation triple, as a String of the relation's lemmas.
 * This method will additionally strip out punctuation as well, and lower-cases the relation.
 */
public String relationLemmaGloss() {
  // Construct a human readable relation string
  String relationGloss = (
      (prefixBe ? "be " : "")
      + StringUtils.join(relation.stream()
          // Keep a token when its tag is null, or when neither its tag nor
          // its (non-null) lemma looks like punctuation.
          .filter(x -> x.tag() == null || (!x.tag().matches("[.?,:;'\"!]") && (x.lemma() == null || !x.lemma().matches("[.,;'\"?!]"))))
          // Prefer the lemma; fall back to the surface word when absent.
          .map(x -> x.lemma() == null ? x.word() : x.lemma()), " ")
          .toLowerCase()
      + (suffixBe ? " be" : "")
      + (suffixOf ? " of" : "")
      + (istmod ? " at_time" : "")
  ).trim();
  // Some cosmetic tweaks
  if ("'s".equals(relationGloss)) {
    return "have";
  } else {
    return relationGloss;
  }
}
private static void print(Annotation annotation, PrintWriter pw, Options options) throws IOException { List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); if(sentences != null) { for (CoreMap sentence : sentences) { StringBuilder sentenceToWrite = new StringBuilder(); for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { sentenceToWrite.append(" "); sentenceToWrite.append(token.lemma().toLowerCase()); if (token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) //verb sentenceToWrite.append("_V"); else if (token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("N")) //noun sentenceToWrite.append("_N"); } pw.print(sentenceToWrite); //omit first space } } }
private static boolean containsStopWord(CoreLabel l, Set<String> commonEngWords, java.util.regex.Pattern ignoreWordRegex) { // if(useWordResultCache.containsKey(l.word())) // return useWordResultCache.get(l.word()); if ((commonEngWords != null && ((lemmaExists(l) && commonEngWords.contains(l.lemma())) || commonEngWords.contains(l.word()))) || (ignoreWordRegex != null && ((lemmaExists(l) && ignoreWordRegex.matcher(l.lemma()).matches()) || ignoreWordRegex.matcher(l.word()).matches()))){ //|| (ignoreWords !=null && (ignoreWords.contains(l.lemma()) || ignoreWords.contains(l.word())))) { // useWordResultCache.putIfAbsent(l.word(), false); return true; } // // if (l.word().length() >= minLen4Fuzzy) { // try { // String matchedFuzzy = NoisyLabelSentences.containsFuzzy(commonEngWords, // l.word(), minLen4Fuzzy); // if (matchedFuzzy != null) { // synchronized (commonEngWords) { // commonEngWords.add(l.word()); // System.out.println("word is " + l.word() + " and matched fuzzy with " + // matchedFuzzy); // } // useWordResultCache.putIfAbsent(l.word(), false); // return false; // } // } catch (Exception e) { // e.printStackTrace(); // System.out.println("Exception " + " while fuzzy matching " + l.word()); // } // } // useWordResultCache.putIfAbsent(l.word(), true); return false; }
/**
 * Builds the four split patterns for this mention:
 * [0] the head word's lemma,
 * [1] the pattern over the closest premodifier plus the head,
 * [2] the pattern over all premodifiers plus the head,
 * [3] the pattern from the no-argument {@code getPattern()}.
 * With no premodifiers, [1] and [2] are the head lemma as well.
 */
public String[] getSplitPattern() {
  ArrayList<ArrayList<IndexedWord>> premodifiers = getPremodifiers();
  String[] components = new String[4];
  components[0] = headWord.lemma();
  int count = premodifiers.size();
  if (count == 0) {
    components[1] = headWord.lemma();
    components[2] = headWord.lemma();
  } else if (count == 1) {
    ArrayList<AbstractCoreLabel> single = Generics.newArrayList();
    single.addAll(premodifiers.get(count - 1));
    single.add(headWord);
    components[1] = getPattern(single);
    components[2] = getPattern(single);
  } else {
    // Closest premodifier only.
    ArrayList<AbstractCoreLabel> closest = Generics.newArrayList();
    closest.addAll(premodifiers.get(count - 1));
    closest.add(headWord);
    components[1] = getPattern(closest);
    // Every premodifier, in order.
    ArrayList<AbstractCoreLabel> all = Generics.newArrayList();
    for (ArrayList<IndexedWord> premodifier : premodifiers) {
      all.addAll(premodifier);
    }
    all.add(headWord);
    components[2] = getPattern(all);
  }
  components[3] = getPattern();
  return components;
}
/**
 * Tests every cross-cluster mention pair against the coref dictionary,
 * skipping pronominal mentions and pairs whose head lemmas are equal.
 * Returns false as soon as any tested pair fails; returns true only when at
 * least one pair was tested and all tested pairs passed.
 */
public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster,
    Dictionaries dict, int dictColumn, int freq) {
  boolean anyPairMatched = false;
  for (Mention mention : menCluster.getCorefMentions()) {
    if (mention.isPronominal()) {
      continue;
    }
    for (Mention antecedent : antCluster.getCorefMentions()) {
      boolean skip = antecedent.isPronominal()
          || mention.headWord.lemma().equals(antecedent.headWord.lemma());
      if (skip) {
        continue;
      }
      if (!entityCorefDictionary(mention, antecedent, dict, dictColumn, freq)) {
        // A single failing pair vetoes the whole cluster pair.
        return false;
      }
      anyPairMatched = true;
    }
  }
  return anyPairMatched;
}
/**
 * Tests every cross-cluster mention pair against the coref dictionary,
 * skipping pronominal mentions and pairs whose head lemmas are equal.
 * Returns false as soon as any tested pair fails; returns true only when at
 * least one pair was tested and every tested pair passed.
 */
public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster, Dictionaries dict, int dictColumn, int freq) {
  boolean ret = false;
  for (Mention men : menCluster.getCorefMentions()) {
    if (men.isPronominal()) continue;
    for (Mention ant : antCluster.getCorefMentions()) {
      // Skip pronominal antecedents and pairs sharing a head lemma.
      if (ant.isPronominal() || men.headWord.lemma().equals(ant.headWord.lemma())) continue;
      if (entityCorefDictionary(men, ant, dict, dictColumn, freq)) {
        ret = true;
      } else {
        // One failing pair vetoes the whole cluster pair.
        return false;
      }
    }
  }
  return ret;
}
/**
 * Converts a tree to the Morfette training format: one token per line, in the
 * form {@code word lemma morph}, where the morphological string falls back to
 * the preterminal tag value and the lemma falls back to the token's value
 * when either is missing or empty.
 *
 * @param tree the parse tree to convert
 * @return the Morfette-formatted string, one line per yield token
 */
private static String treeToMorfette(Tree tree) {
  StringBuilder sb = new StringBuilder();
  List<Label> yield = tree.yield();
  List<Label> tagYield = tree.preTerminalYield();
  assert yield.size() == tagYield.size();
  int listLen = yield.size();
  for (int i = 0; i < listLen; ++i) {
    CoreLabel token = (CoreLabel) yield.get(i);
    CoreLabel tag = (CoreLabel) tagYield.get(i);
    String morphStr = token.originalText();
    // isEmpty() instead of equals("") — same test, idiomatic form.
    if (morphStr == null || morphStr.isEmpty()) {
      morphStr = tag.value();
    }
    String lemma = token.lemma();
    if (lemma == null || lemma.isEmpty()) {
      lemma = token.value();
    }
    sb.append(String.format("%s %s %s%n", token.value(), lemma, morphStr));
  }
  return sb.toString();
}
private void statsWithoutApplyingPatterns(Map<String, DataInstance> sents, PatternsForEachToken patternsForEachToken, Counter<E> patternsLearnedThisIter, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted){ for (Entry<String, DataInstance> sentEn : sents.entrySet()) { Map<Integer, Set<E>> pat4Sent = patternsForEachToken.getPatternsForAllTokens(sentEn.getKey()); if (pat4Sent == null) { throw new RuntimeException("How come there are no patterns for " + sentEn.getKey()); } for (Entry<Integer, Set<E>> en : pat4Sent .entrySet()) { CoreLabel token = null; Set<E> p1 = en.getValue(); // Set<Integer> p1 = en.getValue().first(); // Set<Integer> p2 = en.getValue().second(); // Set<Integer> p3 = en.getValue().third(); for (E index : patternsLearnedThisIter.keySet()) { if (p1.contains(index)) { if (token == null) token = sentEn.getValue().getTokens().get(en.getKey()); wordsandLemmaPatExtracted.incrementCount(CandidatePhrase.createOrGet(token.word(), token.lemma()), index); } } } } }
/**
 * Featurize a given sentence: unigram counts over lower-cased lemmas (numbers
 * collapsed into a single "**num**" bucket) plus bigram features over
 * alphabetic lemmas, with "^" and "$" marking the sentence boundaries.
 *
 * @param sentence The sentence to featurize.
 * @return A counter encoding the featurized sentence.
 */
private static Counter<String> featurize(CoreMap sentence) {
  ClassicCounter<String> features = new ClassicCounter<>();
  String previousLemma = "^";
  for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
    String lemma = token.lemma().toLowerCase();
    // Collapse all numeric tokens into a single feature bucket.
    if (!number.matcher(lemma).matches()) {
      features.incrementCount(lemma);
    } else {
      features.incrementCount("**num**");
    }
    // Bigram features are built over alphabetic lemmas only.
    if (alpha.matcher(lemma).matches()) {
      features.incrementCount(previousLemma + "__" + lemma);
      previousLemma = lemma;
    }
  }
  features.incrementCount(previousLemma + "__$");
  return features;
}
/** Fills in any missing lemma on the tree's yield tokens, in place. */
private static void addLemmata(Tree tree) {
  tree.yield().forEach(leaf -> {
    CoreLabel token = (CoreLabel) leaf;
    // Leave already-annotated lemmas untouched.
    if (token.lemma() != null) {
      return;
    }
    token.setLemma(MORPH.lemma(token.word(), token.tag()));
  });
}
/**
 * Scans tokens [startIndex, endIndex) of the given sentence for a reporting
 * verb and, through the sentence's dependency graph, tries to attribute the
 * utterance to that verb's subject.
 *
 * @return true if a subject was found and recorded via findSubject
 */
private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List<CoreMap> sentences,
    int startIndex, int endIndex, Dictionaries dict) {
  List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
  for (int i = startIndex ; i < endIndex ; i++) {
    CoreLabel cl = sent.get(i);
    // Only consider narration tokens (utterance 0), not quoted speech.
    if (cl.get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
    String lemma = cl.lemma();
    String word = cl.word();
    if (dict.reportVerb.contains(lemma) && cl.tag().startsWith("V")) {
      // find subject
      // Prefer enhanced dependencies; fall back to basic when unavailable.
      SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      if (dependency == null) {
        dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }
      IndexedWord w = dependency.getNodeByWordPattern(word);

      if (w != null) {
        if (findSubject(doc, dependency, w, sentNum, utterNum)) return true;
        // Walk up the dependency path while it stays verbal/modal.
        for (IndexedWord p : dependency.getPathToRoot(w)) {
          if (!p.tag().startsWith("V") && !p.tag().startsWith("MD")) break;
          if (findSubject(doc, dependency, p, sentNum, utterNum)) return true; // handling something like "was talking", "can tell"
        }
      } else {
        Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word);
      }
    }
  }
  return false;
}
/** * A funky little helper method to interpret each token of the sentence as an HTML string, and translate it back to text. * Note that this is <b>in place</b>. */ public void unescapeHTML() { // Change in the protobuf for (int i = 0; i < sentence.length(); ++i) { CoreNLPProtos.Token.Builder token = sentence.rawToken(i); token.setWord(StringUtils.unescapeHtml3(token.getWord())); token.setLemma(StringUtils.unescapeHtml3(token.getLemma())); } // Change in the annotation CoreMap cm = sentence.document.asAnnotation().get(CoreAnnotations.SentencesAnnotation.class).get(sentence.sentenceIndex()); for (CoreLabel token : cm.get(CoreAnnotations.TokensAnnotation.class)) { token.setWord(StringUtils.unescapeHtml3(token.word())); token.setLemma(StringUtils.unescapeHtml3(token.lemma())); } } }