} else if (args[0].equals("-stem")) { for (int i = 1; i < args.length; i++) { System.out.println(args[i] + " --> " + stemStatic(WordTag.valueOf(args[i], "_"))); Morphology morph = new Morphology(new FileReader(arg), flags); for (Word next; (next = morph.next()) != null; ) { System.out.print(next);
/**
 * Sets the lemma on the label of every leaf of the given tree, modifying the
 * tree in place, and returns that same tree.
 *
 * The part-of-speech tag used for lemmatization is taken from the leaf label
 * when it implements {@code HasTag} and carries a non-null tag; otherwise it
 * is looked up by position in the tree's tagged yield. Leaves whose label is
 * {@code null} are skipped and do not advance that position.
 *
 * @param t The tree whose leaf labels receive lemmas
 * @return The same tree instance {@code t}
 * @throws IllegalArgumentException if a non-null leaf label does not
 *         implement {@code HasLemma}
 */
@Override
public Tree transformTree(Tree t) {
  // A new Morphology is built for each call rather than being shared.
  Morphology morpha = new Morphology();
  // Computed lazily: only needed when some leaf label has no usable tag.
  List<TaggedWord> taggedYield = null;
  int position = 0;

  for (Tree leaf : t.getLeaves()) {
    Label leafLabel = leaf.label();
    if (leafLabel != null) {
      String pos = (leafLabel instanceof HasTag) ? ((HasTag) leafLabel).tag() : null;
      if (pos == null) {
        if (taggedYield == null) {
          taggedYield = t.taggedYield();
        }
        pos = taggedYield.get(position).tag();
      }

      if (!(leafLabel instanceof HasLemma)) {
        throw new IllegalArgumentException("Got a tree with labels which do not support lemma");
      }
      HasLemma lemmaTarget = (HasLemma) leafLabel;
      lemmaTarget.setLemma(morpha.lemma(leafLabel.value(), pos, true));
      position++;
    }
  }
  return t;
}
/**
 * Lemmatizes a word given its part-of-speech tag, using the shared static lexer.
 * The static lexer is initialized via {@code initStaticLexer()} before use, and
 * the whole call is synchronized on the class.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @param lowercase If true, words other than proper nouns will be changed
 *        to all lowercase
 * @return The lemma for the word
 */
public static synchronized String lemmaStatic(String word, String tag, boolean lowercase) {
  initStaticLexer();
  final String lemma = lemmatize(word, tag, staticLexer, lowercase);
  return lemma;
}
/**
 * Stores a lemma for {@code word} in {@code map} under the annotation key {@code ann}.
 *
 * With an empty tag the value is {@code morpha.stem(word)}. With a non-empty
 * tag the value is the phrasal-verb lemma when {@code phrasalVerb} yields one,
 * and {@code morpha.lemma(word, tag)} otherwise.
 *
 * @param morpha The Morphology used to compute the lemma
 * @param ann The annotation key the lemma is stored under
 * @param map The map that receives the lemma
 * @param word The word to lemmatize
 * @param tag The part-of-speech tag; must not be null (it is dereferenced
 *        directly), but may be empty
 */
private static void addLemma(Morphology morpha,
                             Class<? extends CoreAnnotation<String>> ann,
                             CoreMap map, String word, String tag) {
  // No tag available: fall back to the tag-free stemmer.
  if (tag.isEmpty()) {
    map.set(ann, morpha.stem(word));
    return;
  }

  String phrasal = phrasalVerb(morpha, word, tag);
  String lemma = (phrasal != null) ? phrasal : morpha.lemma(word, tag);
  map.set(ann, lemma);
}
/** If a token is a phrasal verb with an underscore between a verb and a * particle, return the phrasal verb lemmatized. If not, return null */ private static String phrasalVerb(Morphology morpha, String word, String tag) { // must be a verb and contain an underscore assert(word != null); assert(tag != null); if(!tag.startsWith("VB") || !word.contains("_")) return null; // check whether the last part is a particle String[] verb = word.split("_"); if(verb.length != 2) return null; String particle = verb[1]; if(particles.contains(particle)) { String base = verb[0]; String lemma = morpha.lemma(base, tag); return lemma + '_' + particle; } return null; }
@Override public void visitTree(Tree t) { // A single Morphology is not threadsafe, so to make this class // threadsafe, we have to create a new Morphology for each visit processTree(t, null, new Morphology()); }
/**
 * Adds the LemmaAnnotation to the given CoreLabel by delegating to the
 * two-argument {@code stem} with {@code CoreAnnotations.LemmaAnnotation.class}
 * as the annotation key.
 *
 * @param label The label to which the lemma is added
 */
public void stem(CoreLabel label) {
  this.stem(label, CoreAnnotations.LemmaAnnotation.class);
}
/**
 * Tags the given tokens and returns one CoreLabel per tagged word, each
 * carrying the word, its tag, and the lemma added by {@code Morphology.stem}.
 *
 * Tags come from the loaded tagger when {@code testOptions.preTag} is set,
 * and from the tagged yield of a parse of the tokens otherwise.
 *
 * Only works on English, as it is hard coded to use the Morphology class,
 * which is English-only.
 *
 * @param tokens The tokens to tag and lemmatize
 * @return A new list of CoreLabels, one per tagged word, in order
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  List<TaggedWord> taggedWords;
  if (!getOp().testOptions.preTag) {
    // No pre-tagging requested: take the tags from a full parse.
    Tree parsed = parse(tokens);
    taggedWords = parsed.taggedYield();
  } else {
    Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
    taggedWords = tagger.apply(tokens);
  }

  Morphology morphology = new Morphology();
  List<CoreLabel> result = Generics.newArrayList();
  for (TaggedWord taggedWord : taggedWords) {
    CoreLabel coreLabel = new CoreLabel();
    coreLabel.setWord(taggedWord.word());
    coreLabel.setTag(taggedWord.tag());
    morphology.stem(coreLabel);
    result.add(coreLabel);
  }
  return result;
}
/**
 * Lemmatizes a word given its part-of-speech tag, with lowercasing enabled.
 * Equivalent to calling the three-argument {@code lemmaStatic} with
 * {@code lowercase} set to {@code true}, so words other than proper nouns
 * will be changed to all lowercase.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @return The lemma for the word
 */
public static synchronized String lemmaStatic(String word, String tag) {
  final boolean lowercase = true;
  return lemmaStatic(word, tag, lowercase);
}
/**
 * Returns a WordTag which has the lemma as the value of {@code word()},
 * computed by the {@code stemStatic(String, String)} overload from the word
 * and tag of the argument.
 * The default is to lowercase non-proper-nouns, unless options have been set.
 *
 * @param wT The word/tag pair to lemmatize
 * @return The WordTag produced by {@code stemStatic(wT.word(), wT.tag())}
 */
public static WordTag stemStatic(WordTag wT) {
  final String word = wT.word();
  final String tag = wT.tag();
  return stemStatic(word, tag);
}
/**
 * Collects the stemmed WordTags for the leaves of a tree.
 *
 * @param t a tree
 * @param threadSafe if true, each word is stemmed with
 *        {@code Morphology.stemStaticSynchronized}; otherwise with
 *        {@code Morphology.stemStatic}
 * @return the WordTags corresponding to the leaves of the tree,
 *         stemmed according to their POS tags in the tree.
 */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t, boolean threadSafe) {
  List<WordTag> result = Generics.newArrayList();
  for (TaggedWord tagged : t.taggedYield()) {
    final String word = tagged.word();
    final String tag = tagged.tag();
    WordTag stemmed;
    if (threadSafe) {
      stemmed = Morphology.stemStaticSynchronized(word, tag);
    } else {
      stemmed = Morphology.stemStatic(word, tag);
    }
    result.add(stemmed);
  }
  return result;
}
/**
 * Lemmatizes a word given its part-of-speech tag by delegating to
 * {@code lemmatize} with the {@code lexer} held by this class.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @param lowercase Passed through to {@code lemmatize} unchanged
 * @return The lemma for the word
 */
public String lemma(String word, String tag, boolean lowercase) {
  final String result = lemmatize(word, tag, lexer, lowercase);
  return result;
}
/**
 * Builds a {@code WordLemmaTag} for the given word and tag, taking the lemma
 * from {@code Morphology.stemStaticSynchronized}.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @return A new WordLemmaTag holding the word, its lemma, and the tag
 */
public WordLemmaTag lemmatize(String word, String tag) {
  WordTag stemmed = Morphology.stemStaticSynchronized(word, tag);
  return new WordLemmaTag(word, stemmed.word(), tag);
}
/**
 * Lemmatize returning a {@code WordLemmaTag}.
 * The lemma is obtained from the two-argument {@code lemma} method using the
 * word and tag of the argument.
 *
 * @param wT The word/tag pair to lemmatize
 * @return A new WordLemmaTag holding the word, its lemma, and the tag
 */
public WordLemmaTag lemmatize(WordTag wT) {
  final String tag = wT.tag();
  final String word = wT.word();
  return new WordLemmaTag(word, lemma(word, tag), tag);
}
public List<? extends HasWord> tagCoreLabelsOrHasWords(List<? extends HasWord> sentence, Morphology morpha, boolean outputLemmas) { if (sentence.size() > 0 && sentence.get(0) instanceof CoreLabel) { List<CoreLabel> coreLabels = castCoreLabels(sentence); tagCoreLabels(coreLabels); if (outputLemmas) { // We may want to lemmatize things without using an existing // Morphology object, as Morphology objects are not // thread-safe, so we would make a new one here if (morpha == null) { morpha = new Morphology(); } lemmatize(coreLabels, morpha); } return coreLabels; } else { List<TaggedWord> taggedSentence = tagSentence(sentence, false); return taggedSentence; } }
/**
 * Returns a new WordTag which has the lemma as the value of {@code word()}.
 * The shared static lexer is initialized via {@code initStaticLexer()} first;
 * the lowercase flag passed to {@code lemmatize} is read from
 * {@code staticLexer.option(1)}.
 * The default is to lowercase non-proper-nouns, unless options have been set.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @return A new WordTag pairing the lemma with the given tag
 */
public static synchronized WordTag stemStatic(String word, String tag) {
  initStaticLexer();
  final boolean lowercase = staticLexer.option(1);
  final String lemma = lemmatize(word, tag, staticLexer, lowercase);
  return new WordTag(lemma, tag);
}
/**
 * Adds lemmas to every CoreLabel in the given list by calling
 * {@code morpha.stem} on each one in iteration order.
 * The input list must already have tags set.
 *
 * @param sentence The CoreLabels to lemmatize in place
 * @param morpha The Morphology object used for each label
 */
public static void lemmatize(List<CoreLabel> sentence, Morphology morpha) {
  for (CoreLabel token : sentence) {
    morpha.stem(token);
  }
}
/**
 * Create a new {@code WordLemmaTag} from the {@code String} passed in,
 * which is split on the divider character without attempting to parse
 * escape sequences:
 * <ul>
 * <li>If the divider does not occur, the whole string is passed to the
 *     single-argument {@code WordLemmaTag} constructor.</li>
 * <li>If the divider occurs exactly once, the text before it is the word,
 *     the text after it is the tag, and the lemma is computed with
 *     {@code Morphology.lemmaStatic}.</li>
 * <li>If the divider occurs more than once, the text before the first
 *     divider is the word, the text between the first and the last divider
 *     is the lemma, and the text after the last divider is the tag.</li>
 * </ul>
 *
 * @param labelStr The string to split into word, lemma and tag
 * @return The new WordLemmaTag
 */
@Override
public Label newLabelFromString(String labelStr) {
  int first = labelStr.indexOf(divider);
  // This check must come before comparing the two indices: when the divider
  // is absent both searches return -1, so "first == second" would also be
  // true and substring(0, -1) would throw StringIndexOutOfBoundsException.
  if (first < 0) {
    return new WordLemmaTag(labelStr);
  }

  int second = labelStr.lastIndexOf(divider);
  String word = labelStr.substring(0, first);
  if (first == second) {
    // Exactly one divider: word and tag are present, lemma is computed.
    String tag = labelStr.substring(first + 1);
    return new WordLemmaTag(word, Morphology.lemmaStatic(word, tag), tag);
  }
  // Several dividers: word, lemma and tag are all given explicitly.
  return new WordLemmaTag(word,
                          labelStr.substring(first + 1, second),
                          labelStr.substring(second + 1));
}
/**
 * Create a new {@code WordLemmaTag} from a word and a tag.
 * The lemma is not supplied by the caller; it is computed by
 * {@code Morphology.stemStatic} from a {@code WordTag} built out of the
 * two arguments.
 *
 * @param word This word is set as the word of this Label
 * @param tag This tag is set as the tag of this Label via {@code setTag}
 */
public WordLemmaTag(String word, String tag) {
  this.word = word;
  this.lemma = Morphology.stemStatic(new WordTag(word, tag)).word();
  setTag(tag);
}