@Override public void visitTree(Tree t) { // A single Morphology is not threadsafe, so to make this class // threadsafe, we have to create a new Morphology for each visit processTree(t, null, new Morphology()); }
Morphology morph = new Morphology(new FileReader(arg), flags); for (Word next; (next = morph.next()) != null; ) { System.out.print(next);
public List<? extends HasWord> tagCoreLabelsOrHasWords(List<? extends HasWord> sentence, Morphology morpha, boolean outputLemmas) { if (sentence.size() > 0 && sentence.get(0) instanceof CoreLabel) { List<CoreLabel> coreLabels = castCoreLabels(sentence); tagCoreLabels(coreLabels); if (outputLemmas) { // We may want to lemmatize things without using an existing // Morphology object, as Morphology objects are not // thread-safe, so we would make a new one here if (morpha == null) { morpha = new Morphology(); } lemmatize(coreLabels, morpha); } return coreLabels; } else { List<TaggedWord> taggedSentence = tagSentence(sentence, false); return taggedSentence; } }
// Morphology is not thread-safe; only allocate one when lemma output is requested.
Morphology morpha = (outputLemmas) ? new Morphology() : null;
@Override public void annotate(Annotation annotation) { if (VERBOSE) { log.info("Finding lemmas ..."); } Morphology morphology = new Morphology(); if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) { for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); //log.info("Lemmatizing sentence: " + tokens); for (CoreLabel token : tokens) { String text = token.get(CoreAnnotations.TextAnnotation.class); String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); addLemma(morphology, CoreAnnotations.LemmaAnnotation.class, token, text, posTag); } } } else { throw new RuntimeException("Unable to find words/tokens in: " + annotation); } }
@Override
public Tree transformTree(Tree t) {
  // Sets a lemma on every leaf label of t, in place, and returns t.
  // Morphology is not thread-safe, hence the per-call local instance.
  Morphology morphology = new Morphology();
  // Tagged yield of the tree, computed lazily only if some leaf label
  // does not carry its own tag.
  List<TaggedWord> tagged = null;
  int index = 0;
  for (Tree leaf : t.getLeaves()) {
    Label label = leaf.label();
    if (label == null) {
      // NOTE(review): index is not advanced for null-labeled leaves; if
      // taggedYield() includes such leaves, the fallback tags below could
      // misalign — confirm against Tree.taggedYield() semantics.
      continue;
    }
    String tag;
    if (!(label instanceof HasTag) || ((HasTag) label).tag() == null) {
      // Label has no tag of its own: fall back to the tree's tagged yield,
      // indexed by leaf position.
      if (tagged == null) {
        tagged = t.taggedYield();
      }
      tag = tagged.get(index).tag();
    } else {
      tag = ((HasTag) label).tag();
    }
    if (!(label instanceof HasLemma)) {
      throw new IllegalArgumentException("Got a tree with labels which do not support lemma");
    }
    // lemma(word, tag, true): the boolean's exact meaning is not visible
    // here — presumably lowercasing; TODO confirm in Morphology javadoc.
    ((HasLemma) label).setLemma(morphology.lemma(label.value(), tag, true));
    ++index;
  }
  return t;
}
/**
 * Lemmatizes the given tokens. Only works on English, as it is hard
 * coded to use the Morphology class, which is English-only.
 *
 * @param tokens words to lemmatize
 * @return CoreLabels carrying word, tag, and lemma for each input token
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  // Obtain POS tags either from the standalone tagger or by parsing.
  List<TaggedWord> tagged;
  if (getOp().testOptions.preTag) {
    tagged = loadTagger().apply(tokens);
  } else {
    tagged = parse(tokens).taggedYield();
  }
  // Morphology is not thread-safe; use a fresh local instance per call.
  Morphology stemmer = new Morphology();
  List<CoreLabel> lemmas = Generics.newArrayList();
  for (TaggedWord taggedWord : tagged) {
    CoreLabel labeled = new CoreLabel();
    labeled.setWord(taggedWord.word());
    labeled.setTag(taggedWord.tag());
    stemmer.stem(labeled);
    lemmas.add(labeled);
  }
  return lemmas;
}
Morphology morpha = (outputLemmas) ? new Morphology() : null; for (List<X> sentence : document) { numWords += sentence.size();
/**
 * Wraps a MaxentTagger together with the tokenization and output
 * configuration pulled from the tagger's TaggerConfig.
 *
 * @param tagger the tagger to wrap; its config drives all settings
 */
protected TaggerWrapper(MaxentTagger tagger) {
  this.tagger = tagger;
  this.config = tagger.config;
  try {
    tokenizerFactory =
        chooseTokenizerFactory(config.getTokenize(),
                               config.getTokenizerFactory(),
                               config.getTokenizerOptions(),
                               config.getTokenizerInvertible());
  } catch (Exception e) {
    // Report through the class logger instead of printStackTrace() so the
    // failure (and its stack trace) lands in the normal log stream, then
    // fall back to a plain PTB word tokenizer so the wrapper stays usable.
    log.info("Error in tokenizer factory instantiation for class: "
        + config.getTokenizerFactory(), e);
    tokenizerFactory =
        PTBTokenizerFactory.newWordTokenizerFactory(config.getTokenizerOptions());
  }
  outputStyle = OutputStyle.fromShortName(config.getOutputFormat());
  outputVerbosity = config.getOutputVerbosity();
  outputLemmas = config.getOutputLemmas();
  // Morphology is not thread-safe; only allocate one when lemma output is on.
  morpha = outputLemmas ? new Morphology() : null;
  tokenize = config.getTokenize();
}
// Private constructor: allocates the Morphology this lemmatizer delegates to.
// NOTE(review): Morphology is not thread-safe, so a shared instance of this
// class presumably must not be used from multiple threads — confirm.
private StanfordLemmatizer() {
  this.analyzer = new Morphology();
}
// Creates a stemmer backed by a single Morphology instance.
// NOTE(review): Morphology is not thread-safe, so this stemmer presumably
// must not be shared across threads — confirm.
public WordStemmer() {
  morpha = new Morphology();
}
// Visitor hook: delegates to processTree with a per-visit Morphology.
@Override
public void visitTree(Tree t) {
  // A single Morphology is not threadsafe, so to make this class
  // threadsafe, we have to create a new Morphology for each visit
  processTree(t, null, new Morphology());
}
public void visitTree(Tree t) { // A single Morphology is not threadsafe, so to make this class // threadsafe, we have to create a new Morphology for each visit processTree(t, null, new Morphology()); }
@Override public void visitTree(Tree t) { // A single Morphology is not threadsafe, so to make this class // threadsafe, we have to create a new Morphology for each visit processTree(t, null, new Morphology()); }
// Example: lemmatize a single word given its POS tag.
String tag = "VBG";
String word = "painting";
// Morphology is not thread-safe; fine for this single-threaded snippet.
Morphology morphology = new Morphology();
String lemma = morphology.lemma(word, tag);  // presumably "paint" — TODO confirm
// Example: run a tokenize+ssplit pipeline, then lemmatize each token with
// an externally supplied POS tag. NOTE: the "..." below is a placeholder —
// this snippet does not compile until a real tag source is substituted.
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
String text = "painting";
// Morphology is not thread-safe; one local instance is reused serially here.
Morphology morphology = new Morphology();
Annotation document = pipeline.process(text);
List<edu.stanford.nlp.util.CoreMap> sentences = document.get(SentencesAnnotation.class);
for(edu.stanford.nlp.util.CoreMap sentence: sentences) {
  for(CoreLabel token: sentence.get(TokensAnnotation.class)) {
    String word = token.get(TextAnnotation.class);
    String tag = ... //get the tag for the current word from somewhere, e.g. an array
    String lemma = morphology.lemma(word, tag);
    System.out.println("lemmatized version :" + lemma);
  }
}
/**
 * Tags the words in the given sentence. If the sentence is made of
 * CoreLabels, they are tagged (and optionally lemmatized) in place and
 * the same list is returned; otherwise a new list of TaggedWords is built.
 *
 * @param sentence     words to tag; may be CoreLabels or any HasWord
 * @param morpha       Morphology used for lemmas; may be null, in which
 *                     case a fresh one is created when outputLemmas is true
 * @param outputLemmas whether to also set lemmas (CoreLabel path only)
 * @return the tagged sentence
 */
public List<? extends HasWord> tagCoreLabelsOrHasWords(List<? extends HasWord> sentence, Morphology morpha, boolean outputLemmas) {
  if (sentence.size() > 0 && sentence.get(0) instanceof CoreLabel) {
    List<CoreLabel> coreLabels = castCoreLabels(sentence);
    tagCoreLabels(coreLabels);
    if (outputLemmas) {
      // We may want to lemmatize things without using an existing
      // Morphology object, as Morphology objects are not
      // thread-safe, so we would make a new one here
      if (morpha == null) {
        morpha = new Morphology();
      }
      lemmatize(coreLabels, morpha);
    }
    return coreLabels;
  } else {
    List<TaggedWord> taggedSentence = tagSentence(sentence, false);
    return taggedSentence;
  }
}
/**
 * Tags the words in the given sentence. CoreLabel input is tagged in
 * place (and lemmatized when requested) and returned as-is; any other
 * HasWord input is tagged into a new TaggedWord list.
 *
 * @param sentence     words to tag
 * @param morpha       lemmatizer to use; null means create one on demand
 * @param outputLemmas whether to set lemmas on CoreLabel input
 * @return the tagged sentence
 */
public List<? extends HasWord> tagCoreLabelsOrHasWords(List<? extends HasWord> sentence, Morphology morpha, boolean outputLemmas) {
  if (sentence.size() > 0 && sentence.get(0) instanceof CoreLabel) {
    List<CoreLabel> coreLabels = castCoreLabels(sentence);
    tagCoreLabels(coreLabels);
    if (outputLemmas) {
      // We may want to lemmatize things without using an existing
      // Morphology object, as Morphology objects are not
      // thread-safe, so we would make a new one here
      if (morpha == null) {
        morpha = new Morphology();
      }
      lemmatize(coreLabels, morpha);
    }
    return coreLabels;
  } else {
    List<TaggedWord> taggedSentence = tagSentence(sentence, false);
    return taggedSentence;
  }
}
public static void main(String[] args) throws FileNotFoundException { String treeString = "(ROOT (S (NP (NNP John)) (VP (VBZ eats) (NP (NN pizza))) (. .)))"; Tree tree = Tree.valueOf(treeString); SemanticGraph graph = SemanticGraphFactory.generateUncollapsedDependencies(tree); //add lemmata Morphology morphology = new Morphology(); for (IndexedWord node : graph.vertexSet()) { String lemma = morphology.lemma(node.word(), node.tag()); node.setLemma(lemma); } System.err.println(graph); SemgrexPattern semgrex = SemgrexPattern.compile("{}=A <<dobj=reln {lemma:/eat/}=B"); SemgrexMatcher matcher = semgrex.matcher(graph); while (matcher.find()) { System.err.println(matcher.getNode("A") + " <<dobj " + matcher.getNode("B")); } }
/**
 * Builds a wrapper around the given tagger, pulling tokenizer, output
 * style, and lemma settings from the tagger's config.
 *
 * @param tagger the tagger to wrap
 */
protected TaggerWrapper(MaxentTagger tagger) {
  this.tagger = tagger;
  this.config = tagger.config;
  try {
    tokenizerFactory =
        chooseTokenizerFactory(config.getTokenize(),
                               config.getTokenizerFactory(),
                               config.getTokenizerOptions(),
                               config.getTokenizerInvertible());
  } catch (Exception e) {
    // Fall back to a plain PTB word tokenizer if the configured factory
    // cannot be instantiated.
    log.info("Error in tokenizer factory instantiation for class: " + config.getTokenizerFactory());
    e.printStackTrace();
    tokenizerFactory = PTBTokenizerFactory.newWordTokenizerFactory(config.getTokenizerOptions());
  }
  outputStyle = OutputStyle.fromShortName(config.getOutputFormat());
  outputVerbosity = config.getOutputVerbosity();
  outputLemmas = config.getOutputLemmas();
  // Morphology is not thread-safe; only created when lemma output is on.
  morpha = (outputLemmas) ? new Morphology() : null;
  tokenize = config.getTokenize();
  // tagSeparator = config.getTagSeparator();
}