edu.stanford.nlp.process.Morphology java code examples

} else if (args[0].equals("-stem")) {
 for (int i = 1; i < args.length; i++) {
  System.out.println(args[i] + " --> " + stemStatic(WordTag.valueOf(args[i], "_")));
   Morphology morph = new Morphology(new FileReader(arg), flags);
   for (Word next; (next = morph.next()) != null; ) {
    System.out.print(next);

/**
 * Sets the lemma on every labeled leaf of {@code t}, in place, and returns {@code t}.
 *
 * <p>The tag used for lemmatization is taken from the leaf label when it
 * implements {@code HasTag} and its tag is non-null; otherwise it is looked up
 * by leaf position in {@code t.taggedYield()}, which is computed at most once.
 * Leaves whose label is {@code null} are left untouched.
 *
 * @param t the tree whose leaf labels receive lemmas
 * @return the same tree instance that was passed in
 * @throws IllegalArgumentException if a non-null leaf label does not implement
 *     {@code HasLemma}
 */
@Override
public Tree transformTree(Tree t) {
 Morphology morphology = new Morphology();
 List<TaggedWord> tagged = null;
 int index = -1;
 for (Tree leaf : t.getLeaves()) {
  // Advance for EVERY leaf, including the null-label ones skipped below, so
  // that index stays the leaf's position when used against taggedYield().
  // NOTE(review): assumes taggedYield() has one entry per leaf, in leaf order - confirm.
  ++index;
  Label label = leaf.label();
  if (label == null) {
   continue;
  }
  String tag;
  if (!(label instanceof HasTag) || ((HasTag) label).tag() == null) {
   // The label carries no tag of its own: fall back to the tree's tagged yield.
   if (tagged == null) {
    tagged = t.taggedYield();
   }
   tag = tagged.get(index).tag();
  } else {
   tag = ((HasTag) label).tag();
  }
  if (!(label instanceof HasLemma)) {
   throw new IllegalArgumentException("Got a tree with labels which do not support lemma");
  }
  ((HasLemma) label).setLemma(morphology.lemma(label.value(), tag, true));
 }
 return t;
}

/**
 * Lemmatizes a {@code WordTag} or stems a {@code Word}; any other object is
 * returned unchanged.
 *
 * @param in the object to process
 * @return a new {@code WordTag} holding the lemma and the original tag when
 *     {@code in} is a {@code WordTag}; the result of {@code stem} when it is a
 *     {@code Word}; otherwise {@code in} itself
 */
@Override
public Object apply(Object in) {
 if (in instanceof WordTag) {
  final WordTag tagged = (WordTag) in;
  final String pos = tagged.tag();
  // lexer.option(1) is passed in the position of lemmatize's last argument.
  final String lemma = lemmatize(tagged.word(), pos, lexer, lexer.option(1));
  return new WordTag(lemma, pos);
 }
 return (in instanceof Word) ? stem((Word) in) : in;
}

/**
 * Computes the lemma of a word for a given part-of-speech tag, using the
 * shared static lexer.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @param lowercase If true, words other than proper nouns will be changed
 *     to all lowercase
 * @return The lemma for the word
 */
public static synchronized String lemmaStatic(String word, String tag,
                       boolean lowercase) {
 // The static lexer is initialized before it is handed to lemmatize.
 initStaticLexer();
 final String lemma = lemmatize(word, tag, staticLexer, lowercase);
 return lemma;
}

/**
 * Stores a lemma for {@code word} in {@code map} under the annotation key {@code ann}.
 *
 * <p>With an empty tag the value is {@code morpha.stem(word)}. Otherwise the
 * value is the result of {@code phrasalVerb} when that is non-null, and
 * {@code morpha.lemma(word, tag)} when it is null.
 *
 * @param morpha the Morphology used to compute the stem or lemma
 * @param ann the annotation key to set
 * @param map the map that receives the value
 * @param word the word to lemmatize
 * @param tag the part-of-speech tag; must not be null since {@code isEmpty()} is called on it
 */
private static void addLemma(Morphology morpha,
           Class<? extends CoreAnnotation<String>> ann,
           CoreMap map, String word, String tag) {
 if (tag.isEmpty()) {
  // No tag available: fall back to the tag-free stem.
  map.set(ann, morpha.stem(word));
  return;
 }
 final String phrasal = phrasalVerb(morpha, word, tag);
 map.set(ann, phrasal != null ? phrasal : morpha.lemma(word, tag));
}

/**
 * Lemmatizes a phrasal verb written as a verb and a particle joined by an
 * underscore.
 *
 * @param morpha the Morphology used to lemmatize the verb part
 * @param word the token to inspect
 * @param tag the part-of-speech tag of the token
 * @return the lemmatized verb, an underscore and the particle; or {@code null}
 *     when the tag does not start with "VB", the word has no underscore, the
 *     word does not split into exactly two parts, or the second part is not
 *     in {@code particles}
 */
private static String phrasalVerb(Morphology morpha, String word, String tag) {
 assert(word != null);
 assert(tag != null);
 // Only verbs containing an underscore are candidates.
 final boolean candidate = tag.startsWith("VB") && word.contains("_");
 if (!candidate) {
  return null;
 }
 final String[] parts = word.split("_");
 if (parts.length != 2) {
  return null;
 }
 // The second part must be a known particle.
 final String particle = parts[1];
 if (!particles.contains(particle)) {
  return null;
 }
 final String verbLemma = morpha.lemma(parts[0], tag);
 return verbLemma + '_' + particle;
}

/**
 * Processes the tree with a Morphology created for this call.
 *
 * @param t the tree to process
 */
@Override
public void visitTree(Tree t) {
 // A Morphology instance is not threadsafe; building a fresh one per visit
 // keeps this class threadsafe.
 final Morphology perVisitMorphology = new Morphology();
 processTree(t, null, perVisitMorphology);
}

/**
 * Stems the given CoreLabel, using {@code CoreAnnotations.LemmaAnnotation}
 * as the annotation passed to the two-argument {@code stem} overload.
 *
 * @param label the label to add the lemma annotation to
 */
public void stem(CoreLabel label) {
 this.stem(label, CoreAnnotations.LemmaAnnotation.class);
}

/**
 * Tags the tokens and returns one CoreLabel per tagged word, each stemmed
 * with a Morphology created for this call.
 *
 * <p>Tags come from the loaded tagger when {@code testOptions.preTag} is set,
 * and from the tagged yield of the parse tree otherwise. English only: the
 * Morphology class it relies on is English-only.
 *
 * @param tokens the tokens to lemmatize
 * @return a new list of CoreLabels with word and tag set, after stemming
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
 final List<TaggedWord> taggedWords;
 if (!getOp().testOptions.preTag) {
  // No pre-tagging requested: read the tags off the parse tree.
  taggedWords = parse(tokens).taggedYield();
 } else {
  Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
  taggedWords = tagger.apply(tokens);
 }

 final Morphology morphology = new Morphology();
 final List<CoreLabel> result = Generics.newArrayList();
 for (TaggedWord tw : taggedWords) {
  final CoreLabel coreLabel = new CoreLabel();
  coreLabel.setWord(tw.word());
  coreLabel.setTag(tw.tag());
  morphology.stem(coreLabel);
  result.add(coreLabel);
 }
 return result;
}

/**
 * Computes the lemma of a word for a given part-of-speech tag, delegating to
 * the three-argument {@code lemmaStatic} with lowercasing turned on.
 *
 * @param word The word to lemmatize
 * @param tag The part of speech to assume for the word
 * @return The lemma for the word
 */
public static synchronized String lemmaStatic(String word, String tag) {
 final boolean lowercase = true;
 return lemmaStatic(word, tag, lowercase);
}

/**
 * Stems a WordTag by passing its word and tag to
 * {@code stemStatic(String, String)}.
 *
 * @param wT the word/tag pair to stem
 * @return the WordTag produced by {@code stemStatic(String, String)}
 */
public static WordTag stemStatic(WordTag wT) {
 final String word = wT.word();
 final String tag = wT.tag();
 return stemStatic(word, tag);
}

/**
 * Stems every entry of the tree's tagged yield.
 *
 * @param t a tree
 * @param threadSafe if true, each entry is stemmed with
 *     {@code Morphology.stemStaticSynchronized}; otherwise with
 *     {@code Morphology.stemStatic}
 * @return the stemmed WordTags, in the order of {@code t.taggedYield()}
 */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t, boolean threadSafe) {
 final List<WordTag> result = Generics.newArrayList();
 for (TaggedWord tw : t.taggedYield()) {
  final String word = tw.word();
  final String tag = tw.tag();
  if (threadSafe) {
   result.add(Morphology.stemStaticSynchronized(word, tag));
  } else {
   result.add(Morphology.stemStatic(word, tag));
  }
 }
 return result;
}

/**
 * Lemmatizes a word for the given tag using this instance's lexer.
 *
 * @param word the word to lemmatize
 * @param tag the part of speech to assume for the word
 * @param lowercase forwarded unchanged to {@code lemmatize}
 * @return the lemma returned by {@code lemmatize}
 */
public String lemma(String word, String tag, boolean lowercase) {
 final String result = lemmatize(word, tag, lexer, lowercase);
 return result;
}

/**
 * Builds a {@code WordLemmaTag} whose lemma is the word of the result of
 * {@code Morphology.stemStaticSynchronized(word, tag)}.
 *
 * @param word the surface word
 * @param tag the part-of-speech tag
 * @return a new WordLemmaTag holding the word, its lemma and the tag
 */
public WordLemmaTag lemmatize(String word, String tag) {
 final WordTag stemmed = Morphology.stemStaticSynchronized(word, tag);
 return new WordLemmaTag(word, stemmed.word(), tag);
}

/**
 * Lemmatizes a {@code WordTag}, returning a {@code WordLemmaTag}.
 *
 * @param wT the word/tag pair to lemmatize
 * @return a new WordLemmaTag holding the word, the result of
 *     {@code lemma(word, tag)} and the tag
 */
public WordLemmaTag lemmatize(WordTag wT) {
 final String word = wT.word();
 final String tag = wT.tag();
 return new WordLemmaTag(word, lemma(word, tag), tag);
}

/**
 * Tags a sentence, choosing the path by the type of its first element.
 *
 * <p>When the sentence is non-empty and its first element is a CoreLabel, the
 * sentence is cast to CoreLabels, tagged in place, optionally lemmatized, and
 * returned. Otherwise the result of {@code tagSentence(sentence, false)} is
 * returned, and {@code outputLemmas} has no effect.
 *
 * @param sentence the sentence to tag
 * @param morpha the Morphology to lemmatize with; when null and lemmas are
 *     requested, a new Morphology is created for this call
 * @param outputLemmas whether to lemmatize the CoreLabels after tagging
 * @return the tagged CoreLabels, or the tagged sentence from {@code tagSentence}
 */
public List<? extends HasWord> tagCoreLabelsOrHasWords(List<? extends HasWord> sentence, Morphology morpha, boolean outputLemmas) {
 final boolean startsWithCoreLabel = !sentence.isEmpty() && sentence.get(0) instanceof CoreLabel;
 if (!startsWithCoreLabel) {
  return tagSentence(sentence, false);
 }

 final List<CoreLabel> coreLabels = castCoreLabels(sentence);
 tagCoreLabels(coreLabels);
 if (outputLemmas) {
  // Morphology objects are not thread-safe, so callers may pass null to get
  // a fresh one instead of sharing an existing instance.
  final Morphology lemmatizer = (morpha != null) ? morpha : new Morphology();
  lemmatize(coreLabels, lemmatizer);
 }
 return coreLabels;
}

/**
 * Returns a new WordTag whose word is the lemma of {@code word} and whose
 * tag is {@code tag}, computed with the shared static lexer.
 *
 * @param word the word to lemmatize
 * @param tag the part of speech to assume for the word
 * @return a new WordTag holding the lemma and the given tag
 */
public static synchronized WordTag stemStatic(String word, String tag) {
 initStaticLexer();
 // staticLexer.option(1) is passed in the position of lemmatize's last argument.
 final String lemma = lemmatize(word, tag, staticLexer, staticLexer.option(1));
 return new WordTag(lemma, tag);
}

/**
 * Calls {@code morpha.stem} on each CoreLabel of the sentence, in list order.
 * The input list must already have tags set.
 *
 * @param sentence the CoreLabels to add lemmas to
 * @param morpha the Morphology used to stem each label
 */
public static void lemmatize(List<CoreLabel> sentence,
               Morphology morpha) {
 for (CoreLabel token : sentence) {
  morpha.stem(token);
 }
}

/**
 * Create a new {@code WordLemmaTag} from a {@code String} whose parts are
 * separated by the divider character.
 *
 * <ul>
 *  <li>No divider: the whole string is passed to the single-argument
 *      {@code WordLemmaTag} constructor.</li>
 *  <li>Exactly one divider: the string is read as word and tag, and the
 *      lemma is computed with {@code Morphology.lemmaStatic}.</li>
 *  <li>Two or more dividers: the text before the first divider is the word,
 *      the text after the last divider is the tag, and everything in between
 *      is the lemma. No attempt is made to parse escape sequences.</li>
 * </ul>
 *
 * @param labelStr The string to split into word, lemma and tag
 * @return The new WordLemmaTag
 */
@Override
public Label newLabelFromString(String labelStr) {
 int first = labelStr.indexOf(divider);
 if (first < 0) {
  // This must be checked before comparing first and last positions: with no
  // divider both are -1, and substring(0, -1) would throw.
  return new WordLemmaTag(labelStr);
 }
 int second = labelStr.lastIndexOf(divider);
 String word = labelStr.substring(0, first);
 if (first == second) {
  // Single divider: only word and tag are present, so compute the lemma.
  String tag = labelStr.substring(first + 1);
  return new WordLemmaTag(word, Morphology.lemmaStatic(word, tag), tag);
 }
 return new WordLemmaTag(word, labelStr.substring(first + 1, second), labelStr.substring(second + 1));
}

/**
 * Create a new {@code WordLemmaTag}.
 *
 * @param word This word is set as the word of this Label
 * @param tag  The {@code value()} of this Label is set as the
 *             tag of this Label
 */
public WordLemmaTag(String word, String tag) {
 WordTag wT = new WordTag(word, tag);
 this.word = word;
 this.lemma = Morphology.stemStatic(wT).word();
 setTag(tag);
}

Javadoc

Morphology computes the base form of English words, by removing just inflections (not derivational morphology). That is, it only does noun plurals, pronoun case, and verb endings, and not things like comparative adjectives or derived nominals. It is based on a finite-state transducer implemented by John Carroll et al., written in flex and publicly available. See: http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html . There are several ways of invoking Morphology. One is by calling the static methods:

WordTag stemStatic(String word, String tag)
WordTag stemStatic(WordTag wordTag)

If we have already created a Morphology object, we can use the methods WordTag stem(String word, String tag) or WordTag stem(WordTag wordTag).

Another way of using Morphology is to run it on an input file by running java Morphology filename. In this case, POS tags MUST be separated from words by an underscore ("_").

Note that a single instance of Morphology is not thread-safe, as the underlying lexer object is not built to be re-entrant. One thing that you can do to get around this is build a new Morphology object for each thread or each set of calls to the Morphology. For example, the MorphaAnnotator builds a Morphology for each document it annotates. The other approach is to use the synchronized methods in this class. The crucial lexer-accessing portion of all the static methods is synchronized (otherwise, their use tended to be threading bugs waiting to happen). If you want less synchronization, create your own Morphology objects.

Most used methods

<init>
lemma
stem
initStaticLexer
lemmaStatic
Lemmatize the word, being sensitive to the tag.
lemmatize
Lemmatize the word, being sensitive to the tag, using the passed in lexer.
next
stemStatic
Return a new WordTag which has the lemma as the value of word(). The default is to lowercase non-proper-nouns, unless options have been set.
stemStaticSynchronized
lemmaStaticSynchronized

Popular in Java

Reactive rest calls using spring rest template
startActivity (Activity)
addToBackStack (FragmentTransaction)
getSupportFragmentManager (FragmentActivity)
HttpServer (com.sun.net.httpserver)
This class implements a simple HTTP server. A HttpServer is bound to an IP address and port number a
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Modifier (javassist)
The Modifier class provides static methods and constants to decode class and member access modifiers
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
Project (org.apache.tools.ant)
Central representation of an Ant project. This class defines an Ant project with all of its targets,
Top plugins for WebStorm

How to use Morphology in edu.stanford.nlp.process

Best Java code snippets using edu.stanford.nlp.process.Morphology (Showing top 20 results out of 315)

How to use
Morphology
in
edu.stanford.nlp.process