/** * demoDP demonstrates turning a file into tokens and then parse * trees. Note that the trees are printed by calling pennPrint on * the Tree object. It is also possible to pass a PrintWriter to * pennPrint if you want to capture the output. * This code will work with any supported language. */ public static void demoDP(LexicalizedParser lp, String filename) { // This option shows loading, sentence-segmenting and tokenizing // a file using DocumentPreprocessor. TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English GrammaticalStructureFactory gsf = null; if (tlp.supportsGrammaticalStructures()) { gsf = tlp.grammaticalStructureFactory(); } // You could also create a tokenizer here (as below) and pass it // to DocumentPreprocessor for (List<HasWord> sentence : new DocumentPreprocessor(filename)) { Tree parse = lp.apply(sentence); parse.pennPrint(); System.out.println(); if (gsf != null) { GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); } } }
/**
 * Returns the language pack's tokenizer factory, cast to produce {@code Word}
 * tokens.  The cast is unchecked because the factory's element type is erased.
 */
@SuppressWarnings("unchecked")
private static TokenizerFactory<Word> getTokenizerFactory() {
  TokenizerFactory<Word> wordFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
  return wordFactory;
}
/**
 * Sets up the punctuation filter (keep-all, or word/tag-based rejection
 * depending on the dependency style), picks the typed-dependency head
 * finder, and primes the iterator with the first item.
 */
public GsIterator() {
  if (keepPunct) {
    // Retain all tokens when punctuation is being kept.
    puncFilter = Filters.acceptFilter();
  } else {
    // Original (SD-style) dependencies filter by word; otherwise filter by tag.
    puncFilter = params.generateOriginalDependencies()
        ? params.treebankLanguagePack().punctuationWordRejectFilter()
        : params.treebankLanguagePack().punctuationTagRejectFilter();
  }
  hf = params.typedDependencyHeadFinder();
  primeGs();
}
/**
 * Remove things like hyphened functional tags and equals from the
 * end of a node label.
 *
 * @param label the raw node label
 * @return the label cleaned according to the nodeCleanup level:
 *         1 keeps category-and-function, 2 keeps only the basic
 *         category, anything else returns the label unchanged
 */
protected String cleanUpLabel(String label) {
  switch (nodeCleanup) {
    case 1:
      return tlp.categoryAndFunction(label);
    case 2:
      return tlp.basicCategory(label);
    default:
      return label;
  }
}
/** Make a TreePrint instance. This one uses the default tlp headFinder. */
public TreePrint(String formats, String options, TreebankLanguagePack tlp) {
  // Delegates to the full constructor, supplying the language pack's own
  // head finder and typed-dependency head finder as the defaults.
  this(formats, options, tlp, tlp.headFinder(), tlp.typedDependencyHeadFinder());
}
/** Prints a few aspects of the TreebankLanguagePack, just for debugging. */
public static void main(String[] args) {
  TreebankLanguagePack tlp = new TueBaDZLanguagePack();
  System.out.println("Start symbol: " + tlp.startSymbol());
  String start = tlp.startSymbol();
  System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
  // Sample labels exercising functional tags (-), indices (=), and heads (-HD).
  String[] samples = {"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"};
  for (String sample : samples) {
    System.out.println("String: " + sample + " basic: " + tlp.basicCategory(sample) + " basicAndFunc: " + tlp.categoryAndFunction(sample));
  }
}
// NOTE(review): fragment — the enclosing method is not visible in this chunk,
// so braces below are unbalanced; do not reformat without the full context.
s = tlpp.treebankLanguagePack().basicCategory(s); // strip functional annotation from the label
if (deletePunct) {
  // Drop preterminals whose basic tag is evalb-ignored punctuation.
  if (tree.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
    return null;
// presumably unwraps a unary ROOT/start-symbol node — confirm against the full method
if (tlpp.treebankLanguagePack().isStartSymbol(s) && tree.numChildren() == 1) {
/**
 * Builds a ParserAnnotator around an already-loaded parser grammar.
 *
 * @param parser  the parser to annotate with
 * @param verbose whether to log verbosely
 * @param maxSent maximum sentence length to parse
 * @param treeMap optional transformation applied to each parse tree
 */
public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent, Function<Tree, Tree> treeMap) {
  this.VERBOSE = verbose;
  this.BUILD_GRAPHS = parser.getTLPParams().supportsBasicDependencies();
  this.parser = parser;
  this.maxSentenceLength = maxSent;
  this.treeMap = treeMap;
  // Defaults: no parse-time limit, single best parse, punctuation kept.
  this.maxParseTime = 0;
  this.kBest = 1;
  this.keepPunct = true;
  if (this.BUILD_GRAPHS) {
    // Dependency graphs need a factory configured with this treebank's
    // punctuation filter and typed-dependency head finder.
    TreebankLanguagePack languagePack = parser.getTLPParams().treebankLanguagePack();
    this.gsf = languagePack.grammaticalStructureFactory(
        languagePack.punctuationWordRejectFilter(),
        parser.getTLPParams().typedDependencyHeadFinder());
  } else {
    this.gsf = null;
  }
  this.nThreads = 1;
  this.saveBinaryTrees = false;
  this.noSquash = false;
  this.extraDependencies = GrammaticalStructure.Extras.NONE;
}
// NOTE(review): fragment — opening of the enclosing method/if is cut off;
// braces are unbalanced. Sets up punctuation/dependency filters and the GSF.
"includePunctuationDependencies");
boolean generateOriginalDependencies = tlp.generateOriginalDependencies();
puncFilter = Filters.acceptFilter(); // keep everything in this branch
} else {
  // Reject dependents that are punctuation, by tag and by word respectively.
  dependencyFilter = new Dependencies.DependentPuncTagRejectFilter<>(tlp.punctuationTagRejectFilter());
  dependencyWordFilter = new Dependencies.DependentPuncWordRejectFilter<>(tlp.punctuationWordRejectFilter());
  // Original (SD-style) dependencies filter by word; otherwise by tag.
  puncFilter = generateOriginalDependencies ? tlp.punctuationWordRejectFilter() : tlp.punctuationTagRejectFilter();
(formats.containsKey("conll2007") && tlp.supportsGrammaticalStructures())) {
  gsf = tlp.grammaticalStructureFactory(puncFilter, typedDependencyHF);
} else {
  gsf = null;
// NOTE(review): fragment — pieces of a larger setup method; control flow is
// cut off on both sides and braces are unbalanced.
hf = tlp.headFinder();
TreePrint print = new TreePrint(format, options, tlp, (hf == null) ? tlp.headFinder(): hf, tlp.typedDependencyHeadFinder());
trf = tlp.treeReaderFactory();
} else {
  // Fallback: plain Penn Treebank reader with string labels, no normalization.
  trf = in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new TreeNormalizer());
i = trees.iterator();
} else {
  // presumably reading trees from stdin when no tree collection was given — confirm
  i = tlp.treeTokenizerFactory().getTokenizer(new BufferedReader(new InputStreamReader(System.in)));
// NOTE(review): fragment — parts of a transformTree method; braces unbalanced.
if (tlp.isStartSymbol(s)) return transformTree(tree.firstChild()); // unwrap the root symbol
return tf.newLeaf(tree.label());
s = tlp.basicCategory(s);
// When bit 0 of whOption is set, strip a leading "WH" (e.g. WHNP -> NP).
if (((whOption & 1) != 0) && s.startsWith("WH")) { s = s.substring(2);
(tlp.isEvalBIgnoredPunctuationTag(s) || tlp.isPunctuationWord(tree.firstChild().value()))) { return null;
// collapse unary X -> NP nodes onto the child
if (kids.length == 1 && tlp.basicCategory(kids[0].value()).equals("NP")) { return transformTree(kids[0]);
/**
 * Searches leftward from {@code headIdx} among the daughters for an
 * alternative head, skipping punctuation preterminals, "UH" words and
 * "INTJ" phrases (the latter two only if the original head was not itself
 * an interjection).
 *
 * @param headIdx            index of the originally chosen head daughter
 * @param daughterTrees      the daughters being searched
 * @param origWasInterjection whether the original head was an interjection
 * @return the index of the new head candidate, or -1 if none is suitable
 *         (either the left edge was reached, or a non-skippable daughter was
 *         found without any comma/colon separator in between)
 */
private int findPreviousHead(int headIdx, Tree[] daughterTrees, boolean origWasInterjection) {
  boolean seenSeparator = false;
  int newHeadIdx = headIdx;
  while (newHeadIdx >= 0) {
    newHeadIdx = newHeadIdx - 1;
    if (newHeadIdx < 0) {
      // Ran off the left edge: return -1.
      return newHeadIdx;
    }
    String label = tlp.basicCategory(daughterTrees[newHeadIdx].value());
    if (",".equals(label) || ":".equals(label)) {
      // Remember that a separator was crossed; separators themselves are skipped.
      seenSeparator = true;
    } else if (daughterTrees[newHeadIdx].isPreTerminal() && (tlp.isPunctuationTag(label) || ! origWasInterjection && "UH".equals(label)) || "INTJ".equals(label) && ! origWasInterjection) {
      // keep looping
    } else {
      // Found a real candidate; it only counts if a separator was seen first.
      if ( ! seenSeparator) {
        newHeadIdx = -1;
      }
      break;
    }
  }
  return newHeadIdx;
}
/** * Stores the passed-in TreebankLanguagePack and sets up charset encodings. * * @param tlp The treebank language pack to use */ protected AbstractTreebankParserParams(TreebankLanguagePack tlp) { this.tlp = tlp; inputEncoding = tlp.getEncoding(); outputEncoding = tlp.getEncoding(); generateOriginalDependencies = false; }
/**
 * Tags then dependency-parses plain text from {@code input}, writing one
 * typed dependency per line to {@code output}, with a blank line between
 * sentences.  Timing for the tagging and parsing phases is logged.
 */
private void parseTextFile(BufferedReader input, PrintWriter output) {
  DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
  preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
  preprocessor.setEscaper(config.escaper);
  preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
  preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());

  // Phase 1: POS-tag every sentence, timing the whole pass.
  Timing timer = new Timing();
  MaxentTagger tagger = new MaxentTagger(config.tagger);
  List<List<TaggedWord>> taggedSentences = new ArrayList<>();
  for (List<HasWord> sentence : preprocessor) {
    taggedSentences.add(tagger.tagSentence(sentence));
  }
  log.info(String.format("Tagging completed in %.2f sec.%n", timer.stop() / 1000.0));

  // Phase 2: parse each tagged sentence and emit its typed dependencies.
  timer.start();
  int sentenceCount = 0;
  for (List<TaggedWord> taggedSentence : taggedSentences) {
    GrammaticalStructure structure = predict(taggedSentence);
    for (TypedDependency dependency : structure.typedDependencies()) {
      output.println(dependency);
    }
    output.println();
    sentenceCount++;
  }
  long millis = timer.stop();
  double seconds = millis / 1000.0;
  log.info(String.format("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n", sentenceCount, seconds, sentenceCount / seconds));
}
// NOTE(review): fragment — surrounding method is not visible; the `if` below
// is missing its closing brace in this view.
pwOut.println("ComboParser best");
Tree ot = tree;
// Wrap the tree under the language pack's start symbol if it isn't already rooted there.
if (ot != null && ! op.tlpParams.treebankLanguagePack().isStartSymbol(ot.value())) {
  ot = ot.treeFactory().newTreeNode(op.tlpParams.treebankLanguagePack().startSymbol(), Collections.singletonList(ot));
goldTreeEval.percolateHeads(op.langpack().headFinder());
public String project(String tagStr) { // return tagStr; String ret = tlp.basicCategory(tagStr); // log.info("BCTP mapped " + tagStr + " to " + ret); return ret; }
// NOTE(review): fragment — surrounding method is not visible.
LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
TreebankLanguagePack tlp = lp.getOp().langpack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
// NOTE(review): this tokenizer's result appears unassigned here; presumably
// `toke` is bound on a line not shown in this chunk — confirm in the full file.
tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
List<? extends HasWord> sentence2 = toke.tokenize();
/**
 * Decides whether a daughter should be skipped during head finding:
 * punctuation preterminals always; "UH" preterminals and "INTJ" phrases
 * only when the original head was not itself an interjection.
 */
private boolean shouldSkip(Tree t, boolean origWasInterjection) {
  String value = t.value();
  // Punctuation, or an interjection word when the original head wasn't one.
  boolean skippablePreterminal = t.isPreTerminal()
      && (tlp.isPunctuationTag(value) || (!origWasInterjection && "UH".equals(value)));
  // An interjection phrase, again only if the original head wasn't one.
  boolean skippableInterjectionPhrase = "INTJ".equals(value) && !origWasInterjection;
  return skippablePreterminal || skippableInterjectionPhrase;
}
/** * Say whether this character is an annotation introducing * character. * * @param ch The character to check * @return Whether it is an annotation introducing character */ public boolean isLabelAnnotationIntroducingCharacter(char ch) { if (tlp.isLabelAnnotationIntroducingCharacter(ch)) { return true; } //for heads, there's one more char we want to check because we don't care about grammatical fns if (ch == '-') { return true; } return false; }
/**
 * Loads the Tregex/Tsurgeon correction rules: macros first, then the edit
 * script, which consists of one Tregex match pattern per group followed by
 * one or more Tsurgeon operation lines (grouping determined by continuing()).
 * Each (pattern, operations) pair is stored in {@code ops} for later use.
 */
public EnglishPTBTreebankCorrector() {
  // initialize the transformations to be done
  ops = new ArrayList<>();
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  TregexPatternCompiler tpc = new TregexPatternCompiler(tlp.headFinder(), tlp.getBasicCategoryFunction());
  // Macros must be registered with the compiler before the edit patterns use them.
  Macros.addAllMacros(tpc, getBufferedReader(macroStr));
  try {
    BufferedReader br = getBufferedReader(editStr);
    List<TsurgeonPattern> tsp = new ArrayList<>();
    // Outer loop reads a Tregex pattern line; the inner loop then consumes
    // its associated Tsurgeon operation lines.  Note `line` is shared and
    // re-assigned by both loop headers — the order here is load-bearing.
    for (String line; (line = br.readLine()) != null; ) {
      TregexPattern matchPattern = tpc.compile(line);
      tsp.clear();
      if (DEBUG) log.info("Pattern is " + line + " [" + matchPattern + ']');
      while (continuing(line = br.readLine())) {
        TsurgeonPattern p = Tsurgeon.parseOperation(line);
        if (DEBUG) log.info("Operation is " + line + " [" + p + ']');
        tsp.add(p);
      }
      if ( ! tsp.isEmpty()) {
        // Bundle all operations for this pattern into a single compound op.
        TsurgeonPattern tp = Tsurgeon.collectOperations(tsp);
        ops.add(new Pair<>(matchPattern, tp));
      }
    } // while not at end of file
  } catch (IOException ioe) {
    // Best-effort: a bad rules file leaves ops partially populated.
    log.warn(ioe);
  }
}