/** * demoDP demonstrates turning a file into tokens and then parse * trees. Note that the trees are printed by calling pennPrint on * the Tree object. It is also possible to pass a PrintWriter to * pennPrint if you want to capture the output. * This code will work with any supported language. */ public static void demoDP(LexicalizedParser lp, String filename) { // This option shows loading, sentence-segmenting and tokenizing // a file using DocumentPreprocessor. TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English GrammaticalStructureFactory gsf = null; if (tlp.supportsGrammaticalStructures()) { gsf = tlp.grammaticalStructureFactory(); } // You could also create a tokenizer here (as below) and pass it // to DocumentPreprocessor for (List<HasWord> sentence : new DocumentPreprocessor(filename)) { Tree parse = lp.apply(sentence); parse.pennPrint(); System.out.println(); if (gsf != null) { GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); } } }
/**
 * Returns the language pack's tokenizer factory, cast to produce {@code Word}
 * tokens.  The cast is unchecked because the factory's element type is erased.
 */
@SuppressWarnings("unchecked")
private static TokenizerFactory<Word> getTokenizerFactory() {
  TokenizerFactory<Word> wordFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
  return wordFactory;
}
/**
 * Sets up the punctuation filter (keep-all, or word/tag-based rejection
 * depending on the dependency style), picks the typed-dependency head
 * finder, and primes the iterator with the first item.
 */
public GsIterator() {
  if (keepPunct) {
    // Retain all tokens when punctuation is being kept.
    puncFilter = Filters.acceptFilter();
  } else {
    // Original (SD-style) dependencies filter by word; otherwise filter by tag.
    puncFilter = params.generateOriginalDependencies()
        ? params.treebankLanguagePack().punctuationWordRejectFilter()
        : params.treebankLanguagePack().punctuationTagRejectFilter();
  }
  hf = params.typedDependencyHeadFinder();
  primeGs();
}
/**
 * Remove things like hyphened functional tags and equals from the
 * end of a node label.
 *
 * @param label the raw node label
 * @return the label cleaned according to the nodeCleanup level:
 *         1 keeps category-and-function, 2 keeps only the basic
 *         category, anything else returns the label unchanged
 */
protected String cleanUpLabel(String label) {
  switch (nodeCleanup) {
    case 1:
      return tlp.categoryAndFunction(label);
    case 2:
      return tlp.basicCategory(label);
    default:
      return label;
  }
}
/** Make a TreePrint instance. This one uses the default tlp headFinder. */
public TreePrint(String formats, String options, TreebankLanguagePack tlp) {
  // Delegates to the full constructor, supplying the language pack's own
  // head finder and typed-dependency head finder as the defaults.
  this(formats, options, tlp, tlp.headFinder(), tlp.typedDependencyHeadFinder());
}
/** Prints a few aspects of the TreebankLanguagePack, just for debugging. */
public static void main(String[] args) {
  TreebankLanguagePack tlp = new TueBaDZLanguagePack();
  System.out.println("Start symbol: " + tlp.startSymbol());
  String start = tlp.startSymbol();
  System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
  // Sample labels exercising functional tags (-), indices (=), and heads (-HD).
  String[] samples = {"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"};
  for (String sample : samples) {
    System.out.println("String: " + sample + " basic: " + tlp.basicCategory(sample) + " basicAndFunc: " + tlp.categoryAndFunction(sample));
  }
}
// NOTE(review): fragment — the enclosing method is not visible in this chunk,
// so braces below are unbalanced; do not reformat without the full context.
s = tlpp.treebankLanguagePack().basicCategory(s); // strip functional annotation from the label
if (deletePunct) {
  // Drop preterminals whose basic tag is evalb-ignored punctuation.
  if (tree.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
    return null;
// presumably unwraps a unary ROOT/start-symbol node — confirm against the full method
if (tlpp.treebankLanguagePack().isStartSymbol(s) && tree.numChildren() == 1) {
/**
 * Builds a ParserAnnotator around an already-loaded parser grammar.
 *
 * @param parser  the parser to annotate with
 * @param verbose whether to log verbosely
 * @param maxSent maximum sentence length to parse
 * @param treeMap optional transformation applied to each parse tree
 */
public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent, Function<Tree, Tree> treeMap) {
  this.VERBOSE = verbose;
  this.BUILD_GRAPHS = parser.getTLPParams().supportsBasicDependencies();
  this.parser = parser;
  this.maxSentenceLength = maxSent;
  this.treeMap = treeMap;
  // Defaults: no parse-time limit, single best parse, punctuation kept.
  this.maxParseTime = 0;
  this.kBest = 1;
  this.keepPunct = true;
  if (this.BUILD_GRAPHS) {
    // Dependency graphs need a factory configured with this treebank's
    // punctuation filter and typed-dependency head finder.
    TreebankLanguagePack languagePack = parser.getTLPParams().treebankLanguagePack();
    this.gsf = languagePack.grammaticalStructureFactory(
        languagePack.punctuationWordRejectFilter(),
        parser.getTLPParams().typedDependencyHeadFinder());
  } else {
    this.gsf = null;
  }
  this.nThreads = 1;
  this.saveBinaryTrees = false;
  this.noSquash = false;
  this.extraDependencies = GrammaticalStructure.Extras.NONE;
}
// NOTE(review): fragment — opening of the enclosing method/if is cut off;
// braces are unbalanced. Sets up punctuation/dependency filters and the GSF.
"includePunctuationDependencies");
boolean generateOriginalDependencies = tlp.generateOriginalDependencies();
puncFilter = Filters.acceptFilter(); // keep everything in this branch
} else {
  // Reject dependents that are punctuation, by tag and by word respectively.
  dependencyFilter = new Dependencies.DependentPuncTagRejectFilter<>(tlp.punctuationTagRejectFilter());
  dependencyWordFilter = new Dependencies.DependentPuncWordRejectFilter<>(tlp.punctuationWordRejectFilter());
  // Original (SD-style) dependencies filter by word; otherwise by tag.
  puncFilter = generateOriginalDependencies ? tlp.punctuationWordRejectFilter() : tlp.punctuationTagRejectFilter();
(formats.containsKey("conll2007") && tlp.supportsGrammaticalStructures())) {
  gsf = tlp.grammaticalStructureFactory(puncFilter, typedDependencyHF);
} else {
  gsf = null;
// NOTE(review): fragment — pieces of a larger setup method; control flow is
// cut off on both sides and braces are unbalanced.
hf = tlp.headFinder();
TreePrint print = new TreePrint(format, options, tlp, (hf == null) ? tlp.headFinder(): hf, tlp.typedDependencyHeadFinder());
trf = tlp.treeReaderFactory();
} else {
  // Fallback: plain Penn Treebank reader with string labels, no normalization.
  trf = in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new TreeNormalizer());
i = trees.iterator();
} else {
  // presumably reading trees from stdin when no tree collection was given — confirm
  i = tlp.treeTokenizerFactory().getTokenizer(new BufferedReader(new InputStreamReader(System.in)));
// NOTE(review): fragment — parts of a transformTree method; braces unbalanced.
if (tlp.isStartSymbol(s)) return transformTree(tree.firstChild()); // unwrap the root symbol
return tf.newLeaf(tree.label());
s = tlp.basicCategory(s);
// When bit 0 of whOption is set, strip a leading "WH" (e.g. WHNP -> NP).
if (((whOption & 1) != 0) && s.startsWith("WH")) { s = s.substring(2);
(tlp.isEvalBIgnoredPunctuationTag(s) || tlp.isPunctuationWord(tree.firstChild().value()))) { return null;
// collapse unary X -> NP nodes onto the child
if (kids.length == 1 && tlp.basicCategory(kids[0].value()).equals("NP")) { return transformTree(kids[0]);
/**
 * Searches leftward from {@code headIdx} among the daughters for an
 * alternative head, skipping punctuation preterminals, "UH" words and
 * "INTJ" phrases (the latter two only if the original head was not itself
 * an interjection).
 *
 * @param headIdx            index of the originally chosen head daughter
 * @param daughterTrees      the daughters being searched
 * @param origWasInterjection whether the original head was an interjection
 * @return the index of the new head candidate, or -1 if none is suitable
 *         (either the left edge was reached, or a non-skippable daughter was
 *         found without any comma/colon separator in between)
 */
private int findPreviousHead(int headIdx, Tree[] daughterTrees, boolean origWasInterjection) {
  boolean seenSeparator = false;
  int newHeadIdx = headIdx;
  while (newHeadIdx >= 0) {
    newHeadIdx = newHeadIdx - 1;
    if (newHeadIdx < 0) {
      // Ran off the left edge: return -1.
      return newHeadIdx;
    }
    String label = tlp.basicCategory(daughterTrees[newHeadIdx].value());
    if (",".equals(label) || ":".equals(label)) {
      // Remember that a separator was crossed; separators themselves are skipped.
      seenSeparator = true;
    } else if (daughterTrees[newHeadIdx].isPreTerminal() && (tlp.isPunctuationTag(label) || ! origWasInterjection && "UH".equals(label)) || "INTJ".equals(label) && ! origWasInterjection) {
      // keep looping
    } else {
      // Found a real candidate; it only counts if a separator was seen first.
      if ( ! seenSeparator) {
        newHeadIdx = -1;
      }
      break;
    }
  }
  return newHeadIdx;
}
/** * Stores the passed-in TreebankLanguagePack and sets up charset encodings. * * @param tlp The treebank language pack to use */ protected AbstractTreebankParserParams(TreebankLanguagePack tlp) { this.tlp = tlp; inputEncoding = tlp.getEncoding(); outputEncoding = tlp.getEncoding(); generateOriginalDependencies = false; }
/**
 * Tags then dependency-parses plain text from {@code input}, writing one
 * typed dependency per line to {@code output}, with a blank line between
 * sentences.  Timing for the tagging and parsing phases is logged.
 */
private void parseTextFile(BufferedReader input, PrintWriter output) {
  DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
  preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
  preprocessor.setEscaper(config.escaper);
  preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
  preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());

  // Phase 1: POS-tag every sentence, timing the whole pass.
  Timing timer = new Timing();
  MaxentTagger tagger = new MaxentTagger(config.tagger);
  List<List<TaggedWord>> taggedSentences = new ArrayList<>();
  for (List<HasWord> sentence : preprocessor) {
    taggedSentences.add(tagger.tagSentence(sentence));
  }
  log.info(String.format("Tagging completed in %.2f sec.%n", timer.stop() / 1000.0));

  // Phase 2: parse each tagged sentence and emit its typed dependencies.
  timer.start();
  int sentenceCount = 0;
  for (List<TaggedWord> taggedSentence : taggedSentences) {
    GrammaticalStructure structure = predict(taggedSentence);
    for (TypedDependency dependency : structure.typedDependencies()) {
      output.println(dependency);
    }
    output.println();
    sentenceCount++;
  }
  long millis = timer.stop();
  double seconds = millis / 1000.0;
  log.info(String.format("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n", sentenceCount, seconds, sentenceCount / seconds));
}
// NOTE(review): fragment — surrounding method is not visible; the `if` below
// is missing its closing brace in this view.
pwOut.println("ComboParser best");
Tree ot = tree;
// Wrap the tree under the language pack's start symbol if it isn't already rooted there.
if (ot != null && ! op.tlpParams.treebankLanguagePack().isStartSymbol(ot.value())) {
  ot = ot.treeFactory().newTreeNode(op.tlpParams.treebankLanguagePack().startSymbol(), Collections.singletonList(ot));
goldTreeEval.percolateHeads(op.langpack().headFinder());
public String project(String tagStr) { // return tagStr; String ret = tlp.basicCategory(tagStr); // log.info("BCTP mapped " + tagStr + " to " + ret); return ret; }
// NOTE(review): fragment — surrounding method is not visible.
LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
TreebankLanguagePack tlp = lp.getOp().langpack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
// NOTE(review): this tokenizer's result appears unassigned here; presumably
// `toke` is bound on a line not shown in this chunk — confirm in the full file.
tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
List<? extends HasWord> sentence2 = toke.tokenize();
/**
 * Decides whether a daughter should be skipped during head finding:
 * punctuation preterminals always; "UH" preterminals and "INTJ" phrases
 * only when the original head was not itself an interjection.
 */
private boolean shouldSkip(Tree t, boolean origWasInterjection) {
  String value = t.value();
  // Punctuation, or an interjection word when the original head wasn't one.
  boolean skippablePreterminal = t.isPreTerminal()
      && (tlp.isPunctuationTag(value) || (!origWasInterjection && "UH".equals(value)));
  // An interjection phrase, again only if the original head wasn't one.
  boolean skippableInterjectionPhrase = "INTJ".equals(value) && !origWasInterjection;
  return skippablePreterminal || skippableInterjectionPhrase;
}
/** * Say whether this character is an annotation introducing * character. * * @param ch The character to check * @return Whether it is an annotation introducing character */ public boolean isLabelAnnotationIntroducingCharacter(char ch) { if (tlp.isLabelAnnotationIntroducingCharacter(ch)) { return true; } //for heads, there's one more char we want to check because we don't care about grammatical fns if (ch == '-') { return true; } return false; }
/**
 * Loads the Tregex/Tsurgeon correction rules: macros first, then the edit
 * script, which consists of one Tregex match pattern per group followed by
 * one or more Tsurgeon operation lines (grouping determined by continuing()).
 * Each (pattern, operations) pair is stored in {@code ops} for later use.
 */
public EnglishPTBTreebankCorrector() {
  // initialize the transformations to be done
  ops = new ArrayList<>();
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  TregexPatternCompiler tpc = new TregexPatternCompiler(tlp.headFinder(), tlp.getBasicCategoryFunction());
  // Macros must be registered with the compiler before the edit patterns use them.
  Macros.addAllMacros(tpc, getBufferedReader(macroStr));
  try {
    BufferedReader br = getBufferedReader(editStr);
    List<TsurgeonPattern> tsp = new ArrayList<>();
    // Outer loop reads a Tregex pattern line; the inner loop then consumes
    // its associated Tsurgeon operation lines.  Note `line` is shared and
    // re-assigned by both loop headers — the order here is load-bearing.
    for (String line; (line = br.readLine()) != null; ) {
      TregexPattern matchPattern = tpc.compile(line);
      tsp.clear();
      if (DEBUG) log.info("Pattern is " + line + " [" + matchPattern + ']');
      while (continuing(line = br.readLine())) {
        TsurgeonPattern p = Tsurgeon.parseOperation(line);
        if (DEBUG) log.info("Operation is " + line + " [" + p + ']');
        tsp.add(p);
      }
      if ( ! tsp.isEmpty()) {
        // Bundle all operations for this pattern into a single compound op.
        TsurgeonPattern tp = Tsurgeon.collectOperations(tsp);
        ops.add(new Pair<>(matchPattern, tp));
      }
    } // while not at end of file
  } catch (IOException ioe) {
    // Best-effort: a bad rules file leaves ops partially populated.
    log.warn(ioe);
  }
}