/**
 * Demo/debug entry point: reads trees from the file named by the
 * {@code -treeFile} property, then prints each tree before and after
 * running it through a {@link QPTreeTransformer}.
 *
 * @param args command-line flags parsed by {@code StringUtils.argsToProperties};
 *             only {@code -treeFile} is consulted
 */
public static void main(String[] args) {
  QPTreeTransformer transformer = new QPTreeTransformer();
  Treebank tb = new MemoryTreebank();
  Properties props = StringUtils.argsToProperties(args);
  String treeFileName = props.getProperty("treeFile");
  if (treeFileName != null) {
    // try-with-resources: the original leaked the TreeReader on both the
    // normal and the exception path.
    // NOTE(review): the reader uses the platform default charset, as the
    // original did — confirm whether UTF-8 should be specified.
    try (TreeReader tr = new PennTreeReader(
             new BufferedReader(new InputStreamReader(new FileInputStream(treeFileName))),
             new LabeledScoredTreeFactory())) {
      Tree t;
      while ((t = tr.readTree()) != null) {
        tb.add(t);
      }
    } catch (IOException e) {
      // Pass e as the cause instead of flattening it into the message,
      // so the original stack trace is preserved.
      throw new RuntimeException("File problem: " + treeFileName, e);
    }
  }
  for (Tree t : tb) {
    System.out.println("Original tree");
    t.pennPrint();
    System.out.println();
    System.out.println("Tree transformed");
    Tree tree = transformer.transformTree(t);
    tree.pennPrint();
    System.out.println();
    System.out.println("----------------------------");
  }
}
/** * Go through trees and determine their heads and print them. * Just for debugging. <br> * Usage: <code> * java edu.stanford.nlp.trees.international.spanish.SpanishHeadFinder treebankFilePath * </code> * * @param args The treebankFilePath */ public static void main(String[] args) { Treebank treebank = new DiskTreebank(); CategoryWordTag.suppressTerminalDetails = true; treebank.loadPath(args[0]); final HeadFinder chf = new SpanishHeadFinder(); treebank.apply(new TreeVisitor() { public void visitTree(Tree pt) { // pt.percolateHeads(chf); //pt.pennPrint(); Tree head = pt.headTerminal(chf); //System.out.println("======== " + head.label()); } }); }
/**
 * Builds a reader over the trees in {@code record.file}: loads the file
 * (or the configured range of it) into a disk treebank using the record's
 * reader factory and encoding, keeps the record's transformer, normalizer,
 * and filter, and advances to the first tree.
 */
public TreeTaggedFileReader(TaggedFileRecord record) {
  filename = record.file;
  // Fall back to a default reader factory when the record supplies none.
  if (record.trf == null) {
    trf = new LabeledScoredTreeReaderFactory();
  } else {
    trf = record.trf;
  }
  transformer = record.treeTransformer;
  normalizer = record.treeNormalizer;
  treeFilter = record.treeFilter;
  treebank = new DiskTreebank(trf, record.encoding);
  if (record.treeRange == null) {
    treebank.loadPath(filename);
  } else {
    treebank.loadPath(filename, record.treeRange);
  }
  treeIterator = treebank.iterator();
  findNext();
}
/**
 * Loads the training treebank from {@code treebankPath}, restricted to the
 * files accepted by {@code filt} when a filter is supplied.
 *
 * @param treebankPath directory (or file) to read trees from
 * @param op           parser options; supplies the disk-treebank factory
 * @param filt         optional file filter; may be {@code null} to load everything
 * @return the loaded treebank
 */
private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
  log.info("Training a parser from treebank dir: " + treebankPath);
  Treebank trainTreebank = op.tlpParams.diskTreebank();
  log.info("Reading trees...");
  if (filt != null) {
    trainTreebank.loadPath(treebankPath, filt);
  } else {
    trainTreebank.loadPath(treebankPath);
  }
  Timing.tick("done [read " + trainTreebank.size() + " trees].");
  return trainTreebank;
}
@Override public void build() { //Set specific options for this dataset if(options.containsKey(ConfigParser.paramSplit)) { System.err.printf("%s: Ignoring split parameter for this dataset type\n", this.getClass().getName()); } else if(options.containsKey(ConfigParser.paramTagDelim)) { wordTagDelim = options.getProperty(ConfigParser.paramTagDelim); taggedOutput = true; } for(File path : pathsToData) { int prevSize = treebank.size(); treebank.loadPath(path,treeFileExtension,false); toStringBuffer.append(String.format(" Loaded %d trees from %s\n", treebank.size() - prevSize, path.getPath())); prevSize = treebank.size(); } ArabicTreeDecimatedNormalizer tv = new ArabicTreeDecimatedNormalizer(outFileName,makeFlatFile,taggedOutput); treebank.apply(tv); outputFileList.addAll(tv.getFilenames()); tv.closeOutputFiles(); }
// NOTE(review): mid-method fragment of an evaluation routine. It loads the
// guess and gold treebanks from their files, prints each one's textual
// summary to pwOut, and sets up paired iterators plus line counters for a
// tree-by-tree comparison. The enclosing method is not visible here.
guessTreebank.loadPath(guessFile); pwOut.println("GUESS TREEBANK:"); pwOut.println(guessTreebank.textualSummary()); goldTreebank.loadPath(goldFile); pwOut.println("GOLD TREEBANK:"); pwOut.println(goldTreebank.textualSummary()); final Iterator<Tree> goldItr = goldTreebank.iterator(); final Iterator<Tree> guessItr = guessTreebank.iterator(); int goldLineId = 0; int guessLineId = 0;
// NOTE(review): garbled mid-method fragment — several `treebank.apply(tree -> {`
// lambdas open without closing braces, so this excerpt is syntactically
// incomplete as shown. It appears to load a treebank (optionally with a
// NumberRangesFileFilter), then filter/normalize trees by yield length
// between minLength and maxLength and write tagged output. Recover the
// full method from the original file before editing.
treebank.loadPath(args[i], new NumberRangesFileFilter(args[i+1], true)); } else if (i < args.length) { treebank.loadPath(args[i], suffix, true); } else { printUsage(); System.out.println(treebank.textualSummary()); treebank.apply(tree -> { int length = tree.yield().size(); if (length >= minLength && length <= maxLength) { treebank.apply(tree -> { int length = tree.yield().size(); if (length >= minLength && length <= maxLength) { treebank.apply(tree -> { Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory()); int length = tPrime.yield().size(); treebank.apply(tree -> { Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory()); pw.println(SentenceUtils.listToString(tPrime.taggedYield(), false, "_")); treebank.apply(tree -> { int length = tree.yield().size(); if (length >= minLength && length <= maxLength) { Writer w2 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-dev.txt"), encoding));
// NOTE(review): mid-method fragment. Loads the same path into a disk-backed
// and a memory-backed treebank, composes them, and builds two pipelines that
// should be equivalent: chained Treebank.transform calls vs. nested
// TransformingTreebank wrappers. References `myTransformer`, which is
// defined outside this excerpt.
Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in)); Treebank treebank2 = new MemoryTreebank(in -> new PennTreeReader(in)); treebank.loadPath(args[0]); treebank2.loadPath(args[0]); CompositeTreebank c = new CompositeTreebank(treebank, treebank2); Timing.endTime(); TreeTransformer myTransformer2 = new MyTreeTransformer2(); TreeTransformer myTransformer3 = new MyTreeTransformer3(); Treebank tf1 = c.transform(myTransformer).transform(myTransformer2).transform(myTransformer3); Treebank tf2 = new TransformingTreebank(new TransformingTreebank(new TransformingTreebank(c, myTransformer), myTransformer2), myTransformer3); TreeTransformer[] tta = { myTransformer, myTransformer2, myTransformer3 };
// NOTE(review): mid-method fragment of parser-training setup. Prints textual
// summaries of the primary (and optional secondary) training treebanks,
// derives post-split categories from a transformed whole treebank, then
// applies the training transformer to the train/secondary/tune treebanks.
// Several if-bodies are cut off in this excerpt; the brace structure is
// incomplete as shown.
PrintWriter pwErr = tlpParams.pw(System.err); pwErr.print("Training "); pwErr.println(trainTreebank.textualSummary(tlp)); if (secondaryTreebank != null) { pwErr.print("Secondary training "); pwErr.println(secondaryTreebank.textualSummary(tlp)); wholeTreebank = wholeTreebank.transform(myTransformer); op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp); if (op.testOptions.verbose) { trainTreebank = trainTreebank.transform(trainTransformer); if (secondaryTreebank != null) { secondaryTreebank = secondaryTreebank.transform(trainTransformer); tuneTreebank = tuneTreebank.transform(trainTransformer);
// NOTE(review): mid-method fragment. Loads and transforms the training
// treebank, logs its size, logs a k-best-parsing progress message, then
// loads the test treebank. The surrounding method (and the parsing step the
// middle log line refers to) is outside this excerpt.
treebank.loadPath(trainTreebankPath, trainTreebankFilter); treebank = treebank.transform(transformer); log.info("Read in " + treebank.size() + " trees from " + trainTreebankPath); log.info("Finished parsing " + treebank.size() + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each"); testTreebank.loadPath(testTreebankPath, testTreebankFilter); log.info("Read in " + testTreebank.size() + " trees for testing");
/**
 * Load trees from the given path specification, delegating the path and
 * filter to the underlying treebank.
 *
 * @param path file or directory to load from
 * @param filt a FileFilter selecting which files to load
 */
@Override
public void loadPath(File path, FileFilter filt) {
  treebank.loadPath(path, filt);
}
// NOTE(review): mid-method fragment of the SemanticGraph demo main. Depending
// on flags it either adds a hard-coded example parse tree, loads trees from
// -treeFile, or tokenizes and parses a sentence. The surrounding
// conditional structure is cut off at both ends of this excerpt.
log.info("Usage: java SemanticGraph [-sentFile file|-treeFile file] [-testGraph]"); Tree t = Tree.valueOf("(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN on) (NP (NP (NNP Andres) (NNP Pastrana) (POS 's)) (NN life)))) (VP (VBD was) (VP (VBN carried) (PP (IN out) (S (VP (VBG using) (NP (DT a) (JJ powerful) (NN bomb))))))) (. .)))"); tb.add(t); } else if (treeFileName != null) { tb.loadPath(treeFileName); } else { String[] options = {"-retainNPTmpSubcategories"}; List<Word> words = ptb.tokenize(); Tree parseTree = lp.parseTree(words); tb.add(parseTree);
private static void runTiming(Treebank treebank) { System.out.println(); Timing.startTime(); int num = 0; for (Tree t : treebank) { num += t.yield().size(); } Timing.endTime("traversing corpus, counting words with iterator"); log.info("There were " + num + " words in the treebank."); treebank.apply(new TreeVisitor() { int num; // = 0; @Override public void visitTree(final Tree t) { num += t.yield().size(); } }); log.info(); Timing.endTime("traversing corpus, counting words with TreeVisitor"); log.info("There were " + num + " words in the treebank."); log.info(); Timing.startTime(); log.info("This treebank contains " + treebank.size() + " trees."); Timing.endTime("size of corpus"); }
/**
 * Return the whole treebank as a series of big bracketed lists.
 * Calling this is a really bad idea if your treebank is large.
 */
@Override
public String toString() {
  final StringBuilder result = new StringBuilder();
  // Visit every tree, appending its bracketed form followed by a newline.
  apply(tree -> result.append(tree).append('\n'));
  return result.toString();
}
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.), computed without any tree filtering.
 *
 * @return A String with various statistics about the treebank (number of
 *     sentences, words, tag set, etc.)
 */
public String textualSummary() {
  // Delegate to the filtered variant with no filter applied.
  return textualSummary(null);
}
public static List<Tree> binarizeTreebank(Treebank treebank, Options op) { TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(op.tlpParams.headFinder(), op.tlpParams.treebankLanguagePack()); BasicCategoryTreeTransformer basicTransformer = new BasicCategoryTreeTransformer(op.langpack()); CompositeTreeTransformer transformer = new CompositeTreeTransformer(); transformer.addTransformer(binarizer); transformer.addTransformer(basicTransformer); treebank = treebank.transform(transformer); HeadFinder binaryHeadFinder = new BinaryHeadFinder(op.tlpParams.headFinder()); List<Tree> binarizedTrees = Generics.newArrayList(); for (Tree tree : treebank) { Trees.convertToCoreLabels(tree); tree.percolateHeadAnnotations(binaryHeadFinder); // Index from 1. Tools downstream expect index from 1, so for // uses internal to the srparser we have to renormalize the // indices, with the result that here we have to index from 1 tree.indexLeaves(1, true); binarizedTrees.add(tree); } return binarizedTrees; }
/**
 * Returns an iterator over the wrapped treebank that applies this
 * treebank's transformer to each tree as it is produced.
 */
@Override
public Iterator<Tree> iterator() {
  Iterator<Tree> base = tb.iterator();
  return new TransformingTreebankIterator(base, transformer);
}
/**
 * Fix all the English Penn Treebank errors, or at least some of them (!).
 * Each tree is run through the configured Tsurgeon patterns and the result
 * collected into a new in-memory treebank with the same reader factory
 * and encoding as the input.
 */
@Override
public MemoryTreebank transformTrees(Treebank tb) {
  MemoryTreebank fixed = new MemoryTreebank(tb.treeReaderFactory(), tb.encoding());
  for (Tree tree : tb) {
    fixed.add(Tsurgeon.processPatternsOnTree(ops, tree));
  }
  return fixed;
}