/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Returns a tokenizer which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair, without tokenizing carriage
 * returns (i.e., treating them as white space).
 *
 * <p>For German (Negra) we used to only provide a
 * {@link edu.stanford.nlp.process.WhitespaceTokenizer}, but people didn't
 * much like that. So now we provide {@link PTBTokenizer}. It's not customized
 * to German, but will nevertheless do better than WhitespaceTokenizer at
 * tokenizing German!
 *
 * @return A PTB tokenizer factory (default options)
 */
@Override
public TokenizerFactory<Word> getTokenizerFactory() {
  return PTBTokenizer.factory();
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
public static void main(String[] args) throws Exception { if (args.length != 2) { log.info("usage: java TaggerDemo2 modelFile fileToTag"); return; } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(SentenceUtils.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false"); } else if (customTokenizer) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions")); } else if (printOriginalText) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); } else if (whitespaceTokenization) { List<String> whitespaceDelims = sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]); } else { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Returns a factory for {@link PTBTokenizer} with default options.
 *
 * @return A PTB tokenizer factory
 */
@Override
public TokenizerFactory<Word> getTokenizerFactory() {
  return PTBTokenizer.factory();
}
/**
 * Constructs a tokenizer annotator backed by a {@link PTBTokenizer}.
 *
 * @param verbose Whether the annotator should be verbose (handled by the superclass)
 * @param options PTBTokenizer option string (e.g. "invertible=true"); passed
 *                straight to {@code PTBTokenizer.factory}
 */
public PTBTokenizerAnnotator(boolean verbose, String options) {
  super(verbose);
  factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
}
public MUCMentionExtractor(LexicalizedParser parser, Dictionaries dict, Properties props, Semantics semantics) throws Exception { super(dict, semantics); // setParser(parser); String fileName = props.getProperty(Constants.MUC_PROP); fileContents = IOUtils.slurpFile(fileName); currentOffset = 0; tokenizerFactory = PTBTokenizer.factory(false, new CoreLabelTokenFactory(false)); stanfordProcessor = loadStanfordProcessor(props); }
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
/**
 * Tokenizes and parses a sentence, then renders the parse tree as an image
 * written to {@code outFile}.
 *
 * <p>The original wrapped the parse in {@code catch (Exception e) { throw e; }},
 * a no-op catch-and-rethrow; it has been removed — exceptions propagate unchanged.
 *
 * @param sentence Sentence text to parse
 * @param outFile Path of the image file to write
 * @param lp Parser used to produce the tree
 * @throws Exception if parsing or image writing fails
 */
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parse = lp.apply(wordList);
  writeImage(parse, outFile);
}
/**
 * Tokenizes and parses a sentence with the given parser, extracts its typed
 * dependencies, and converts tree + dependencies into a Graph.
 *
 * @param sentence Sentence text to parse
 * @param lp Parser used to produce the tree
 * @return Graph built from the parse tree and its typed dependencies
 * @throws Exception if parsing or graph construction fails
 */
public static Graph getGraph(String sentence, LexicalizedParser lp) throws Exception {
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> tokens = tokFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parseTree = lp.apply(tokens);
  GrammaticalStructure structure = gsf.newGrammaticalStructure(parseTree);
  Collection<TypedDependency> dependencies = structure.typedDependencies();
  return getGraph(parseTree, dependencies);
}
/**
 * Convenience overload: loads the bundled English PCFG parser, then parses
 * the sentence and converts its typed dependencies into a Graph.
 *
 * <p>NOTE(review): the parser model is loaded on every call, which is
 * expensive; callers invoking this repeatedly should load the model once and
 * use the (String, LexicalizedParser) overload instead.
 *
 * @param sentence Sentence text to parse
 * @return Graph built from the parse tree and its typed dependencies
 * @throws Exception if model loading, parsing, or graph construction fails
 */
public static Graph getGraph(String sentence) throws Exception {
  LexicalizedParser englishParser =
      LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  englishParser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> tokens = tokFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parseTree = englishParser.apply(tokens);
  GrammaticalStructure structure = gsf.newGrammaticalStructure(parseTree);
  Collection<TypedDependency> dependencies = structure.typedDependencies();
  return getGraph(parseTree, dependencies);
}