/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Returns a tokenizer which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair, without tokenizing carriage
 * returns (i.e., treating them as white space).
 *
 * <p>For German (Negra) we used to only provide a
 * {@link edu.stanford.nlp.process.WhitespaceTokenizer}, but people didn't
 * much like that. So now we provide {@link PTBTokenizer}. It's not customized
 * to German, but will nevertheless do better than WhitespaceTokenizer at
 * tokenizing German!
 *
 * @return A PTB tokenizer factory (default options)
 */
@Override
public TokenizerFactory<Word> getTokenizerFactory() {
  return PTBTokenizer.factory();
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
public static void main(String[] args) throws Exception { if (args.length != 2) { log.info("usage: java TaggerDemo2 modelFile fileToTag"); return; } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(SentenceUtils.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false"); } else if (customTokenizer) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions")); } else if (printOriginalText) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); } else if (whitespaceTokenization) { List<String> whitespaceDelims = sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]); } else { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Constructs a new (empty) BasicDocument that tokenizes with a {@link PTBTokenizer}.
 * The document starts out empty; call one of the {@code init} methods to
 * populate it from a desired source.
 */
public BasicDocument() {
  this(PTBTokenizer.factory());
}
/**
 * Returns a factory for {@link PTBTokenizer} with default options.
 *
 * @return A PTB tokenizer factory
 */
@Override
public TokenizerFactory<Word> getTokenizerFactory() {
  return PTBTokenizer.factory();
}
/**
 * Constructs a tokenizer annotator backed by a {@link PTBTokenizer}.
 *
 * @param verbose Whether the annotator should be verbose (handled by the superclass)
 * @param options PTBTokenizer option string (e.g. "invertible=true"); passed
 *                straight to {@code PTBTokenizer.factory}
 */
public PTBTokenizerAnnotator(boolean verbose, String options) {
  super(verbose);
  factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
}
public MUCMentionExtractor(LexicalizedParser parser, Dictionaries dict, Properties props, Semantics semantics) throws Exception { super(dict, semantics); // setParser(parser); String fileName = props.getProperty(Constants.MUC_PROP); fileContents = IOUtils.slurpFile(fileName); currentOffset = 0; tokenizerFactory = PTBTokenizer.factory(false, new CoreLabelTokenFactory(false)); stanfordProcessor = loadStanfordProcessor(props); }
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
/**
 * Constructs a MUC mention extractor: reads the entire MUC corpus file named
 * by the {@code Constants.MUC_PROP} property into memory, and sets up a PTB
 * tokenizer plus the Stanford preprocessing pipeline.
 *
 * @param dict Dictionaries, passed through to the superclass
 * @param props Must contain the MUC corpus file path under {@code Constants.MUC_PROP}
 * @param semantics Semantics, passed through to the superclass
 * @throws Exception if the corpus file cannot be read or the processor fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  // Whole corpus is held in memory; tokenization proceeds from currentOffset.
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabelTokenFactory(false) with empty options — presumably disables
  // invertibility/offset tracking; confirm against CoreLabelTokenFactory javadoc.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
/**
 * Tokenizes and parses a sentence, then renders the parse tree as an image
 * written to {@code outFile}.
 *
 * <p>The original wrapped the parse in {@code catch (Exception e) { throw e; }},
 * a no-op catch-and-rethrow; it has been removed — exceptions propagate unchanged.
 *
 * @param sentence Sentence text to parse
 * @param outFile Path of the image file to write
 * @param lp Parser used to produce the tree
 * @throws Exception if parsing or image writing fails
 */
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parse = lp.apply(wordList);
  writeImage(parse, outFile);
}
/**
 * Tokenizes and parses a sentence with the given parser, extracts its typed
 * dependencies, and converts tree + dependencies into a Graph.
 *
 * @param sentence Sentence text to parse
 * @param lp Parser used to produce the tree
 * @return Graph built from the parse tree and its typed dependencies
 * @throws Exception if parsing or graph construction fails
 */
public static Graph getGraph(String sentence, LexicalizedParser lp) throws Exception {
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> tokens = tokFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parseTree = lp.apply(tokens);
  GrammaticalStructure structure = gsf.newGrammaticalStructure(parseTree);
  Collection<TypedDependency> dependencies = structure.typedDependencies();
  return getGraph(parseTree, dependencies);
}
/**
 * Convenience overload: loads the bundled English PCFG parser, then parses
 * the sentence and converts its typed dependencies into a Graph.
 *
 * <p>NOTE(review): the parser model is loaded on every call, which is
 * expensive; callers invoking this repeatedly should load the model once and
 * use the (String, LexicalizedParser) overload instead.
 *
 * @param sentence Sentence text to parse
 * @return Graph built from the parse tree and its typed dependencies
 * @throws Exception if model loading, parsing, or graph construction fails
 */
public static Graph getGraph(String sentence) throws Exception {
  LexicalizedParser englishParser =
      LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  englishParser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> tokens = tokFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parseTree = englishParser.apply(tokens);
  GrammaticalStructure structure = gsf.newGrammaticalStructure(parseTree);
  Collection<TypedDependency> dependencies = structure.typedDependencies();
  return getGraph(parseTree, dependencies);
}