edu.stanford.nlp.process.PTBTokenizer.newPTBTokenizer java code examples

/**
 * Tokenizes a date string and stores the token strings in the {@code tokens} field.
 * <p>
 * Before tokenizing, every hyphen is padded with a space on each side and every
 * comma is replaced by a space. The prepared text is then run through a
 * PTBTokenizer and each resulting token's {@code toString()} value is collected.
 * <p>
 * A {@code null} input is reported on standard output and leaves {@code tokens}
 * as an empty list instead of failing with a NullPointerException.
 *
 * @param inputDate the raw date text; may be {@code null}
 */
private void tokenizeDate(String inputDate) {
 tokens = new ArrayList<>();
 if (inputDate == null) {
  System.out.println("Null input date");
  // Nothing to tokenize: stop here rather than dereferencing null below.
  return;
 }
 // Literal replacements; no regular expression is needed for single characters.
 String str = inputDate.replace("-", " - ").replace(',', ' ');
 PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(str));
 while (tokenizer.hasNext()) {
  Word nextToken = tokenizer.next();
  tokens.add(nextToken.toString());
 }
 if (DEBUG) {
  System.out.println("tokens:" + tokens);
 }
}

/**
 * Test program for demonstrating the Stemmer. It stems words and writes the
 * results, each followed by a single space, to standard output, ending with a
 * newline.
 * <p>
 * Two invocation forms are handled:
 * <ul>
 *  <li>{@code Stemmer -file file-name} reads the named file as UTF-8, tokenizes
 *      it with PTBTokenizer and stems every token;</li>
 *  <li>{@code Stemmer word word ...} stems each command-line argument directly.</li>
 * </ul>
 * Note that the word stemmed is expected to be in lower case: forcing lower
 * case must be done outside the Stemmer class.
 *
 * @param args either {@code -file file-name} or a list of words to stem
 * @throws IOException if the input file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
 Stemmer s = new Stemmer();
 if (args.length > 0 && args[0].equals("-file")) {
  if (args.length < 2) {
   // "-file" without a path: report usage instead of indexing past the array.
   System.err.println("Usage: Stemmer -file file-name | Stemmer word ...");
   return;
  }
  // try-with-resources closes the reader (and the underlying file stream) on every path.
  try (InputStreamReader reader = new InputStreamReader(
    new FileInputStream(args[1]), java.nio.charset.StandardCharsets.UTF_8)) {
   Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
   while (it.hasNext()) {
    Word token = it.next();
    System.out.print(s.stem(token.word()));
    System.out.print(' ');
   }
  }
 } else {
  for (String arg : args) {
   System.out.print(s.stem(arg));
   System.out.print(' ');
  }
 }
 System.out.println();
}

/**
 * Parses the pending line and returns the resulting tree.
 * <p>
 * The pending line is consumed (the {@code line} field is reset to
 * {@code null}) before tokenization. Depending on the {@code tokenized} flag
 * the text is split with a whitespace tokenizer or with PTBTokenizer.
 *
 * @return the tree produced by {@code lp} for the line's words, or a new empty
 *         {@code SimpleTree} when the line yields no words
 * @throws NoSuchElementException if there is no pending line
 */
@Override
public Tree next() {
 final String pending = line;
 if (pending == null) {
  throw new NoSuchElementException();
 }
 line = null;
 final Reader source = new StringReader(pending);

 final List<Word> sentence;
 if (!tokenized) {
  sentence = PTBTokenizer.newPTBTokenizer(source).tokenize();
 } else {
  sentence = WhitespaceTokenizer.newWordWhitespaceTokenizer(source).tokenize();
 }

 // The parser throws an exception if told to parse an empty sentence,
 // so hand back an empty tree without calling it.
 if (sentence.isEmpty()) {
  return new SimpleTree();
 }
 return lp.apply(sentence);
}

for  (String line; (line = reader.readLine()) != null; ) {
 System.out.println("Processing sentence: " + line);
 PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line));
 List<Word> words = ptb.tokenize();
 Tree parseTree = lp.parseTree(words);

// Build a tokenizer over the text held in `doc` that yields CoreLabel tokens,
// then collect every token into a list.
// NOTE(review): the meaning of the two boolean arguments (false, true) is not
// visible in this snippet - confirm against the three-argument
// newPTBTokenizer overload before changing them.
PTBTokenizer<CoreLabel> ptb = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(doc)), false, true);
List<CoreLabel> words = ptb.tokenize();

/**
 * Creates a PTBTokenizer that produces {@link Word} tokens and handles
 * carriage returns as ordinary whitespace.
 *
 * @param r the Reader supplying the text to tokenize
 * @return a PTBTokenizer over {@code r} whose tokens are of type {@link Word}
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
 // Delegates to the two-argument overload with its boolean option switched off.
 final boolean option = false;
 return newPTBTokenizer(r, option);
}

/**
 * Tokenizes a date string and stores the token strings in the {@code tokens} field.
 * <p>
 * Before tokenizing, every hyphen is padded with a space on each side and every
 * comma is replaced by a space. The prepared text is then run through a
 * PTBTokenizer and each resulting token's {@code toString()} value is collected.
 * <p>
 * A {@code null} input is reported on standard output and leaves {@code tokens}
 * as an empty list instead of failing with a NullPointerException.
 *
 * @param inputDate the raw date text; may be {@code null}
 */
private void tokenizeDate(String inputDate) {
 tokens = new ArrayList<String>();
 if (inputDate == null) {
  System.out.println("Null input date");
  // Nothing to tokenize: stop here rather than dereferencing null below.
  return;
 }
 // Literal replacements; no regular expression is needed for single characters.
 String str = inputDate.replace("-", " - ").replace(',', ' ');
 PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(str));
 while (tokenizer.hasNext()) {
  Word nextToken = tokenizer.next();
  tokens.add(nextToken.toString());
 }
 if (DEBUG) {
  System.out.println("tokens:" + tokens);
 }
}

/**
 * Tokenizes a date string and stores the token strings in the {@code tokens} field.
 * <p>
 * Before tokenizing, every hyphen is padded with a space on each side and every
 * comma is replaced by a space. The prepared text is then run through a
 * PTBTokenizer and each resulting token's {@code toString()} value is collected.
 * <p>
 * A {@code null} input is reported on standard output and leaves {@code tokens}
 * as an empty list instead of failing with a NullPointerException.
 *
 * @param inputDate the raw date text; may be {@code null}
 */
private void tokenizeDate(String inputDate) {
 tokens = new ArrayList<>();
 if (inputDate == null) {
  System.out.println("Null input date");
  // Nothing to tokenize: stop here rather than dereferencing null below.
  return;
 }
 // Literal replacements; no regular expression is needed for single characters.
 String str = inputDate.replace("-", " - ").replace(',', ' ');
 PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(str));
 while (tokenizer.hasNext()) {
  Word nextToken = tokenizer.next();
  tokens.add(nextToken.toString());
 }
 if (DEBUG) {
  System.out.println("tokens:" + tokens);
 }
}

/**
 * Splits a sentence into word strings using PTBTokenizer.
 *
 * @param sentence the text to tokenize
 * @return an array whose first element is {@code is2.io.CONLLReader09.ROOT},
 *         followed by the word form of each token in order
 */
@Override
public String[] tokenize(String sentence) {
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
  List<String> forms = new ArrayList<String>();
  while (ptb.hasNext()) {
    Word current = ptb.next();
    forms.add(current.word());
  }

  // Slot 0 is reserved for the artificial root; the words follow from slot 1.
  final int count = forms.size();
  String[] result = new String[count + 1];
  result[0] = is2.io.CONLLReader09.ROOT;
  for (int idx = 0; idx < count; idx++) {
    result[idx + 1] = forms.get(idx);
  }
  return result;
}

/**
 * Tokenizes a date string and stores the token strings in the {@code tokens} field.
 * <p>
 * Before tokenizing, every hyphen is padded with a space on each side and every
 * comma is replaced by a space. The prepared text is then run through a
 * PTBTokenizer and each resulting token's {@code toString()} value is collected.
 * <p>
 * A {@code null} input is reported on standard output and leaves {@code tokens}
 * as an empty list instead of failing with a NullPointerException.
 *
 * @param inputDate the raw date text; may be {@code null}
 */
private void tokenizeDate(String inputDate) {
 tokens = new ArrayList<String>();
 if (inputDate == null) {
  System.out.println("Null input date");
  // Nothing to tokenize: stop here rather than dereferencing null below.
  return;
 }
 // Literal replacements; no regular expression is needed for single characters.
 String str = inputDate.replace("-", " - ").replace(',', ' ');
 PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(str));
 while (tokenizer.hasNext()) {
  Word nextToken = tokenizer.next();
  tokens.add(nextToken.toString());
 }
 if (DEBUG) {
  System.out.println("tokens:" + tokens);
 }
}

/**
 * Splits a sentence into word strings using PTBTokenizer.
 *
 * @param sentence the text to tokenize
 * @return an array whose first element is {@code is2.io.CONLLReader09.ROOT},
 *         followed by the word form of each token in order
 */
@Override
public String[] tokenize(String sentence) {
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
  List<String> forms = new ArrayList<>();
  while (ptb.hasNext()) {
    forms.add(ptb.next().word());
  }

  // Slot 0 is reserved for the artificial root; the words follow from slot 1.
  final int count = forms.size();
  String[] result = new String[count + 1];
  result[0] = is2.io.CONLLReader09.ROOT;
  for (int idx = 0; idx < count; idx++) {
    result[idx + 1] = forms.get(idx);
  }
  return result;
}

/**
 * Parses the pending line and returns the resulting tree.
 * <p>
 * The pending line is consumed (the {@code line} field is reset to
 * {@code null}) before tokenization. Depending on the {@code tokenized} flag
 * the text is split with a whitespace tokenizer or with PTBTokenizer.
 *
 * @return the tree produced by {@code lp} for the line's words, or a new empty
 *         {@code SimpleTree} when the line yields no words
 * @throws NoSuchElementException if there is no pending line
 */
@Override
public Tree next() {
 final String pending = line;
 if (pending == null) {
  throw new NoSuchElementException();
 }
 line = null;
 final Reader source = new StringReader(pending);

 final List<Word> sentence;
 if (!tokenized) {
  sentence = PTBTokenizer.newPTBTokenizer(source).tokenize();
 } else {
  sentence = WhitespaceTokenizer.newWordWhitespaceTokenizer(source).tokenize();
 }

 // The parser throws an exception if told to parse an empty sentence,
 // so hand back an empty tree without calling it.
 if (sentence.isEmpty()) {
  return new SimpleTree();
 }
 return lp.apply(sentence);
}

/**
 * Parses the pending line and returns the resulting tree.
 * <p>
 * The pending line is consumed (the {@code line} field is reset to
 * {@code null}) before tokenization. Depending on the {@code tokenized} flag
 * the text is split with a whitespace tokenizer or with PTBTokenizer.
 *
 * @return the tree produced by {@code lp} for the line's words, or a new empty
 *         {@code SimpleTree} when the line yields no words
 * @throws NoSuchElementException if there is no pending line
 */
@Override
public Tree next() {
 final String pending = line;
 if (pending == null) {
  throw new NoSuchElementException();
 }
 line = null;
 final Reader source = new StringReader(pending);

 final List<Word> sentence;
 if (!tokenized) {
  sentence = PTBTokenizer.newPTBTokenizer(source).tokenize();
 } else {
  sentence = WhitespaceTokenizer.newWordWhitespaceTokenizer(source).tokenize();
 }

 // The parser throws an exception if told to parse an empty sentence,
 // so hand back an empty tree without calling it.
 if (sentence.isEmpty()) {
  return new SimpleTree();
 }
 return lp.apply(sentence);
}

for (String line; (line = reader.readLine()) != null;) {
 PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line));
 List<Word> words = ptb.tokenize();
 if (!words.isEmpty()) {

/**
 * Tokenizes a sentence with PTBTokenizer and records each token's position.
 * <p>
 * Every token's begin and end positions are shifted by the current value of
 * the {@code startpos} field. After tokenizing, {@code startpos} is advanced
 * by the sentence length plus one.
 *
 * @param sentence the text to tokenize
 * @return an array whose first element is a {@code StringInText} for
 *         {@code is2.io.CONLLReader09.ROOT} spanning 0 to 0, followed by one
 *         entry per token in order
 */
public StringInText[] tokenizeplus(String sentence) {
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
  List<StringInText> spans = new ArrayList<>();
  while (ptb.hasNext()) {
    Word current = ptb.next();
    int begin = current.beginPosition() + startpos;
    int end = current.endPosition() + startpos;
    spans.add(new StringInText(current.word(), begin, end));
  }

  // Slot 0 is reserved for the artificial root; the tokens follow from slot 1.
  final int count = spans.size();
  StringInText[] result = new StringInText[count + 1];
  result[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
  for (int idx = 0; idx < count; idx++) {
    result[idx + 1] = spans.get(idx);
  }

  startpos += (1 + sentence.length());
  return result;
}

/**
 * Parses the pending line and returns the resulting tree.
 * <p>
 * The pending line is consumed (the {@code line} field is reset to
 * {@code null}) before tokenization. Depending on the {@code tokenized} flag
 * the text is split with a whitespace tokenizer or with PTBTokenizer.
 *
 * @return the tree produced by {@code lp} for the line's words, or a new empty
 *         {@code SimpleTree} when the line yields no words
 * @throws NoSuchElementException if there is no pending line
 */
@Override
public Tree next() {
 final String pending = line;
 if (pending == null) {
  throw new NoSuchElementException();
 }
 line = null;
 final Reader source = new StringReader(pending);

 final List<Word> sentence;
 if (!tokenized) {
  sentence = PTBTokenizer.newPTBTokenizer(source).tokenize();
 } else {
  sentence = WhitespaceTokenizer.newWordWhitespaceTokenizer(source).tokenize();
 }

 // The parser throws an exception if told to parse an empty sentence,
 // so hand back an empty tree without calling it.
 if (sentence.isEmpty()) {
  return new SimpleTree();
 }
 return lp.apply(sentence);
}

while ((line = reader.readLine()) != null) {
 System.out.println("Processing sentence: " + line);
 PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line));
 List<Word> words = ptb.tokenize();
 lp.parse(words);

Javadoc

Constructs a new PTBTokenizer that returns Word tokens and which treats carriage returns as normal whitespace.

Popular methods of PTBTokenizer

<init>
Constructs a new PTBTokenizer that optionally returns carriage returns as their own token, and has a
hasNext
next
factory
tokenize
ptb2Text
Returns a presentable version of the given PTB-tokenized words. Pass in a List of Strings and this m
coreLabelFactory
ptbToken2Text
Returns a presentable version of a given PTB token. For instance, it transforms -LRB- into (.
tok
tokReader
untok
getNewlineToken
Returns the string literal inserted for newlines when the -tokenizeNLs option is set.

Popular in Java

Running tasks concurrently on multiple threads
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
notifyDataSetChanged (ArrayAdapter)
putExtra (Intent)
FileInputStream (java.io)
An input stream that reads bytes from a file. File file = ...finally if (in != null) in.clos
Locale (java.util)
Locale represents a language/country/variant combination. Locales are used to alter the presentatio
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
Logger (org.slf4j)
The org.slf4j.Logger interface is the main user entry point of SLF4J API. It is expected that loggin
Runner (org.openjdk.jmh.runner)
Best plugins for Eclipse

How to use the newPTBTokenizer method in edu.stanford.nlp.process.PTBTokenizer

Best Java code snippets using edu.stanford.nlp.process.PTBTokenizer.newPTBTokenizer (Showing top 17 results out of 315)

How to use
newPTBTokenizer
method
in
edu.stanford.nlp.process.PTBTokenizer