/**
 * Returns true if the tokenizer produces no tokens for the given character,
 * i.e. the character is deleted (discarded) by the tokenizer.
 */
private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
  List<CoreLabel> tokens = tf.getTokenizer(new StringReader(Character.toString(ch))).tokenize();
  return tokens.isEmpty();
}
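As a sanity check, a minimal standalone driver for the helper above could look like the sketch below. It assumes CoreNLP is on the classpath and uses PTBTokenizer.coreLabelFactory() as the factory; the class name is hypothetical, and which characters actually get deleted depends on the tokenizer's options.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class DeletedCharDemo {

  // same logic as the helper above
  private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(Character.toString(ch))).tokenize();
    return tokens.isEmpty();
  }

  public static void main(String[] args) {
    TokenizerFactory<CoreLabel> tf = PTBTokenizer.coreLabelFactory();
    // 'a' should survive tokenization; a NUL control character is likely discarded,
    // though the exact set of deleted characters depends on the tokenizer options.
    System.out.println("'a' deleted? " + isDeletedCharacter('a', tf));
    System.out.println("U+0000 deleted? " + isDeletedCharacter('\u0000', tf));
  }
}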
Tree t = null;
while (tokenizer.hasNext() && t == null) {
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. The default implementation uses the current tokenizer and
 * tokenizes the entirety of the text into words. Subclasses should override
 * this method to parse documents in non-standard formats, and/or to pull the
 * title of the document from the text. The given text may be empty ("") but
 * will never be null. Subclasses may want to do additional processing and
 * then just call super.parse.
 *
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  Tokenizer<Word> toke = tokenizerFactory.getTokenizer(new StringReader(text));
  addAll(toke.tokenize());
}
/**
 * Returns the tokens of the given string, using the PTB tokenizer.
 *
 * @param str String to tokenize
 * @return Array of tokens
 */
private String[] ptbTokenize(String str) {
  // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers
  if (ptbFactory == null) {
    ptbFactory = PTBTokenizer.factory();
  }
  Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str));
  List<Word> words = tokenizer.tokenize();
  String[] res = new String[words.size()];
  for (int i = 0, sz = words.size(); i < sz; i++) {
    res[i] = words.get(i).word();
  }
  return res;
}
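For reference, the same PTB-factory pattern can be exercised standalone, roughly as follows. This is a sketch assuming CoreNLP on the classpath; the class name and sample sentence are illustrative only.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class PtbTokenizeDemo {
  public static void main(String[] args) {
    TokenizerFactory<Word> factory = PTBTokenizer.factory();
    Tokenizer<Word> tokenizer =
        factory.getTokenizer(new StringReader("Dr. Smith bought 300 shares of Acme Corp. in 2006."));
    List<Word> words = tokenizer.tokenize();
    // print one token per line, mirroring what ptbTokenize() packs into a String[]
    for (Word w : words) {
      System.out.println(w.word());
    }
  }
}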
@Override
protected HasWord getNext() {
  while (wordIter == null || ! wordIter.hasNext()) {
    if ( ! tok.hasNext()) {
      return null;
    }
    CoreLabel token = tok.next();
    String s = token.word();
    if (s == null) {
      return null;
    }
    if (s.equals(WhitespaceLexer.NEWLINE)) {
      // if newlines were significant, we should make sure to return
      // them when we see them
      List<HasWord> se = Collections.<HasWord>singletonList(token);
      wordIter = se.iterator();
    } else {
      List<HasWord> se = wordSegmenter.segment(s);
      wordIter = se.iterator();
    }
  }
  return wordIter.next();
}
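The control flow above is a lazy flatten: pull one raw token at a time, segment it, and hand out the segments one by one. The toy class below illustrates the same pattern with plain JDK types only; the whitespace "tokenizer" and hyphen "segmenter" are stand-ins, not the CoreNLP classes used above.

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

public class LazySegmentingIterator implements Iterator<String> {

  private final Iterator<String> rawTokens;
  private Iterator<String> segmentIter = Collections.emptyIterator();

  public LazySegmentingIterator(String text) {
    // stand-in "tokenizer": split the input on whitespace
    this.rawTokens = Arrays.asList(text.split("\\s+")).iterator();
  }

  @Override
  public boolean hasNext() {
    // refill the segment iterator lazily, only when the previous token is exhausted
    while (!segmentIter.hasNext()) {
      if (!rawTokens.hasNext()) {
        return false;
      }
      // stand-in "segmenter": split the next raw token on hyphens
      List<String> segments = Arrays.asList(rawTokens.next().split("-"));
      segmentIter = segments.iterator();
    }
    return true;
  }

  @Override
  public String next() {
    return segmentIter.next();
  }

  public static void main(String[] args) {
    Iterator<String> it = new LazySegmentingIterator("state-of-the-art word-segmenter demo");
    while (it.hasNext()) {
      System.out.println(it.next());
    }
  }
}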
/**
 * Tokenize the text using the parser's tokenizer.
 */
public List<? extends HasWord> tokenize(String sentence) {
  TokenizerFactory<? extends HasWord> tf = treebankLanguagePack().getTokenizerFactory();
  Tokenizer<? extends HasWord> tokenizer = tf.getTokenizer(new StringReader(sentence));
  List<? extends HasWord> tokens = tokenizer.tokenize();
  return tokens;
}
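In isolation, the same lookup-then-tokenize pattern can be sketched with a concrete language pack. Here PennTreebankLanguagePack stands in for whatever treebankLanguagePack() returns in the parser, the class name is hypothetical, and CoreNLP is assumed to be on the classpath.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class TlpTokenizeDemo {
  public static void main(String[] args) {
    // ask the (English) language pack for its default tokenizer factory
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    TokenizerFactory<? extends HasWord> tf = tlp.getTokenizerFactory();
    Tokenizer<? extends HasWord> tokenizer =
        tf.getTokenizer(new StringReader("It's a no-brainer, isn't it?"));
    List<? extends HasWord> tokens = tokenizer.tokenize();
    for (HasWord t : tokens) {
      System.out.println(t.word());
    }
  }
}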
/**
 * The main() method tokenizes a file in the specified Encoding
 * and prints it to standard output in the specified Encoding.
 * Its arguments are (Infile, Encoding).
 */
public static void main(String[] args) throws IOException {
  if (args.length < 2) {
    log.error("Usage: CHTBTokenizer inputFile encoding");
    return;
  }
  String encoding = args[1];
  Reader in = IOUtils.readerFromString(args[0], encoding);
  for (Tokenizer<String> st = new CHTBTokenizer(in); st.hasNext(); ) {
    String s = st.next();
    EncodingPrintWriter.out.println(s, encoding);
    // EncodingPrintWriter.out.println("|" + s + "| (" + s.length() + ")", encoding);
  }
}
String word = wordTagPair[0];
if (tokFactory != null) {
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {
    continue;