// NOTE(review): fragment cut mid-method — the for-loop body continues beyond this chunk.
// Sets up the scan: snapshot the input string as a char[], reset the cursor, and open a
// root IN_SENTENCE state before walking the text one character at a time.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
// NOTE(review): fragment — the if-body is not visible in this chunk. Fetches the word
// under construction and locates its last hyphen; presumably feeds dash-splitting logic
// downstream — confirm against the full source.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
/**
 * Pop the current state identifier off the stack.
 *
 * @param where the position to terminate the previous token and start the new one.
 * @return true if popped the last item off the stack.
 */
protected boolean pop(int where) {
    // Close out the topmost state and keep it only if it captured any characters.
    State finished = stack.remove(stack.size() - 1);
    finished.pop(where);
    if (finished.size() > 0)
        completed.add(finished);

    // If we emptied the stack, immediately reopen a sentence state starting just
    // past the terminator so the machine is never without an active state.
    boolean emptied = stack.isEmpty();
    if (emptied)
        push(new State(TokenizerState.IN_SENTENCE), where + 1);

    // Either way, the machine's state index now tracks the (new) top of stack.
    this.state = stack.get(stack.size() - 1).stateIndex();
    return emptied;
}
// NOTE(review): garbled extraction — several original source lines were collapsed onto
// this one physical line, so everything after the "// make 's" comment marker is dead
// text here (the contraction branches for 's/n't/'ve/'ll never execute in this form).
// Re-extract from the original file before reviewing semantics; do not edit as-is.
if (getCurrent().isNumeric()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): garbled extraction — multiple original lines collapsed onto one; the
// "// make 's" comment marker swallows the remainder of the physical line, so the
// trailing contraction branches are dead text in this form. Same shape as the
// isNumeric() chunk but gated on isDate(); needs re-extraction before semantic review.
if (getCurrent().isDate()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): duplicate of the previous garbled chunk — collapsed multi-line
// extraction; everything after "// make 's" is commented out on this physical line.
// Do not edit in this form; recover the original line structure first.
if (getCurrent().isDate()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): fragment (duplicate chunk) — the if-body is cut off here. Looks up the
// last hyphen in the current word; presumably part of dash-splitting — confirm upstream.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
// NOTE(review): fragment (duplicate chunk) — if-body not visible; see the full source
// for what happens when a hyphen is found in the current word.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
// NOTE(review): garbled fragment — two separate passes over tsm.completed have been
// fused onto one line (the loop variable `s` is redeclared, which would not compile as
// shown). First pass apparently counts sentence states; second pass records sentence
// end indices into sentenceEnds. Restore the original line breaks before editing.
int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx == TokenizerState.IN_SENTENCE.ordinal()) sentences++; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) sentenceEnds[sentenceIndex++] = wordIndex; else {
// NOTE(review): duplicate of the previous garbled fragment — two fused loops with a
// redeclared `s`; counts sentence states then maps sentence boundaries to word indices.
// Not editable in this collapsed form.
int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx == TokenizerState.IN_SENTENCE.ordinal()) sentences++; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) sentenceEnds[sentenceIndex++] = wordIndex; else {
// NOTE(review): fragment (duplicate chunk) — loop body continues past this chunk.
// Initializes the character scan and opens the root IN_SENTENCE state.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
// NOTE(review): fragment (duplicate chunk) — see the full source for the loop body.
// Snapshot input, reset cursor, push root sentence state, then iterate characters.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
// Handle a pending punctuation token: fold an apostrophe-contraction or a decimal
// point into the current word, otherwise close the token and start a new word.
// NOTE(review): duplicate chunk of the contraction-handling process() above.
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }
// NOTE(review): duplicate chunk of pop() above. Removes the top state, terminates its
// token at `where`, archives it if non-empty, reopens a root IN_SENTENCE state when the
// stack empties (so the machine always has an active state), and syncs this.state with
// the new top of stack. Returns true iff the last stacked state was popped.
/** * Pop the current state identifier off the stack. * * @param where the position to terminate the previous token and start the new one. * @return true if popped the last item off the stack. */ protected boolean pop(int where) { State s = stack.remove(stack.size() - 1); s.pop(where); if (s.size() > 0) completed.add(s); if (stack.size() == 0) { push(new State(TokenizerState.IN_SENTENCE), where + 1); this.state = stack.get(stack.size() - 1).stateIndex(); return true; } else { this.state = stack.get(stack.size() - 1).stateIndex(); return false; } }
@Override public void process(char token) { // we have something, so the paragraph has mass. stack.get(stack.size()-1).hasMass = true; if (token == '$') { Character next = peek(1); if (Character.isDigit(next) || ( next == '.' && Character.isDigit(peek(2)))) { push(new State(TokenizerState.IN_WORD), current); } else { push(new State(TokenizerState.IN_SPECIAL), current); } } else { // this was just push IN_SPECIAL, added the push in_word to match the // old tokenizer push(new State(TokenizerState.IN_SPECIAL), current); } } },
// Mark the paragraph as non-empty, then classify the character: '$' starting a money
// amount ("$5", "$.5") opens a word token, any other special opens an IN_SPECIAL token.
// NOTE(review): duplicate chunk; peek(1) is unboxed via Character.isDigit — if peek can
// return null at end of input this NPEs; confirm peek's contract.
@Override public void process(char token) { // we have something, so the paragraph has mass. stack.get(stack.size()-1).hasMass = true; if (token == '$') { Character next = peek(1); if (Character.isDigit(next) || ( next == '.' && Character.isDigit(peek(2)))) { push(new State(TokenizerState.IN_WORD), current); } else { push(new State(TokenizerState.IN_SPECIAL), current); } } else { // this was just push IN_SPECIAL, added the push in_word to match the // old tokenizer push(new State(TokenizerState.IN_SPECIAL), current); } } },