// NOTE(review): fragment cut mid-method — the for-loop body continues beyond this chunk.
// Sets up the scan: snapshot the input string as a char[], reset the cursor, and open a
// root IN_SENTENCE state before walking the text one character at a time.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
// NOTE(review): fragment — the if-body is not visible in this chunk. Fetches the word
// under construction and locates its last hyphen; presumably feeds dash-splitting logic
// downstream — confirm against the full source.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
/**
 * Pop the current state identifier off the stack.
 *
 * @param where the position to terminate the previous token and start the new one.
 * @return true if popped the last item off the stack.
 */
protected boolean pop(int where) {
    // Close out the topmost state and keep it only if it captured any characters.
    State finished = stack.remove(stack.size() - 1);
    finished.pop(where);
    if (finished.size() > 0)
        completed.add(finished);

    // If we emptied the stack, immediately reopen a sentence state starting just
    // past the terminator so the machine is never without an active state.
    boolean emptied = stack.isEmpty();
    if (emptied)
        push(new State(TokenizerState.IN_SENTENCE), where + 1);

    // Either way, the machine's state index now tracks the (new) top of stack.
    this.state = stack.get(stack.size() - 1).stateIndex();
    return emptied;
}
// NOTE(review): garbled extraction — several original source lines were collapsed onto
// this one physical line, so everything after the "// make 's" comment marker is dead
// text here (the contraction branches for 's/n't/'ve/'ll never execute in this form).
// Re-extract from the original file before reviewing semantics; do not edit as-is.
if (getCurrent().isNumeric()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): garbled extraction — multiple original lines collapsed onto one; the
// "// make 's" comment marker swallows the remainder of the physical line, so the
// trailing contraction branches are dead text in this form. Same shape as the
// isNumeric() chunk but gated on isDate(); needs re-extraction before semantic review.
if (getCurrent().isDate()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): duplicate of the previous garbled chunk — collapsed multi-line
// extraction; everything after "// make 's" is commented out on this physical line.
// Do not edit in this form; recover the original line structure first.
if (getCurrent().isDate()) { int advance = 1; while (true) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter if (getCurrent().isNumeric()) { if (current < (text.length - 1) && Character.isDigit(text[current + 1])) { push(new State(TokenizerState.IN_SPECIAL), current); // No matter push(new State(TokenizerState.IN_SPECIAL), current); push(new State(TokenizerState.IN_WORD), current); // make 's && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 't' && pc == 'n' && (nnc == '\000' || Character.isWhitespace(nnc))) { pop(current - 1); push(new State(TokenizerState.IN_WORD), current - 1); } else if (nc == 'v' && nnc == 'e' && (nnnc == '\000' || Character.isWhitespace(nnnc))) { pop(current); push(new State(TokenizerState.IN_WORD), current); } else if (nc == 'l' && nnc == 'l' && (nnnc == '\000' || Character.isWhitespace(nnnc))) {
// NOTE(review): fragment (duplicate chunk) — the if-body is cut off here. Looks up the
// last hyphen in the current word; presumably part of dash-splitting — confirm upstream.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
// NOTE(review): fragment (duplicate chunk) — if-body not visible; see the full source
// for what happens when a hyphen is found in the current word.
String term = getWord(); int lastdash = term.lastIndexOf("-"); if (lastdash != -1)
// NOTE(review): garbled fragment — two separate passes over tsm.completed have been
// fused onto one line (the loop variable `s` is redeclared, which would not compile as
// shown). First pass apparently counts sentence states; second pass records sentence
// end indices into sentenceEnds. Restore the original line breaks before editing.
int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx == TokenizerState.IN_SENTENCE.ordinal()) sentences++; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) sentenceEnds[sentenceIndex++] = wordIndex; else {
// NOTE(review): duplicate of the previous garbled fragment — two fused loops with a
// redeclared `s`; counts sentence states then maps sentence boundaries to word indices.
// Not editable in this collapsed form.
int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx == TokenizerState.IN_SENTENCE.ordinal()) sentences++; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) sentenceEnds[sentenceIndex++] = wordIndex; else {
// NOTE(review): fragment (duplicate chunk) — loop body continues past this chunk.
// Initializes the character scan and opens the root IN_SENTENCE state.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
// NOTE(review): fragment (duplicate chunk) — see the full source for the loop body.
// Snapshot input, reset cursor, push root sentence state, then iterate characters.
this.text = this.textstring.toCharArray(); current = 0; this.push(new State(TokenizerState.IN_SENTENCE), current); for (current = 0; current < this.text.length; current++) { char character = this.text[current];
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
// Handle a pending punctuation token: fold an apostrophe-contraction or a decimal
// point into the current word, otherwise close the token and start a new word.
// NOTE(review): duplicate chunk of the contraction-handling process() above.
@Override public void process(char token) { String cword = getCurrent().getWord(); // let's see if this is a contraction. if (cword.equals("'")) { String word = getNextWord(); if (Contractions.contains(word)) { // just change the state type to text, this will end up being a // word. getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } } else if (cword.equals(".") && Character.isDigit(token)) { // This is a decimal number (probably), just keep the current state and // make it a word token getCurrent().stateindex = TokenType.TEXT.ordinal(); state = getCurrent().stateindex; return; } pop(current); push(new State(TokenizerState.IN_WORD), current); } },
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }
// NOTE(review): duplicate chunk of pop() above. Removes the top state, terminates its
// token at `where`, archives it if non-empty, reopens a root IN_SENTENCE state when the
// stack empties (so the machine always has an active state), and syncs this.state with
// the new top of stack. Returns true iff the last stacked state was popped.
/** * Pop the current state identifier off the stack. * * @param where the position to terminate the previous token and start the new one. * @return true if popped the last item off the stack. */ protected boolean pop(int where) { State s = stack.remove(stack.size() - 1); s.pop(where); if (s.size() > 0) completed.add(s); if (stack.size() == 0) { push(new State(TokenizerState.IN_SENTENCE), where + 1); this.state = stack.get(stack.size() - 1).stateIndex(); return true; } else { this.state = stack.get(stack.size() - 1).stateIndex(); return false; } }
@Override public void process(char token) { // we have something, so the paragraph has mass. stack.get(stack.size()-1).hasMass = true; if (token == '$') { Character next = peek(1); if (Character.isDigit(next) || ( next == '.' && Character.isDigit(peek(2)))) { push(new State(TokenizerState.IN_WORD), current); } else { push(new State(TokenizerState.IN_SPECIAL), current); } } else { // this was just push IN_SPECIAL, added the push in_word to match the // old tokenizer push(new State(TokenizerState.IN_SPECIAL), current); } } },
// Mark the paragraph as non-empty, then classify the character: '$' starting a money
// amount ("$5", "$.5") opens a word token, any other special opens an IN_SPECIAL token.
// NOTE(review): duplicate chunk; peek(1) is unboxed via Character.isDigit — if peek can
// return null at end of input this NPEs; confirm peek's contract.
@Override public void process(char token) { // we have something, so the paragraph has mass. stack.get(stack.size()-1).hasMass = true; if (token == '$') { Character next = peek(1); if (Character.isDigit(next) || ( next == '.' && Character.isDigit(peek(2)))) { push(new State(TokenizerState.IN_WORD), current); } else { push(new State(TokenizerState.IN_SPECIAL), current); } } else { // this was just push IN_SPECIAL, added the push in_word to match the // old tokenizer push(new State(TokenizerState.IN_SPECIAL), current); } } },