/** * Returns the next token. * * @return the next token if it exists, <code>null</code> if no more tokens */ public Token next() { lastToken = token; token = new Token(); // Skip whitespace token.setWhitespace(getTokenOfCharClass(whitespaceSymbols)); // quoted strings currently ignored // get prepunctuation token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols)); // get the symbol itself if (singleCharSymbols.indexOf(currentChar) != -1) { token.setWord(String.valueOf((char) currentChar)); getNextChar(); } else { token.setWord(getTokenNotOfCharClass(whitespaceSymbols)); } token.setPosition(currentPosition); token.setLineNumber(lineNumber); // This'll have token *plus* postpunctuation // Get postpunctuation removeTokenPostpunctuation(); return token; }
// Walks the tokenizer until hasNext() reports no further tokens. For each
// token, the whitespace, prepunctuation and postpunctuation strings are
// stored in featureSet under the keys "whitespace", "prepunctuation" and
// "punc"; the token position and line number are stored as strings under
// "file_pos" and "line_number".
// tokenWord is captured here but not used in the visible part of the loop;
// the loop body continues beyond this excerpt.
while (tokenizer.hasNext()) { Token token = tokenizer.next(); String tokenWord = token.getWord(); featureSet.setString("whitespace", token.getWhitespace()); featureSet.setString("prepunctuation", token.getPrepunctuation()); featureSet.setString("punc", token.getPostpunctuation()); featureSet.setString("file_pos", String.valueOf(token.getPosition())); featureSet.setString("line_number", String.valueOf(token.getLineNumber()));
/** * Removes the postpunctuation characters from the current token. Copies * those postpunctuation characters to the class variable * 'postpunctuation'. */ private void removeTokenPostpunctuation() { if (token == null) { return; } final String tokenWord = token.getWord(); int tokenLength = tokenWord.length(); int position = tokenLength - 1; while (position > 0 && postpunctuationSymbols.indexOf((int) tokenWord .charAt(position)) != -1) { position--; } if (tokenLength - 1 != position) { // Copy postpunctuation from token token.setPostpunctuation(tokenWord.substring(position + 1)); // truncate token at postpunctuation token.setWord(tokenWord.substring(0, position + 1)); } else { token.setPostpunctuation(""); } }
// Fragment of a boolean-returning check (enclosing method header is not
// visible here). It reads the current token's whitespace and, when a
// previous token exists, that token's postpunctuation. It returns true when
// the previous postpunctuation contains '.', the whitespace is longer than
// one character, and the current word starts with an upper-case letter.
// NOTE(review): as shown, the `else if` branch is reached only when
// lastToken == null, so lastTokenPostpunctuation is still null there and
// the final `else` would also dereference a null lastToken. The dangling
// `&&` operands suggest lines were dropped from this excerpt - confirm
// against the complete method before treating this as a real defect.
String tokenWhiteSpace = token.getWhitespace(); String lastTokenPostpunctuation = null; if (lastToken != null) { lastTokenPostpunctuation = lastToken.getPostpunctuation(); } else if (lastTokenPostpunctuation.indexOf('.') != -1 && tokenWhiteSpace.length() > 1 && Character.isUpperCase(token.getWord().charAt(0))) { return true; } else { String lastWord = lastToken.getWord(); int lastWordLength = lastWord.length(); && Character.isUpperCase(token.getWord().charAt(0)) &&
// Fragment of a boolean-returning check (enclosing method header is not
// visible here). It reads the current token's whitespace and, when a
// previous token exists, that token's postpunctuation. It returns true when
// the previous postpunctuation contains '.', the whitespace is longer than
// one character, and the current word starts with an upper-case letter.
// NOTE(review): as shown, the `else if` branch is reached only when
// lastToken == null, so lastTokenPostpunctuation is still null there and
// the final `else` would also dereference a null lastToken. The dangling
// `&&` operands suggest lines were dropped from this excerpt - confirm
// against the complete method before treating this as a real defect.
String tokenWhiteSpace = token.getWhitespace(); String lastTokenPostpunctuation = null; if (lastToken != null) { lastTokenPostpunctuation = lastToken.getPostpunctuation(); } else if (lastTokenPostpunctuation.indexOf('.') != -1 && tokenWhiteSpace.length() > 1 && Character.isUpperCase(token.getWord().charAt(0))) { return true; } else { String lastWord = lastToken.getWord(); int lastWordLength = lastWord.length(); && Character.isUpperCase(token.getWord().charAt(0)) &&
/** * Returns the next token. * * @return the next token if it exists, <code>null</code> if no more tokens */ public Token next() { lastToken = token; token = new Token(); // Skip whitespace token.setWhitespace(getTokenOfCharClass(whitespaceSymbols)); // quoted strings currently ignored // get prepunctuation token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols)); // get the symbol itself if (singleCharSymbols.indexOf(currentChar) != -1) { token.setWord(String.valueOf((char) currentChar)); getNextChar(); } else { token.setWord(getTokenNotOfCharClass(whitespaceSymbols)); } token.setPosition(currentPosition); token.setLineNumber(lineNumber); // This'll have token *plus* postpunctuation // Get postpunctuation removeTokenPostpunctuation(); return token; }
// Walks the tokenizer until hasNext() reports no further tokens. For each
// token, the whitespace, prepunctuation and postpunctuation strings are
// stored in featureSet under the keys "whitespace", "prepunctuation" and
// "punc"; the token position and line number are stored as strings under
// "file_pos" and "line_number".
// tokenWord is captured here but not used in the visible part of the loop;
// the loop body continues beyond this excerpt.
while (tokenizer.hasNext()) { Token token = tokenizer.next(); String tokenWord = token.getWord(); featureSet.setString("whitespace", token.getWhitespace()); featureSet.setString("prepunctuation", token.getPrepunctuation()); featureSet.setString("punc", token.getPostpunctuation()); featureSet.setString("file_pos", String.valueOf(token.getPosition())); featureSet.setString("line_number", String.valueOf(token.getLineNumber()));
/** * Removes the postpunctuation characters from the current token. Copies * those postpunctuation characters to the class variable * 'postpunctuation'. */ private void removeTokenPostpunctuation() { if (token == null) { return; } final String tokenWord = token.getWord(); int tokenLength = tokenWord.length(); int position = tokenLength - 1; while (position > 0 && postpunctuationSymbols.indexOf((int) tokenWord .charAt(position)) != -1) { position--; } if (tokenLength - 1 != position) { // Copy postpunctuation from token token.setPostpunctuation(tokenWord.substring(position + 1)); // truncate token at postpunctuation token.setWord(tokenWord.substring(0, position + 1)); } else { token.setPostpunctuation(""); } }