/**
 * Returns true if the tokenizer produces no tokens for the given character,
 * i.e. the character is deleted (discarded) by the tokenizer.
 */
private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
  List<CoreLabel> tokens = tf.getTokenizer(new StringReader(Character.toString(ch))).tokenize();
  return tokens.isEmpty();
}
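As a sanity check, a minimal standalone driver for the helper above could look like the sketch below. It assumes CoreNLP is on the classpath and uses PTBTokenizer.coreLabelFactory() as the factory; the class name is hypothetical, and which characters actually get deleted depends on the tokenizer's options.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class DeletedCharDemo {

  // same logic as the helper above
  private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(Character.toString(ch))).tokenize();
    return tokens.isEmpty();
  }

  public static void main(String[] args) {
    TokenizerFactory<CoreLabel> tf = PTBTokenizer.coreLabelFactory();
    // 'a' should survive tokenization; a NUL control character is likely discarded,
    // though the exact set of deleted characters depends on the tokenizer options.
    System.out.println("'a' deleted? " + isDeletedCharacter('a', tf));
    System.out.println("U+0000 deleted? " + isDeletedCharacter('\u0000', tf));
  }
}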
Tree t = null;
while (tokenizer.hasNext() && t == null) {
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. The default implementation uses the current tokenizer and
 * tokenizes the entirety of the text into words. Subclasses should override
 * this method to parse documents in non-standard formats, and/or to pull the
 * title of the document from the text. The given text may be empty ("") but
 * will never be null. Subclasses may want to do additional processing and
 * then just call super.parse.
 *
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  Tokenizer<Word> toke = tokenizerFactory.getTokenizer(new StringReader(text));
  addAll(toke.tokenize());
}
/**
 * Returns the tokens of the given string, using the PTB tokenizer.
 *
 * @param str String to tokenize
 * @return Array of tokens
 */
private String[] ptbTokenize(String str) {
  // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers
  if (ptbFactory == null) {
    ptbFactory = PTBTokenizer.factory();
  }
  Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str));
  List<Word> words = tokenizer.tokenize();
  String[] res = new String[words.size()];
  for (int i = 0, sz = words.size(); i < sz; i++) {
    res[i] = words.get(i).word();
  }
  return res;
}
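For reference, the same PTB-factory pattern can be exercised standalone, roughly as follows. This is a sketch assuming CoreNLP on the classpath; the class name and sample sentence are illustrative only.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class PtbTokenizeDemo {
  public static void main(String[] args) {
    TokenizerFactory<Word> factory = PTBTokenizer.factory();
    Tokenizer<Word> tokenizer =
        factory.getTokenizer(new StringReader("Dr. Smith bought 300 shares of Acme Corp. in 2006."));
    List<Word> words = tokenizer.tokenize();
    // print one token per line, mirroring what ptbTokenize() packs into a String[]
    for (Word w : words) {
      System.out.println(w.word());
    }
  }
}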
@Override
protected HasWord getNext() {
  while (wordIter == null || ! wordIter.hasNext()) {
    if ( ! tok.hasNext()) {
      return null;
    }
    CoreLabel token = tok.next();
    String s = token.word();
    if (s == null) {
      return null;
    }
    if (s.equals(WhitespaceLexer.NEWLINE)) {
      // if newlines were significant, we should make sure to return
      // them when we see them
      List<HasWord> se = Collections.<HasWord>singletonList(token);
      wordIter = se.iterator();
    } else {
      List<HasWord> se = wordSegmenter.segment(s);
      wordIter = se.iterator();
    }
  }
  return wordIter.next();
}
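The control flow above is a lazy flatten: pull one raw token at a time, segment it, and hand out the segments one by one. The toy class below illustrates the same pattern with plain JDK types only; the whitespace "tokenizer" and hyphen "segmenter" are stand-ins, not the CoreNLP classes used above.

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

public class LazySegmentingIterator implements Iterator<String> {

  private final Iterator<String> rawTokens;
  private Iterator<String> segmentIter = Collections.emptyIterator();

  public LazySegmentingIterator(String text) {
    // stand-in "tokenizer": split the input on whitespace
    this.rawTokens = Arrays.asList(text.split("\\s+")).iterator();
  }

  @Override
  public boolean hasNext() {
    // refill the segment iterator lazily, only when the previous token is exhausted
    while (!segmentIter.hasNext()) {
      if (!rawTokens.hasNext()) {
        return false;
      }
      // stand-in "segmenter": split the next raw token on hyphens
      List<String> segments = Arrays.asList(rawTokens.next().split("-"));
      segmentIter = segments.iterator();
    }
    return true;
  }

  @Override
  public String next() {
    return segmentIter.next();
  }

  public static void main(String[] args) {
    Iterator<String> it = new LazySegmentingIterator("state-of-the-art word-segmenter demo");
    while (it.hasNext()) {
      System.out.println(it.next());
    }
  }
}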
/**
 * Tokenize the text using the parser's tokenizer.
 */
public List<? extends HasWord> tokenize(String sentence) {
  TokenizerFactory<? extends HasWord> tf = treebankLanguagePack().getTokenizerFactory();
  Tokenizer<? extends HasWord> tokenizer = tf.getTokenizer(new StringReader(sentence));
  List<? extends HasWord> tokens = tokenizer.tokenize();
  return tokens;
}
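In isolation, the same lookup-then-tokenize pattern can be sketched with a concrete language pack. Here PennTreebankLanguagePack stands in for whatever treebankLanguagePack() returns in the parser, the class name is hypothetical, and CoreNLP is assumed to be on the classpath.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class TlpTokenizeDemo {
  public static void main(String[] args) {
    // ask the (English) language pack for its default tokenizer factory
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    TokenizerFactory<? extends HasWord> tf = tlp.getTokenizerFactory();
    Tokenizer<? extends HasWord> tokenizer =
        tf.getTokenizer(new StringReader("It's a no-brainer, isn't it?"));
    List<? extends HasWord> tokens = tokenizer.tokenize();
    for (HasWord t : tokens) {
      System.out.println(t.word());
    }
  }
}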
/**
 * The main() method tokenizes a file in the specified Encoding
 * and prints it to standard output in the specified Encoding.
 * Its arguments are (Infile, Encoding).
 */
public static void main(String[] args) throws IOException {
  if (args.length < 2) {
    log.error("Usage: CHTBTokenizer inputFile encoding");
    return;
  }
  String encoding = args[1];
  Reader in = IOUtils.readerFromString(args[0], encoding);
  for (Tokenizer<String> st = new CHTBTokenizer(in); st.hasNext(); ) {
    String s = st.next();
    EncodingPrintWriter.out.println(s, encoding);
    // EncodingPrintWriter.out.println("|" + s + "| (" + s.length() + ")", encoding);
  }
}
String word = wordTagPair[0];
if (tokFactory != null) {
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {
    continue;