/**
 * Returns a thread-safe tokenizer reading from the given character source.
 *
 * @param r the reader supplying the text to tokenize
 * @return a tokenizer over {@code r} producing {@link CoreLabel} tokens
 */
public Tokenizer<CoreLabel> getTokenizer(Reader r) {
  return factory.getTokenizer(r);
}
/**
 * Reports whether the tokenizer silently deletes the given character:
 * tokenizing the one-character string yields no tokens at all.
 *
 * @param ch the character to probe
 * @param tf the tokenizer factory to probe with
 * @return true iff tokenizing {@code ch} produces an empty token list
 */
private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
  String single = String.valueOf(ch);
  return tf.getTokenizer(new StringReader(single)).tokenize().isEmpty();
}
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. This default implementation runs the current tokenizer over
 * the entire text. Subclasses should override to handle non-standard
 * document formats and/or to extract a title, typically doing their own
 * processing and then delegating via {@code super.parse}. The text may be
 * empty ("") but is never null.
 *
 * @param text the full document text to tokenize
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  StringReader reader = new StringReader(text);
  addAll(tokenizerFactory.getTokenizer(reader).tokenize());
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
/**
 * Tokenizes a sentence with the tokenizer supplied by this parser's
 * treebank language pack.
 *
 * @param sentence the raw sentence text
 * @return the tokens produced by the language pack's tokenizer
 */
public List<? extends HasWord> tokenize(String sentence) {
  TokenizerFactory<? extends HasWord> factory = treebankLanguagePack().getTokenizerFactory();
  return factory.getTokenizer(new StringReader(sentence)).tokenize();
}
} else {
  if (eolIsSignificant) {
    // Newlines matter downstream: request newline tokens from the factory.
    tokenizer = tokenizerFactory.getTokenizer(inputReader, "tokenizeNLs");
  } else {
    tokenizer = tokenizerFactory.getTokenizer(inputReader);
String word = wordTagPair[0];
if (tokFactory != null) {
  // Re-tokenize the word form; characters the lexer deletes can yield zero tokens,
  // in which case this pair is skipped.
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {
    continue;
/** * Tokenizes the highlighted text (using a tokenizer appropriate for the * selected language, and initiates the ParseThread to parse the tokenized * text. */ public void parse() { if (textPane.getText().length() == 0) { return; } // use endIndex+1 because substring subtracts 1 String text = textPane.getText().substring(startIndex, endIndex + 1).trim(); if (parser != null && text.length() > 0) { //Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new CharArrayReader(text.toCharArray())); Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(text)); List<? extends HasWord> wordList = toke.tokenize(); parseThread = new ParseThread(wordList); parseThread.start(); startProgressMonitor("Parsing", PARSE_TIME); } }
// Tokenize the input line and print the tokens as a space-separated string.
List<CoreLabel> tokenizedLine = tf.getTokenizer(new StringReader(line)).tokenize();
System.out.println(SentenceUtils.listToString(tokenizedLine));
private List<CoreLabel> segmentStringToIOB(String line) { List<CoreLabel> tokenList; if (tf == null) { // Whitespace tokenization. tokenList = IOBUtils.StringToIOB(line); } else { List<CoreLabel> tokens = tf.getTokenizer(new StringReader(line)).tokenize(); tokenList = IOBUtils.StringToIOB(tokens, null, false, tf, line); } IOBUtils.labelDomain(tokenList, domain); tokenList = classifier.classify(tokenList); return tokenList; }
final long startTime = System.nanoTime();
try {
  // Tokenize stdin with the configured encoding; buffer both input and output streams.
  Tokenizer<CoreLabel> tokenizer =
      tf.getTokenizer(new BufferedReader(new InputStreamReader(System.in, encoding)));
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, encoding));
  boolean printSpace = false;
try {
  // NOTE(review): literal charset name forces a checked UnsupportedEncodingException path;
  // StandardCharsets.UTF_8 would avoid it — confirm the surrounding catch before changing.
  final String encoding = "UTF-8";
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;
  while (tokenizer.hasNext()) {
final long startTime = System.nanoTime();
try {
  // Tokenize stdin with the configured encoding, one token at a time.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;
  while (tokenizer.hasNext()) {
// Tokenize both the raw and rewritten forms; they are expected to align one-to-one.
List<CoreLabel> lexListRaw = tf.getTokenizer(new StringReader(raw)).tokenize();
List<CoreLabel> lexListRewritten = tf.getTokenizer(new StringReader(rewritten)).tokenize();
if (lexListRewritten.size() != lexListRaw.size()) {
  System.err.printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.getClass().getName(), raw, rewritten);
  // presumably a fallback: tokenize `in` and convert to IOB instead — TODO confirm against
  // the enclosing method, which is cut off in this excerpt.
  List<CoreLabel> line = tf.getTokenizer(new StringReader(in)).tokenize();
  tokenList = IOBUtils.StringToIOB(line, segMarker, false);
// Tokenize the sentence string into CoreLabel tokens.
List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();
i = trees.iterator();
} else {
  // No in-memory trees supplied: read trees from stdin via the tree tokenizer.
  i = tlp.treeTokenizerFactory().getTokenizer(new BufferedReader(new InputStreamReader(System.in)));
// NOTE(review): the tokenizer returned here is discarded; `toke` used on the next
// line must be assigned on a preceding line outside this excerpt — confirm this
// call is not a dead statement (or a dropped assignment).
tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
List<? extends HasWord> sentence2 = toke.tokenize();
/**
 * Returns an iterator over token lists read from the given reader.
 * (The remainder of the method body lies beyond this excerpt.)
 *
 * @param r the character source to tokenize
 */
@Override
public Iterator<List<IN>> getIterator(Reader r) {
  Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r);
// presumably the RHS of an assignment started on a line above this excerpt
// (the factory value is otherwise discarded) — TODO confirm.
PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
// Tokenize the sentence and parse the resulting tokens.
Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
List<CoreLabel> rawWords2 = tok.tokenize();
parse = lp.apply(rawWords2);
/**
 * Tokenizes the text using this parser's tokenizer, as configured by its
 * treebank language pack.
 *
 * @param sentence the sentence to tokenize
 * @return the resulting token list
 */
public List<? extends HasWord> tokenize(String sentence) {
  Tokenizer<? extends HasWord> tokenizer =
      treebankLanguagePack().getTokenizerFactory().getTokenizer(new StringReader(sentence));
  return tokenizer.tokenize();
}