/**
 * Creates a tokenizer over the supplied character stream.
 *
 * @param r the Reader whose contents are to be tokenized
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and this factory's options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r) {
  final PTBTokenizer<T> tokenizer = new PTBTokenizer<>(r, factory, options);
  return tokenizer;
}
/**
 * Creates a tokenizer over the supplied character stream, configured with this
 * factory's options followed by the caller's extra options.
 *
 * <p>The two option strings are joined with a comma only when both are
 * non-empty. If this factory has no options, {@code extraOptions} is passed
 * through unchanged (including {@code null}). If {@code extraOptions} is
 * {@code null} or empty, only this factory's options are used, so the merged
 * string never contains a literal {@code "null"} entry or a dangling comma.
 *
 * @param r the Reader whose contents are to be tokenized
 * @param extraOptions additional comma-separated options; may be {@code null} or empty
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and the merged options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  final boolean hasBaseOptions = options != null && !options.isEmpty();
  final boolean hasExtraOptions = extraOptions != null && !extraOptions.isEmpty();

  final String mergedOptions;
  if (!hasBaseOptions) {
    // Nothing to prepend: hand the caller's string through exactly as given.
    mergedOptions = extraOptions;
  } else if (hasExtraOptions) {
    mergedOptions = options + ',' + extraOptions;
  } else {
    // Avoid producing "opts,null" or "opts," when there is nothing to append.
    mergedOptions = options;
  }
  return new PTBTokenizer<>(r, factory, mergedOptions);
}
/**
 * Builds a PTBTokenizer over the given Reader that emits {@link Word} tokens.
 * The tokenizer is created with an empty options string; carriage returns are
 * treated as normal whitespace.
 *
 * @param r The Reader whose contents will be tokenized
 * @return A PTBTokenizer that tokenizes the stream into objects of type
 *         {@link Word}
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
  final WordTokenFactory wordFactory = new WordTokenFactory();
  final String noOptions = "";
  return new PTBTokenizer<>(r, wordFactory, noOptions);
}
/**
 * Builds a PTBTokenizer over the given Reader that emits CoreLabel tokens.
 * Newlines can optionally be returned as their own tokens; such tokens carry
 * the text {@code AbstractTokenizer.NEWLINE_TOKEN}.
 *
 * @param r The Reader to read tokens from
 * @param tokenizeNLs Whether to return newlines as separate tokens
 *        (otherwise they normally disappear as whitespace)
 * @param invertible If true, the produced CoreLabels will have fields for the
 *        string before and after the token, and the character offsets
 * @return A PTBTokenizer which returns CoreLabel objects
 */
public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
  final CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
  // NOTE(review): the fourth constructor argument is fixed to false here; its
  // meaning is defined by a constructor that is not visible in this chunk.
  return new PTBTokenizer<>(r, tokenizeNLs, invertible, false, labelFactory);
}
for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) { CoreLabel obj = tokenizer.next();
/**
 * Creates a tokenizer over the supplied character stream.
 *
 * @param r the Reader whose contents are to be tokenized
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and this factory's options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r) {
  final PTBTokenizer<T> tokenizer = new PTBTokenizer<>(r, factory, options);
  return tokenizer;
}
/**
 * Creates a tokenizer over the supplied character stream.
 *
 * @param r the Reader whose contents are to be tokenized
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and this factory's options string
 */
public Tokenizer<T> getTokenizer(Reader r) {
  final PTBTokenizer<T> tokenizer = new PTBTokenizer<T>(r, factory, options);
  return tokenizer;
}
/**
 * Creates a tokenizer over the supplied character stream.
 *
 * @param r the Reader whose contents are to be tokenized
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and this factory's options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r) {
  final PTBTokenizer<T> tokenizer = new PTBTokenizer<T>(r, factory, options);
  return tokenizer;
}
/**
 * Creates a tokenizer over the supplied character stream.
 *
 * @param r the Reader whose contents are to be tokenized
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and this factory's options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r) {
  final PTBTokenizer<T> tokenizer = new PTBTokenizer<>(r, factory, options);
  return tokenizer;
}
/**
 * Creates a tokenizer over the supplied character stream, configured with this
 * factory's options followed by the caller's extra options.
 *
 * <p>The two option strings are joined with a comma only when both are
 * non-empty. If this factory has no options, {@code extraOptions} is passed
 * through unchanged (including {@code null}). If {@code extraOptions} is
 * {@code null} or empty, only this factory's options are used, so the merged
 * string never contains a literal {@code "null"} entry or a dangling comma.
 *
 * @param r the Reader whose contents are to be tokenized
 * @param extraOptions additional comma-separated options; may be {@code null} or empty
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and the merged options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  final boolean hasBaseOptions = options != null && !options.isEmpty();
  final boolean hasExtraOptions = extraOptions != null && !extraOptions.isEmpty();

  final String mergedOptions;
  if (!hasBaseOptions) {
    // Nothing to prepend: hand the caller's string through exactly as given.
    mergedOptions = extraOptions;
  } else if (hasExtraOptions) {
    mergedOptions = options + ',' + extraOptions;
  } else {
    // Avoid producing "opts,null" or "opts," when there is nothing to append.
    mergedOptions = options;
  }
  return new PTBTokenizer<>(r, factory, mergedOptions);
}
/**
 * Creates a tokenizer over the supplied character stream, configured with this
 * factory's options followed by the caller's extra options.
 *
 * <p>The two option strings are joined with a comma only when both are
 * non-empty. If this factory has no options, {@code extraOptions} is passed
 * through unchanged (including {@code null}). If {@code extraOptions} is
 * {@code null} or empty, only this factory's options are used, so the merged
 * string never contains a literal {@code "null"} entry or a dangling comma.
 *
 * @param r the Reader whose contents are to be tokenized
 * @param extraOptions additional comma-separated options; may be {@code null} or empty
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and the merged options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  final boolean hasBaseOptions = options != null && !options.isEmpty();
  final boolean hasExtraOptions = extraOptions != null && !extraOptions.isEmpty();

  final String mergedOptions;
  if (!hasBaseOptions) {
    // Nothing to prepend: hand the caller's string through exactly as given.
    mergedOptions = extraOptions;
  } else if (hasExtraOptions) {
    mergedOptions = options + ',' + extraOptions;
  } else {
    // Avoid producing "opts,null" or "opts," when there is nothing to append.
    mergedOptions = options;
  }
  return new PTBTokenizer<>(r, factory, mergedOptions);
}
/**
 * Creates a tokenizer over the supplied character stream, configured with this
 * factory's options followed by the caller's extra options.
 *
 * <p>The two option strings are joined with a comma only when both are
 * non-empty. If this factory has no options, {@code extraOptions} is passed
 * through unchanged (including {@code null}). If {@code extraOptions} is
 * {@code null} or empty, only this factory's options are used, so the merged
 * string never contains a literal {@code "null"} entry or a dangling comma.
 *
 * @param r the Reader whose contents are to be tokenized
 * @param extraOptions additional comma-separated options; may be {@code null} or empty
 * @return a new PTBTokenizer built from {@code r}, this factory's token
 *         factory and the merged options string
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  final boolean hasBaseOptions = options != null && !options.isEmpty();
  final boolean hasExtraOptions = extraOptions != null && !extraOptions.isEmpty();

  final String mergedOptions;
  if (!hasBaseOptions) {
    // Nothing to prepend: hand the caller's string through exactly as given.
    mergedOptions = extraOptions;
  } else if (hasExtraOptions) {
    mergedOptions = options + ',' + extraOptions;
  } else {
    // Avoid producing "opts,null" or "opts," when there is nothing to append.
    mergedOptions = options;
  }
  return new PTBTokenizer<T>(r, factory, mergedOptions);
}
@Override public Tokenizer<?> create(final String s) { // TokenizerFactory<CoreLabel> f = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible,ptb3Escaping=false"); return new PTBTokenizer<CoreLabel>(new StringReader(s),new CoreLabelTokenFactory(),"invertible"); } }
/**
 * Builds a PTBTokenizer over the given Reader that emits {@link Word} tokens.
 * The tokenizer is created with an empty options string; carriage returns are
 * treated as normal whitespace.
 *
 * @param r The Reader whose contents will be tokenized
 * @return A PTBTokenizer that tokenizes the stream into objects of type
 *         {@link Word}
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
  final WordTokenFactory wordFactory = new WordTokenFactory();
  final String noOptions = "";
  return new PTBTokenizer<>(r, wordFactory, noOptions);
}
/**
 * Builds a PTBTokenizer over the given Reader that emits {@link Word} tokens.
 * The tokenizer is created with an empty options string; carriage returns are
 * treated as normal whitespace.
 *
 * @param r The Reader whose contents will be tokenized
 * @return A PTBTokenizer that tokenizes the stream into objects of type
 *         {@link Word}
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
  final WordTokenFactory wordFactory = new WordTokenFactory();
  final String noOptions = "";
  return new PTBTokenizer<>(r, wordFactory, noOptions);
}
/**
 * Builds a PTBTokenizer over the given Reader that emits {@link Word} tokens.
 * The tokenizer is created with an empty options string; carriage returns are
 * treated as normal whitespace.
 *
 * @param r The Reader whose contents will be tokenized
 * @return A PTBTokenizer that tokenizes the stream into objects of type
 *         {@link Word}
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
  final WordTokenFactory wordFactory = new WordTokenFactory();
  final String noOptions = "";
  return new PTBTokenizer<Word>(r, wordFactory, noOptions);
}
/**
 * Builds a PTBTokenizer over the given Reader that emits Word tokens and can
 * optionally return newlines as their own tokens. Newline tokens come back as
 * Words whose text is the value of {@code PTBLexer.NEWLINE_TOKEN}.
 *
 * @param r The Reader to read tokens from
 * @param tokenizeNLs Whether to return newlines as separate tokens
 *        (otherwise they normally disappear as whitespace)
 * @return A PTBTokenizer which returns Word tokens
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r, boolean tokenizeNLs) {
  final WordTokenFactory wordFactory = new WordTokenFactory();
  // NOTE(review): the third and fourth constructor arguments are fixed to
  // false; their meaning is defined by a constructor not visible in this chunk.
  return new PTBTokenizer<Word>(r, tokenizeNLs, false, false, wordFactory);
}
@Override public Tokenizer<CoreLabel> getTokenizer(Reader r) { // TODO Auto-generated method stub return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), ""); }
/**
 * Tokenizes the document text of the input CAS and calls
 * {@code replace(begin, end, word)} once per token, in token order.
 *
 * @param aInput the CAS whose document text is tokenized
 * @param aOutput not referenced directly in this method
 *        (NOTE(review): presumably written to by {@code replace}; confirm)
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
  final StringReader documentReader = new StringReader(aInput.getDocumentText());
  final CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
  // "invertible" is requested so that the begin/end positions read below are available.
  final Tokenizer<CoreLabel> tokenizer =
      new PTBTokenizer<CoreLabel>(documentReader, labelFactory, "invertible");

  for (CoreLabel token : tokenizer.tokenize()) {
    final int begin = token.beginPosition();
    final int end = token.endPosition();
    replace(begin, end, token.word());
  }
}
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel)ptbt.next(); Tuple termText = tupleFactory.newTuple(label.toString()); bagOfTokens.add(termText); } return bagOfTokens; } }