/**
 * Returns a thread-safe tokenizer over the given reader.
 *
 * @param r the source of text to tokenize
 * @return a tokenizer producing {@code CoreLabel} tokens
 */
public Tokenizer<CoreLabel> getTokenizer(Reader r) {
  Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(r);
  return tokenizer;
}
// Build an ATB-scheme Arabic tokenizer and strip pro-drop and morpheme
// markers so lexicon lookups use bare surface forms.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
String word = wordTagPair[0];  // surface form from a word/tag pair
if (tokFactory != null) {  // NOTE(review): always non-null here — atbFactory() was just assigned
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {  // tokenizer deleted the whole token; skip this pair
    continue;
/**
 * Returns a tokenizer factory configured with the Penn Arabic Treebank (ATB)
 * option set held in {@code atbOptions}.
 *
 * @return a factory producing ATB-configured {@code CoreLabel} tokens
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  atbOptions.stringPropertyNames().forEach(factory::setOptions);
  return factory;
}
/**
 * Returns a tokenizer factory that applies AnCora corpus tokenization rules.
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code ANCORA_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  final TokenizerFactory<CoreLabel> factory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  factory.setOptions(ANCORA_OPTIONS);
  return factory;
}
/**
 * Reports whether the tokenizer deletes the given character outright,
 * i.e. tokenizing the one-character string yields no tokens.
 *
 * @param ch the character to probe
 * @param tf the tokenizer factory to test against
 * @return true if tokenizing {@code ch} produces an empty token list
 */
private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
  String probe = String.valueOf(ch);
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(probe));
  return tokenizer.tokenize().isEmpty();
}
// Open the file for reading as UTF-8.
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
// Arabic tokenizer configured with the caller-supplied option string.
TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
tf.setOptions(tokOptions);
// Lexical mapper set up to strip segmentation and morpheme markers from UTF-8 text.
Mapper lexMapper = new DefaultLexicalMapper();
lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
// Tokenize one line and print it space-joined.
List<CoreLabel> tokenizedLine = tf.getTokenizer(new StringReader(line)).tokenize();
System.out.println(SentenceUtils.listToString(tokenizedLine));
/**
 * Returns a factory for FrenchTokenizer replicating the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code FTB_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. The default implementation runs the current tokenizer over the
 * entire text and adds every resulting word. Subclasses should override this
 * to parse non-standard formats and/or pull out a document title, doing any
 * extra processing and then delegating via {@code super.parse}. The text may
 * be empty ("") but is never null.
 *
 * @param text the document text to tokenize
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  addAll(tokenizerFactory.getTokenizer(new StringReader(text)).tokenize());
}
// Tail of a conditional expression whose test is above this view:
// choose the ATB factory or the plain Arabic factory.
ArabicTokenizer.atbFactory() : ArabicTokenizer.factory();
// Apply each caller-supplied option by name.
for (String option : tokenizerOptions.stringPropertyNames()) {
  tf.setOptions(option);
// Keep newline tokens so line breaks can be reproduced in the output.
tf.setOptions("tokenizeNLs");
try {
  final String encoding = "UTF-8";
  // Tokenize UTF-8 text streamed from stdin.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
  while (tokenizer.hasNext()) {
/**
 * Creates the tokenizer factory used for raw input. The default tokenizer is
 * ArabicTokenizer.atbFactory() with pro-drop, morpheme, and lengthening
 * markers removed, which produces the same orthographic normalization as
 * Green and Manning (2010). If tokenizer options were supplied, a plain
 * ArabicTokenizer.factory() configured with them is used instead.
 *
 * @return a TokenizerFactory producing each Arabic token as a CoreLabel,
 *         or null if the input is already tokenized
 * @throws RuntimeException if the supplied options include 'removeSegMarker'
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the log line below is accurate:
    // the previous code logged tokenizerOptions, which is null on the
    // default (atbFactory) path and produced "options: null".
    final String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
      tokFactory.setOptions(appliedOptions);
    } else {
      // removeSegMarker would delete the segmentation boundaries this
      // segmenter is trying to predict.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      tokFactory.setOptions(tokenizerOptions);
      appliedOptions = tokenizerOptions;
    }
    log.info("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
tf.setOptions(orthoOptions);
final long startTime = System.nanoTime();  // wall-clock start, presumably for throughput reporting
try {
  // Tokenize text streamed from stdin with the configured encoding.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
  while (tokenizer.hasNext()) {
// Apply the caller-supplied option string to the tokenizer factory.
tokenizerFactory.setOptions(tokenizerOptions);
/**
 * Tokenizes a sentence using the parser's own tokenizer, as provided by the
 * treebank language pack.
 *
 * @param sentence the raw sentence text
 * @return the tokens produced by the language pack's tokenizer
 */
public List<? extends HasWord> tokenize(String sentence) {
  TokenizerFactory<? extends HasWord> factory = treebankLanguagePack().getTokenizerFactory();
  return factory.getTokenizer(new StringReader(sentence)).tokenize();
}
// Force newline tokenization so line boundaries survive tokenization.
orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
tf.setOptions(orthoOptions);
final long startTime = System.nanoTime();  // wall-clock start, presumably for throughput reporting
try {
  // Buffered tokenization of stdin and buffered writing to stdout, both in the configured encoding.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new BufferedReader(new InputStreamReader(System.in, encoding)));
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
/**
 * Returns a factory for FrenchTokenizer replicating the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code FTB_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
} else {
  // Not the pre-tokenized case: build a tokenizer over the input, keeping
  // newline tokens only when end-of-line is significant to the caller.
  if (eolIsSignificant) {
    tokenizer = tokenizerFactory.getTokenizer(inputReader, "tokenizeNLs");
  } else {
    tokenizer = tokenizerFactory.getTokenizer(inputReader);
// Build an ATB-scheme Arabic tokenizer and strip pro-drop and morpheme
// markers so lexicon lookups use bare surface forms.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
String word = wordTagPair[0];  // surface form from a word/tag pair
if (tokFactory != null) {  // NOTE(review): always non-null here — atbFactory() was just assigned
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {  // tokenizer deleted the whole token; skip this pair
    continue;
/**
 * Returns a tokenizer factory configured with the Penn Arabic Treebank (ATB)
 * option set held in {@code atbOptions}.
 *
 * @return a factory producing ATB-configured {@code CoreLabel} tokens
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  atbOptions.stringPropertyNames().forEach(factory::setOptions);
  return factory;
}
/** * Tokenizes the highlighted text (using a tokenizer appropriate for the * selected language, and initiates the ParseThread to parse the tokenized * text. */ public void parse() { if (textPane.getText().length() == 0) { return; } // use endIndex+1 because substring subtracts 1 String text = textPane.getText().substring(startIndex, endIndex + 1).trim(); if (parser != null && text.length() > 0) { //Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new CharArrayReader(text.toCharArray())); Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(text)); List<? extends HasWord> wordList = toke.tokenize(); parseThread = new ParseThread(wordList); parseThread.start(); startProgressMonitor("Parsing", PARSE_TIME); } }