/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  // Configure the factory with the standard FTB option string.
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  TokenizerFactory<CoreLabel> factory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  // Apply the AnCora corpus tokenization options.
  factory.setOptions(ANCORA_OPTIONS);
  return factory;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  // Each ATB default option is applied individually.
  for (String atbOption : atbOptions.stringPropertyNames()) {
    factory.setOptions(atbOption);
  }
  return factory;
}
/**
 * Creates an ArabicTokenizer. The default tokenizer
 * is ArabicTokenizer.atbFactory(), which produces the
 * same orthographic normalization as Green and Manning (2010).
 *
 * @return A TokenizerFactory that produces each Arabic token as a CoreLabel,
 *         or {@code null} if the input is already tokenized
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the log message is accurate
    // even when the caller supplied none and the defaults are used.
    String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
    } else {
      // removeSegMarker would strip the very segmentation markers the
      // segmenter is trained to produce, so reject it explicitly.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      appliedOptions = tokenizerOptions;
    }
    tokFactory.setOptions(appliedOptions);
    // BUG FIX: previously logged tokenizerOptions, which prints "null" when
    // the defaults are in effect; report the options that were actually set.
    log.info("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
// Build the default ATB tokenizer, then strip pro-drop markers and
// morpheme-boundary markers from its output.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
// Open the input file as UTF-8 text.
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
// Arabic tokenizer configured from the caller-supplied option string.
TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
tf.setOptions(tokOptions);
// NOTE(review): the setup arguments appear to select features that strip
// segmentation and morpheme markers from UTF-8 text — confirm against
// DefaultLexicalMapper's documentation.
Mapper lexMapper = new DefaultLexicalMapper();
lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
ArabicTokenizer.atbFactory() : ArabicTokenizer.factory(); for (String option : tokenizerOptions.stringPropertyNames()) { tf.setOptions(option); tf.setOptions("tokenizeNLs");
// Always preserve newline tokens: append tokenizeNLs to whatever
// orthographic options were supplied (or use it alone if none were).
orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
tf.setOptions(orthoOptions);
// Apply the accumulated orthographic-normalization options to the tokenizer.
tf.setOptions(orthoOptions);
// Configure the tokenizer factory with the caller-supplied options string.
tokenizerFactory.setOptions(tokenizerOptions);
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = FrenchTokenizerFactory.newTokenizerFactory();
  tokenizerFactory.setOptions(FTB_OPTIONS);
  return tokenizerFactory;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  tokenizerFactory.setOptions(ANCORA_OPTIONS);
  return tokenizerFactory;
}
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  // Obtain a fresh factory and set the standard FTB tokenization options.
  TokenizerFactory<CoreLabel> result = FrenchTokenizerFactory.newTokenizerFactory();
  result.setOptions(FTB_OPTIONS);
  return result;
}
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a TokenizerFactory that produces CoreLabel tokens, configured with
 *         the FTB option string
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  TokenizerFactory<CoreLabel> tf = FrenchTokenizerFactory.newTokenizerFactory();
  tf.setOptions(FTB_OPTIONS);
  return tf;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  // Build a CoreLabel-producing Spanish tokenizer with AnCora options.
  TokenizerFactory<CoreLabel> result = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  result.setOptions(ANCORA_OPTIONS);
  return result;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> result = ArabicTokenizerFactory.newTokenizerFactory();
  // Apply every ATB default option, one property name at a time.
  atbOptions.stringPropertyNames().forEach(result::setOptions);
  return result;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = ArabicTokenizerFactory.newTokenizerFactory();
  for (final String optionName : atbOptions.stringPropertyNames()) {
    tokenizerFactory.setOptions(optionName);
  }
  return tokenizerFactory;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> tf = ArabicTokenizerFactory.newTokenizerFactory();
  // Set each default ATB option on the new factory.
  for (String key : atbOptions.stringPropertyNames()) {
    tf.setOptions(key);
  }
  return tf;
}
/**
 * Creates an ArabicTokenizer. The default tokenizer
 * is ArabicTokenizer.atbFactory(), which produces the
 * same orthographic normalization as Green and Manning (2010).
 *
 * @return A TokenizerFactory that produces each Arabic token as a CoreLabel,
 *         or {@code null} if the input is already tokenized
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the diagnostic below is accurate
    // even when the caller supplied none and the defaults are used.
    String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
    } else {
      // removeSegMarker would strip the very segmentation markers the
      // segmenter is trained to produce, so reject it explicitly.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      appliedOptions = tokenizerOptions;
    }
    tokFactory.setOptions(appliedOptions);
    // BUG FIX: previously printed tokenizerOptions, which is "null" when the
    // defaults are applied; report the options actually set instead.
    // NOTE(review): prefer the class logger over System.err if one is in
    // scope in this file (a sibling variant of this method uses log.info).
    System.err.println("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
/**
 * Looks up the tokenizer factory registered for {@code language} and wraps
 * it in a CoreNLPTokenAnnotator.
 *
 * @param language the source language to tokenize
 * @return an annotator backed by the language's registered tokenizer factory
 * @throws UnsupportedLanguageException if no factory is registered for the language
 */
public static CoreNLPTokenAnnotator forLanguage(Language language) throws UnsupportedLanguageException {
  TokenizerFactory<?> factory = FACTORIES.get(language);
  if (factory == null)
    throw new UnsupportedLanguageException(language);
  // NOTE(review): setOptions mutates the factory instance cached in FACTORIES,
  // so these English options persist across subsequent calls (and threads)
  // that fetch the same factory — confirm that is intended.
  /*sets special options if source language is English*/
  if (Language.ENGLISH.getLanguage().equals(language.getLanguage()))
    factory.setOptions("ptb3Escaping=false,asciiQuotes=true,normalizeSpace=false");
  return new CoreNLPTokenAnnotator(factory);
}