/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  // Configure the factory with the standard FTB option string.
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  TokenizerFactory<CoreLabel> factory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  // Apply the AnCora corpus tokenization options.
  factory.setOptions(ANCORA_OPTIONS);
  return factory;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  // Each ATB default option is applied individually.
  for (String atbOption : atbOptions.stringPropertyNames()) {
    factory.setOptions(atbOption);
  }
  return factory;
}
/**
 * Creates an ArabicTokenizer. The default tokenizer
 * is ArabicTokenizer.atbFactory(), which produces the
 * same orthographic normalization as Green and Manning (2010).
 *
 * @return A TokenizerFactory that produces each Arabic token as a CoreLabel,
 *         or {@code null} if the input is already tokenized
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the log message is accurate
    // even when the caller supplied none and the defaults are used.
    String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
    } else {
      // removeSegMarker would strip the very segmentation markers the
      // segmenter is trained to produce, so reject it explicitly.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      appliedOptions = tokenizerOptions;
    }
    tokFactory.setOptions(appliedOptions);
    // BUG FIX: previously logged tokenizerOptions, which prints "null" when
    // the defaults are in effect; report the options that were actually set.
    log.info("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
// Build the default ATB tokenizer, then strip pro-drop markers and
// morpheme-boundary markers from its output.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
// Open the input file as UTF-8 text.
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
// Arabic tokenizer configured from the caller-supplied option string.
TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
tf.setOptions(tokOptions);
// NOTE(review): the setup arguments appear to select features that strip
// segmentation and morpheme markers from UTF-8 text — confirm against
// DefaultLexicalMapper's documentation.
Mapper lexMapper = new DefaultLexicalMapper();
lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
ArabicTokenizer.atbFactory() : ArabicTokenizer.factory(); for (String option : tokenizerOptions.stringPropertyNames()) { tf.setOptions(option); tf.setOptions("tokenizeNLs");
// Always preserve newline tokens: append tokenizeNLs to whatever
// orthographic options were supplied (or use it alone if none were).
orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
tf.setOptions(orthoOptions);
// Apply the accumulated orthographic-normalization options to the tokenizer.
tf.setOptions(orthoOptions);
// Configure the tokenizer factory with the caller-supplied options string.
tokenizerFactory.setOptions(tokenizerOptions);
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = FrenchTokenizerFactory.newTokenizerFactory();
  tokenizerFactory.setOptions(FTB_OPTIONS);
  return tokenizerFactory;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  tokenizerFactory.setOptions(ANCORA_OPTIONS);
  return tokenizerFactory;
}
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  // Obtain a fresh factory and set the standard FTB tokenization options.
  TokenizerFactory<CoreLabel> result = FrenchTokenizerFactory.newTokenizerFactory();
  result.setOptions(FTB_OPTIONS);
  return result;
}
/**
 * Returns a factory for FrenchTokenizer that replicates the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a TokenizerFactory that produces CoreLabel tokens, configured with
 *         the FTB option string
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  TokenizerFactory<CoreLabel> tf = FrenchTokenizerFactory.newTokenizerFactory();
  tf.setOptions(FTB_OPTIONS);
  return tf;
}
/**
 * Returns a tokenizer with Ancora tokenization.
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  // Build a CoreLabel-producing Spanish tokenizer with AnCora options.
  TokenizerFactory<CoreLabel> result = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  result.setOptions(ANCORA_OPTIONS);
  return result;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> result = ArabicTokenizerFactory.newTokenizerFactory();
  // Apply every ATB default option, one property name at a time.
  atbOptions.stringPropertyNames().forEach(result::setOptions);
  return result;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> tokenizerFactory = ArabicTokenizerFactory.newTokenizerFactory();
  for (final String optionName : atbOptions.stringPropertyNames()) {
    tokenizerFactory.setOptions(optionName);
  }
  return tokenizerFactory;
}
/**
 * Returns a tokenizer factory configured with the default ATB options.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> tf = ArabicTokenizerFactory.newTokenizerFactory();
  // Set each default ATB option on the new factory.
  for (String key : atbOptions.stringPropertyNames()) {
    tf.setOptions(key);
  }
  return tf;
}
/**
 * Creates an ArabicTokenizer. The default tokenizer
 * is ArabicTokenizer.atbFactory(), which produces the
 * same orthographic normalization as Green and Manning (2010).
 *
 * @return A TokenizerFactory that produces each Arabic token as a CoreLabel,
 *         or {@code null} if the input is already tokenized
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the diagnostic below is accurate
    // even when the caller supplied none and the defaults are used.
    String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
    } else {
      // removeSegMarker would strip the very segmentation markers the
      // segmenter is trained to produce, so reject it explicitly.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      appliedOptions = tokenizerOptions;
    }
    tokFactory.setOptions(appliedOptions);
    // BUG FIX: previously printed tokenizerOptions, which is "null" when the
    // defaults are applied; report the options actually set instead.
    // NOTE(review): prefer the class logger over System.err if one is in
    // scope in this file (a sibling variant of this method uses log.info).
    System.err.println("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
/**
 * Looks up the tokenizer factory registered for {@code language} and wraps
 * it in a CoreNLPTokenAnnotator.
 *
 * @param language the source language to tokenize
 * @return an annotator backed by the language's registered tokenizer factory
 * @throws UnsupportedLanguageException if no factory is registered for the language
 */
public static CoreNLPTokenAnnotator forLanguage(Language language) throws UnsupportedLanguageException {
  TokenizerFactory<?> factory = FACTORIES.get(language);
  if (factory == null)
    throw new UnsupportedLanguageException(language);
  // NOTE(review): setOptions mutates the factory instance cached in FACTORIES,
  // so these English options persist across subsequent calls (and threads)
  // that fetch the same factory — confirm that is intended.
  /*sets special options if source language is English*/
  if (Language.ENGLISH.getLanguage().equals(language.getLanguage()))
    factory.setOptions("ptb3Escaping=false,asciiQuotes=true,normalizeSpace=false");
  return new CoreNLPTokenAnnotator(factory);
}