/**
 * Returns a thread-safe tokenizer over the given reader.
 *
 * @param r the source of text to tokenize
 * @return a tokenizer producing {@code CoreLabel} tokens
 */
public Tokenizer<CoreLabel> getTokenizer(Reader r) {
  Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(r);
  return tokenizer;
}
// Build an ATB-scheme Arabic tokenizer and strip pro-drop and morpheme
// markers so lexicon lookups use bare surface forms.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
String word = wordTagPair[0];  // surface form from a word/tag pair
if (tokFactory != null) {  // NOTE(review): always non-null here — atbFactory() was just assigned
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {  // tokenizer deleted the whole token; skip this pair
    continue;
/**
 * Returns a tokenizer factory configured with the Penn Arabic Treebank (ATB)
 * option set held in {@code atbOptions}.
 *
 * @return a factory producing ATB-configured {@code CoreLabel} tokens
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  atbOptions.stringPropertyNames().forEach(factory::setOptions);
  return factory;
}
/**
 * Returns a tokenizer factory that applies AnCora corpus tokenization rules.
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code ANCORA_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ancoraFactory() {
  final TokenizerFactory<CoreLabel> factory = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
  factory.setOptions(ANCORA_OPTIONS);
  return factory;
}
/**
 * Reports whether the tokenizer deletes the given character outright,
 * i.e. tokenizing the one-character string yields no tokens.
 *
 * @param ch the character to probe
 * @param tf the tokenizer factory to test against
 * @return true if tokenizing {@code ch} produces an empty token list
 */
private static boolean isDeletedCharacter(char ch, TokenizerFactory<CoreLabel> tf) {
  String probe = String.valueOf(ch);
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(probe));
  return tokenizer.tokenize().isEmpty();
}
// Open the file for reading as UTF-8.
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
// Arabic tokenizer configured with the caller-supplied option string.
TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
tf.setOptions(tokOptions);
// Lexical mapper set up to strip segmentation and morpheme markers from UTF-8 text.
Mapper lexMapper = new DefaultLexicalMapper();
lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
// Tokenize one line and print it space-joined.
List<CoreLabel> tokenizedLine = tf.getTokenizer(new StringReader(line)).tokenize();
System.out.println(SentenceUtils.listToString(tokenizedLine));
/**
 * Returns a factory for FrenchTokenizer replicating the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code FTB_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. The default implementation runs the current tokenizer over the
 * entire text and adds every resulting word. Subclasses should override this
 * to parse non-standard formats and/or pull out a document title, doing any
 * extra processing and then delegating via {@code super.parse}. The text may
 * be empty ("") but is never null.
 *
 * @param text the document text to tokenize
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  addAll(tokenizerFactory.getTokenizer(new StringReader(text)).tokenize());
}
// Tail of a conditional expression whose test is above this view:
// choose the ATB factory or the plain Arabic factory.
ArabicTokenizer.atbFactory() : ArabicTokenizer.factory();
// Apply each caller-supplied option by name.
for (String option : tokenizerOptions.stringPropertyNames()) {
  tf.setOptions(option);
// Keep newline tokens so line breaks can be reproduced in the output.
tf.setOptions("tokenizeNLs");
try {
  final String encoding = "UTF-8";
  // Tokenize UTF-8 text streamed from stdin.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
  while (tokenizer.hasNext()) {
/**
 * Creates the tokenizer factory used for raw input. The default tokenizer is
 * ArabicTokenizer.atbFactory() with pro-drop, morpheme, and lengthening
 * markers removed, which produces the same orthographic normalization as
 * Green and Manning (2010). If tokenizer options were supplied, a plain
 * ArabicTokenizer.factory() configured with them is used instead.
 *
 * @return a TokenizerFactory producing each Arabic token as a CoreLabel,
 *         or null if the input is already tokenized
 * @throws RuntimeException if the supplied options include 'removeSegMarker'
 */
private TokenizerFactory<CoreLabel> getTokenizerFactory() {
  TokenizerFactory<CoreLabel> tokFactory = null;
  if ( ! isTokenized) {
    // Track the options actually applied so the log line below is accurate:
    // the previous code logged tokenizerOptions, which is null on the
    // default (atbFactory) path and produced "options: null".
    final String appliedOptions;
    if (tokenizerOptions == null) {
      tokFactory = ArabicTokenizer.atbFactory();
      appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
      tokFactory.setOptions(appliedOptions);
    } else {
      // removeSegMarker would delete the segmentation boundaries this
      // segmenter is trying to predict.
      if (tokenizerOptions.contains("removeSegMarker")) {
        throw new RuntimeException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
      }
      tokFactory = ArabicTokenizer.factory();
      tokFactory.setOptions(tokenizerOptions);
      appliedOptions = tokenizerOptions;
    }
    log.info("Loaded ArabicTokenizer with options: " + appliedOptions);
  }
  return tokFactory;
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
tf.setOptions(orthoOptions);
final long startTime = System.nanoTime();  // wall-clock start, presumably for throughput reporting
try {
  // Tokenize text streamed from stdin with the configured encoding.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
  while (tokenizer.hasNext()) {
// Apply the caller-supplied option string to the tokenizer factory.
tokenizerFactory.setOptions(tokenizerOptions);
/**
 * Tokenizes a sentence using the parser's own tokenizer, as provided by the
 * treebank language pack.
 *
 * @param sentence the raw sentence text
 * @return the tokens produced by the language pack's tokenizer
 */
public List<? extends HasWord> tokenize(String sentence) {
  TokenizerFactory<? extends HasWord> factory = treebankLanguagePack().getTokenizerFactory();
  return factory.getTokenizer(new StringReader(sentence)).tokenize();
}
// Force newline tokenization so line boundaries survive tokenization.
orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
tf.setOptions(orthoOptions);
final long startTime = System.nanoTime();  // wall-clock start, presumably for throughput reporting
try {
  // Buffered tokenization of stdin and buffered writing to stdout, both in the configured encoding.
  Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new BufferedReader(new InputStreamReader(System.in, encoding)));
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, encoding));
  boolean printSpace = false;  // presumably a pending-separator flag — loop body not visible here
/**
 * Returns a factory for FrenchTokenizer replicating the tokenization of
 * Green, de Marneffe, and Manning (2011).
 *
 * @return a {@code CoreLabel}-producing factory configured with {@code FTB_OPTIONS}
 */
public static TokenizerFactory<CoreLabel> ftbFactory() {
  final TokenizerFactory<CoreLabel> factory = FrenchTokenizerFactory.newTokenizerFactory();
  factory.setOptions(FTB_OPTIONS);
  return factory;
}
} else {
  // Not the pre-tokenized case: build a tokenizer over the input, keeping
  // newline tokens only when end-of-line is significant to the caller.
  if (eolIsSignificant) {
    tokenizer = tokenizerFactory.getTokenizer(inputReader, "tokenizeNLs");
  } else {
    tokenizer = tokenizerFactory.getTokenizer(inputReader);
// Build an ATB-scheme Arabic tokenizer and strip pro-drop and morpheme
// markers so lexicon lookups use bare surface forms.
TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory();
String atbVocOptions = "removeProMarker,removeMorphMarker";
tokFactory.setOptions(atbVocOptions);
String word = wordTagPair[0];  // surface form from a word/tag pair
if (tokFactory != null) {  // NOTE(review): always non-null here — atbFactory() was just assigned
  List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
  if (lexList.size() == 0) {  // tokenizer deleted the whole token; skip this pair
    continue;
/**
 * Returns a tokenizer factory configured with the Penn Arabic Treebank (ATB)
 * option set held in {@code atbOptions}.
 *
 * @return a factory producing ATB-configured {@code CoreLabel} tokens
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  atbOptions.stringPropertyNames().forEach(factory::setOptions);
  return factory;
}
/** * Tokenizes the highlighted text (using a tokenizer appropriate for the * selected language, and initiates the ParseThread to parse the tokenized * text. */ public void parse() { if (textPane.getText().length() == 0) { return; } // use endIndex+1 because substring subtracts 1 String text = textPane.getText().substring(startIndex, endIndex + 1).trim(); if (parser != null && text.length() > 0) { //Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new CharArrayReader(text.toCharArray())); Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(text)); List<? extends HasWord> wordList = toke.tokenize(); parseThread = new ParseThread(wordList); parseThread.start(); startProgressMonitor("Parsing", PARSE_TIME); } }