@Override public Tokenizer create() { return new JapaneseTokenizer(null, true, Mode.SEARCH); } }));
@Override
public Tokenizer create() {
  final JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discardPunctuation, mode);
  int nBestCost = this.nBestCost;
  if (nBestExamples != null) {
    // make sure the n-best cost is high enough to cover the example segmentations
    nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
  }
  t.setNBestCost(nBestCost);
  return t;
}
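For orientation, the factory snippets here all hand back a tokenizer that is consumed through the standard reset/incrementToken/end/close lifecycle. Below is a minimal, self-contained sketch of driving a JapaneseTokenizer directly with the same three-argument constructor; the class name and sample text are illustrative assumptions, not taken from the original code.

import java.io.StringReader;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizeSketch {
  public static void main(String[] args) throws Exception {
    // no user dictionary, discard punctuation, decompound long nouns (SEARCH mode)
    JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.SEARCH);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.setReader(new StringReader("関西国際空港"));
    tokenizer.reset();                      // required before the first incrementToken()
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());  // one surface form per token
    }
    tokenizer.end();
    tokenizer.close();
  }
}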
@Override public void close() throws IOException { tokenizer.close(); }
setReader(new StringReader(inText));
reset();
try {
  setNBestCost(1);
  int prevRootBase = -1;
  while (incrementToken()) {
    if (lattice.rootBase != prevRootBase) {
      prevRootBase = lattice.rootBase;
      end();
      close();
      setNBestCost(saveNBestCost);
@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) throws IOException {
  if (stemsAllowed) {
    // Blank out tags when stemming only
    strOrig = blankOutTags(strOrig);
    CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
    Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.emptySet();
    return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags).tokenStream("", new StringReader(strOrig));
  } else {
    JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
    tokenizer.setReader(new StringReader(strOrig));
    return new TagJoiningFilter(tokenizer);
  }
}
parse();
clearAttributes();
assert length > 0;
offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
@Override
public boolean incrementToken() throws IOException {
  Token token = tagger.next();
  if (token == null) {
    return false;
  } else {
    clearAttributes();
    final Morpheme m = token.getMorpheme();
    // note, unlike the previous implementation, we set the surface form
    termAtt.setEmpty().append(token.getSurface());
    final int cost = token.getCost();
    if (token.isSentenceStart()) {
      accumulatedCost = 0;
      sentenceAtt.setSentenceStart(true);
    }
    costAtt.setCost(cost - accumulatedCost);
    accumulatedCost = cost;
    basicFormAtt.setMorpheme(m);
    conjugationAtt.setMorpheme(m);
    partOfSpeechAtt.setMorpheme(m);
    pronunciationsAtt.setMorpheme(m);
    readingsAtt.setMorpheme(m);
    offsetAtt.setOffset(correctOffset(token.getStart()), correctOffset(token.end()));
    return true;
  }
}
@Override
public void end() throws IOException {
  super.end();
  // Set final offset
  int finalOffset = correctOffset(pos);
  offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override public void end() throws IOException { tokenizer.end(); }
@Override public boolean incrementToken() throws IOException { return tokenizer.incrementToken(); }
@Override
public void end() throws IOException {
  // set final offset
  final int finalOffset = correctOffset(tagger.end());
  offsetAtt.setOffset(finalOffset, finalOffset);
}
}
public Tokenizer create(Reader reader) { return new JapaneseTokenizer(reader, compositeTokenFilter, dictionaryDir); } }
@Override
public JapaneseTokenizer create(AttributeFactory factory) {
  JapaneseTokenizer t = new JapaneseTokenizer(factory, userDictionary, discardPunctuation, mode);
  if (nbestExamples != null) {
    nbestCost = Math.max(nbestCost, t.calcNBestCost(nbestExamples));
  }
  t.setNBestCost(nbestCost);
  return t;
}
}
@Override public Tokenizer create(Reader reader) { return new JapaneseTokenizer(reader, userDictionary, discardPunctuation, mode); }
@Override public Tokenizer create(Reader reader) { return new JapaneseTokenizer(reader, null, true, Mode.SEARCH); } }));
@Override public Tokenizer makeTokenizer(Reader r) { return new org.apache.lucene.analysis.ja.JapaneseTokenizer(r, null, false, org.apache.lucene.analysis.ja.JapaneseTokenizer.DEFAULT_MODE); }
@Override public Tokenizer create() { return new JapaneseTokenizer(userDictionary, discardPunctuation, mode); }
TokenizerWrapper() {
  super();
  tokenizerTimestamp = dictionaryTimestamp;
  tokenizer = new JapaneseTokenizer(userDictionary, discardPunctuation, mode);
  try {
    // copy the wrapped tokenizer's AttributeSource state into this wrapper via reflection
    final Field attributesField = getAccessibleField(AttributeSource.class, "attributes");
    final Object attributesObj = attributesField.get(tokenizer);
    attributesField.set(this, attributesObj);
    final Field attributeImplsField = getAccessibleField(AttributeSource.class, "attributeImpls");
    final Object attributeImplsObj = attributeImplsField.get(tokenizer);
    attributeImplsField.set(this, attributeImplsObj);
    final Field currentStateField = getAccessibleField(AttributeSource.class, "currentState");
    final Object currentStateObj = currentStateField.get(tokenizer);
    currentStateField.set(this, currentStateObj);
  } catch (final Exception e) {
    throw new IllegalStateException("Failed to update the tokenizer.", e);
  }
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, mode);
  TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
  stream = new JapanesePartOfSpeechStopFilter(stream, stoptags);
  stream = new CJKWidthFilter(stream);
  stream = new StopFilter(stream, stopwords);
  stream = new JapaneseKatakanaStemFilter(stream);
  stream = new LowerCaseFilter(stream);
  return new TokenStreamComponents(tokenizer, stream);
}
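As a companion to the analyzer chain above, here is a minimal consumption sketch using the stock JapaneseAnalyzer defaults and the standard TokenStream lifecycle; the field name, sample sentence, and class name are illustrative assumptions, not taken from the original source.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AnalyzeSketch {
  public static void main(String[] args) throws Exception {
    try (Analyzer analyzer = new JapaneseAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "東京スカイツリーに行った")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // the base-form filter in the chain emits conjugated verbs in dictionary form
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end();
    }
  }
}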