/** Wraps the incoming stream in a {@link JapaneseKatakanaStemFilter} configured with this factory's minimum length. */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream stemmed = new JapaneseKatakanaStemFilter(tokenStream, minimumLength);
  return stemmed;
}
}
/**
 * Returns the stemmed length of {@code term}: drops a trailing prolonged-sound
 * mark from katakana terms that meet the minimum length, otherwise returns
 * {@code length} unchanged.
 */
private int stem(char[] term, int length) {
  // Short terms and non-katakana terms are left untouched.
  if (length < minimumKatakanaLength || !isKatakana(term, length)) {
    return length;
  }
  final boolean endsWithProlongedMark =
      term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK;
  return endsWithProlongedMark ? length - 1 : length;
}
/** Advances to the next token, stemming its term text unless it is marked as a keyword. */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Keywords are protected from stemming.
  if (!keywordAttr.isKeyword()) {
    termAttr.setLength(stem(termAttr.buffer(), termAttr.length()));
  }
  return true;
}
/** Returns the next input Token, after being stemmed. */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Keywords are never stemmed.
  if (!keywordAtt.isKeyword()) {
    final char[] buffer = termAtt.buffer();
    final int length = termAtt.length();
    // Only trim the prolonged-sound mark from katakana terms longer than 3 chars.
    final boolean trimMark =
        length > 3
            && buffer[length - 1] == KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK
            && isKatakanaString(buffer, length);
    if (trimMark) {
      termAtt.setLength(length - 1);
    }
  }
  return true;
}
/** Builds a {@link JapaneseKatakanaStemFilter} over {@code tokenStream} with the configured minimum length. */
@Override
public TokenStream create(final TokenStream tokenStream) {
  return new JapaneseKatakanaStemFilter(tokenStream, minimumLength);
}
}
/** Produces a {@link JapaneseKatakanaStemFilter}; no explicit minimum length is supplied, so the filter's default applies. */
@Override
public TokenStream create(TokenStream tokenStream) {
  return new JapaneseKatakanaStemFilter(tokenStream);
}
}));
/** Creates the katakana stemming filter for this factory using its configured minimum length. */
@Override
public TokenStream create(TokenStream tokenStream) {
  final int min = minimumLength;
  return new JapaneseKatakanaStemFilter(tokenStream, min);
}
}
/** Attaches a {@link JapaneseKatakanaStemFilter} (with this factory's minimum length) to {@code input}. */
@Override
public TokenStream create(TokenStream input) {
  final JapaneseKatakanaStemFilter filter =
      new JapaneseKatakanaStemFilter(input, minimumLength);
  return filter;
}
}
/** Wraps {@code tokenStream} in a {@link JapaneseKatakanaStemFilter} built with its single-argument constructor. */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream result = new JapaneseKatakanaStemFilter(tokenStream);
  return result;
}
}));
/** Chains a katakana stemming filter, parameterized by this factory's minimum length, onto the stream. */
@Override
public TokenStream create(final TokenStream tokenStream) {
  final TokenStream stemmed =
      new JapaneseKatakanaStemFilter(tokenStream, minimumLength);
  return stemmed;
}
}
/**
 * Wraps {@code stream} in a {@link JapaneseKatakanaStemFilter}. No explicit
 * minimum length is passed, so the filter's default applies.
 */
@Override // added: every sibling factory's create(TokenStream) carries @Override
public TokenStream create(TokenStream stream) {
  return new JapaneseKatakanaStemFilter(stream);
}
}
/**
 * Builds the analysis chain: JapaneseTokenizer → base form → part-of-speech
 * stop → CJK width → stop words → katakana stem → lower case.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new JapaneseTokenizer(userDict, true, mode);
  // Filter order is significant; it matches the original chain exactly.
  TokenStream result = new JapaneseBaseFormFilter(source);
  result = new JapanesePartOfSpeechStopFilter(result, stoptags);
  result = new CJKWidthFilter(result);
  result = new StopFilter(result, stopwords);
  result = new JapaneseKatakanaStemFilter(result);
  result = new LowerCaseFilter(result);
  return new TokenStreamComponents(source, result);
}
/**
 * Assembles a configurable Japanese filter chain over {@code tokenizer}:
 * base form and CJK width always; lower-casing, stop-word removal, and
 * katakana stemming only when the corresponding flags are enabled.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
  TokenStream result = new JapaneseBaseFormFilter(tokenizer);
  result = new CJKWidthFilter(result);
  if (caseInsensitive) {
    result = new LowerCaseFilter(matchVersion, result);
  }
  if (useStopWords) {
    result = new JapanesePartOfSpeechStopFilter(true, result, JapaneseAnalyzer.getDefaultStopTags());
    result = new StopFilter(matchVersion, result, JapaneseAnalyzer.getDefaultStopSet());
  }
  if (useStem) {
    result = new JapaneseKatakanaStemFilter(result);
  }
  return result;
}
}
/**
 * Creates
 * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from a {@link JapaneseTokenizer} filtered with
 *         {@link JapaneseWidthFilter}, {@link JapanesePunctuationFilter},
 *         {@link JapanesePartOfSpeechStopFilter}, {@link StopFilter},
 *         {@link KeywordMarkerFilter} if a stem exclusion set is provided,
 *         {@link JapaneseBasicFormFilter}, {@link JapaneseKatakanaStemFilter},
 *         and {@link LowerCaseFilter}
 */
@Override
protected TokenStreamComponents createComponents(String field, Reader reader) {
  Tokenizer tokenizer = new JapaneseTokenizer(reader, null, dictionaryDir);
  TokenStream stream = new JapaneseWidthFilter(tokenizer);
  stream = new JapanesePunctuationFilter(true, stream);
  stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
  stream = new StopFilter(matchVersion, stream, stopwords);
  // Protect terms in the exclusion set from downstream stemming.
  if (!stemExclusionSet.isEmpty())
    stream = new KeywordMarkerFilter(stream, stemExclusionSet);
  stream = new JapaneseBasicFormFilter(stream);
  stream = new JapaneseKatakanaStemFilter(stream);
  stream = new LowerCaseFilter(matchVersion, stream);
  return new TokenStreamComponents(tokenizer, stream);
}
}