/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
  TokenStream result = new StandardTokenizer(reader);
  result = new StandardFilter(result);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopSet);
  return result;
}
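A minimal consumption sketch for a chain like the one above. It assumes a hypothetical MyAnalyzer class wrapping this tokenStream(...) method and that stopSet holds the default English stop words; the reset()/incrementToken()/end()/close() sequence is the standard protocol for draining a Lucene TokenStream.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrintTokens {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new MyAnalyzer(); // hypothetical: wraps the chain above
    TokenStream ts = analyzer.tokenStream("body", new StringReader("The Quick BROWN fox"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // quick, brown, fox ("the" removed, assuming English stop words)
    }
    ts.end();
    ts.close();
  }
}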
public TokenStream tokenStream(final String fieldName, final Reader reader) {
  return new StandardFilter(new StandardTokenizer(LuceneVersion.get(), reader));
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  }
  result = new SnowballFilter(result, new LithuanianStemmer());
  return new TokenStreamComponents(source, result);
}
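If this is the stock LithuanianAnalyzer, the stem exclusion set is supplied at construction time; terms in it are tagged by SetKeywordMarkerFilter so SnowballFilter leaves them unstemmed. A hedged construction sketch (the example term is invented, and CharArraySet moved from org.apache.lucene.analysis.util to org.apache.lucene.analysis in later releases):

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

// Build a case-insensitive exclusion set and hand it to the analyzer.
CharArraySet exclusions = new CharArraySet(Arrays.asList("vilnius"), true);
Analyzer lt = new LithuanianAnalyzer(LithuanianAnalyzer.getDefaultStopSet(), exclusions);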
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
  // Tokenizer tokenStream = new KeywordTokenizer(reader);
  Tokenizer tokenStream = new StandardTokenizer(Version.LUCENE_36, reader);
  TokenStream result = new StandardFilter(Version.LUCENE_36, tokenStream);
  result = new LowerCaseFilter(Version.LUCENE_36, result);
  return result;
}
/**
 * Reproduces the behavior of the org.eaglei.solr.AutoSuggestQueryAnalyzer class.
 */
@Override
public TokenStream create(final TokenStream tokenStream) {
  TokenStream result = new StandardFilter(Version.LUCENE_36, tokenStream);
  result = new LowerCaseFilter(Version.LUCENE_36, result);
  return result;
}
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  SavedStreams streams = (SavedStreams) getPreviousTokenStream();
  if (streams == null) {
    streams = new SavedStreams();
    setPreviousTokenStream(streams);
    streams.tokenStream = new StandardTokenizer(reader);
    streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
    streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
    streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
  } else {
    streams.tokenStream.reset(reader);
  }
  streams.tokenStream.setMaxTokenLength(maxTokenLength);
  streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
  return streams.filteredTokenStream;
}
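The snippet assumes a small holder class whose shape can be inferred from the assignments above; a sketch (the exact modifiers are an assumption beyond what the snippet shows):

// Per-thread cache entry used by reusableTokenStream(...): the tokenizer is
// re-targeted at a new Reader via reset(reader), and the filter chain already
// built on top of it is reused as-is.
private static final class SavedStreams {
  StandardTokenizer tokenStream;    // the source tokenizer
  TokenStream filteredTokenStream;  // the end of the filter chain
}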
/**
 * Creates a {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, a hyphenation {@link StopFilter},
 *         {@link ElisionFilter}, {@link IrishLowerCaseFilter}, {@link StopFilter},
 *         {@link KeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer source = new StandardTokenizer(matchVersion, reader);
  TokenStream result = new StandardFilter(matchVersion, source);
  StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
  s.setEnablePositionIncrements(false);
  result = s;
  result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
  result = new IrishLowerCaseFilter(result);
  result = new StopFilter(matchVersion, result, stopwords);
  if (!stemExclusionSet.isEmpty()) {
    result = new KeywordMarkerFilter(result, stemExclusionSet);
  }
  result = new SnowballFilter(result, new IrishStemmer());
  return new TokenStreamComponents(source, result);
}
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
  TokenStream stream = new StandardFilter(matchVersion, tokenizer);
  if (caseInsensitive) {
    stream = new IrishLowerCaseFilter(stream);
  }
  if (useStopWords) {
    stream = new StopFilter(matchVersion, stream, IrishAnalyzer.getDefaultStopSet());
  }
  if (useStem) {
    if (!stemExclusionSet.isEmpty()) {
      stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    stream = new SnowballFilter(stream, new IrishStemmer());
  }
  return stream;
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
  TokenStream result = new ClassicTokenizer(Version.LUCENE_36, reader);
  result = new StandardFilter(Version.LUCENE_36, result);
  result = new LowerCaseFilter(Version.LUCENE_36, result);
  result = new StopFilter(Version.LUCENE_36, result, DEFAULT_STOP_SET);
  result = new ASCIIFoldingFilter(result);
  return result;
}
public static TokenStream dutch(TokenStream result) {
  result = new StandardFilter(result);
  result = new LowerCaseFilter(result);
  result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
  return result;
}
@Override
public StandardFilter create(TokenStream input) {
  return new StandardFilter(input);
}
/**
 * Creates {@link TokenStreamComponents} used to tokenize all the text in the
 * provided {@link Reader}.
 *
 * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
 *         filtered with {@link StandardFilter} and {@link GermanNormalizationFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer source = new StandardTokenizer(reader);
  TokenStream result = new StandardFilter(source);
  result = new GermanNormalizationFilter(result);
  return new TokenStreamComponents(source, result);
}
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source;
  if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
    source = new StandardTokenizer();
  } else {
    source = new StandardTokenizer40();
  }
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  }
  result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
  return new TokenStreamComponents(source, result);
}
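A hedged wiring sketch, assuming this is the stock RussianAnalyzer on a 5.x-era API (in 4.x, IndexWriterConfig also took a Version argument):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

static IndexWriter openWriter() throws IOException {
  Directory dir = new RAMDirectory(); // in-memory; use FSDirectory.open(path) for on-disk
  Analyzer ru = new RussianAnalyzer();
  return new IndexWriter(dir, new IndexWriterConfig(ru));
}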
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
  Tokenizer tokenStream = new StandardTokenizer(Version.LUCENE_36, reader);
  // May want to use KeywordTokenizer for a more standard "single token" auto-suggest:
  // Tokenizer tokenStream = new KeywordTokenizer(reader);
  TokenStream result = new StandardFilter(Version.LUCENE_36, tokenStream);
  result = new LowerCaseFilter(Version.LUCENE_36, result);
  result = new EdgeNGramTokenFilter(result, EdgeNGramTokenFilter.Side.FRONT, 1, 20);
  return result;
}
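What this buys for auto-suggest: the front edge n-grams of each lower-cased token become indexable prefixes, so a query term matches any token it is a prefix of. A hedged illustration against the same 3.6-era API (input text is invented):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

static void printEdgeGrams() throws IOException {
  TokenStream ts = new LowerCaseFilter(Version.LUCENE_36,
      new KeywordTokenizer(new StringReader("Dublin")));
  ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 20);
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // d, du, dub, dubl, dubli, dublin
  }
  ts.end();
  ts.close();
}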
protected TokenStream normalize(String fieldName, TokenStream in) {
  TokenStream result = new StandardFilter(in);
  result = new LowerCaseFilter(result);
  return result;
}
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
  TokenStream stream = new StandardFilter(matchVersion, tokenizer);
  if (caseInsensitive) {
    stream = new LowerCaseFilter(matchVersion, stream);
  }
  if (useStopWords) {
    stream = new StopFilter(matchVersion, stream, ArmenianAnalyzer.getDefaultStopSet());
  }
  if (useStem) {
    if (!stemExclusionSet.isEmpty()) {
      stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    stream = new SnowballFilter(stream, new ArmenianStemmer());
  }
  return stream;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
  final Tokenizer source = new KeywordTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new CharacterFilter(result);
  result = new ASCIIFoldingFilter(result);
  result = new LowerCaseFilter(result);
  // result = new WordDelimiterFilter(result, WordDelimiterFilter.DIGIT, null);
  return new TokenStreamComponents(source, result);
}
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
  StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
  tokenStream.setMaxTokenLength(maxTokenLength);
  TokenStream result = new StandardFilter(tokenStream);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopSet);
  return result;
}
/** {@inheritDoc} */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  result = new SnowballFilter(result, language);
  return new TokenStreamComponents(source, result);
}
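SnowballFilter also has a constructor taking the Snowball stemmer name as a String, which is presumably what `language` holds here (the String constructor loads org.tartarus.snowball.ext.&lt;Name&gt;Stemmer reflectively). A hedged sketch fixing the language to "English", using 5.x-era packages:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

Analyzer english = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new SnowballFilter(result, "English"); // resolves to EnglishStemmer
    return new TokenStreamComponents(source, result);
  }
};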