/**
 * Builds the analysis chain for a field.
 *
 * <p>The chain starts from a {@link StandardTokenizer} and applies, in order:
 * {@link LowerCaseFilter}, {@link StopFilter} (using {@code stoptable}),
 * {@link SetKeywordMarkerFilter} (only when {@code excltable} is non-empty),
 * {@link StemmerOverrideFilter} (only when {@code stemdict} is non-null), and
 * finally a {@link SnowballFilter} backed by the Snowball Dutch stemmer.
 *
 * @param fieldName name of the field being analyzed; not used by this implementation
 * @return the tokenizer together with the fully filtered {@link TokenStream}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();

    // Lower-casing happens before stop-word removal.
    TokenStream chain = new StopFilter(new LowerCaseFilter(tokenizer), stoptable);

    // Terms in the exclusion set are marked as keywords ahead of the stemming stages.
    if (!excltable.isEmpty()) {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    // The override dictionary is optional and is applied ahead of the Snowball stemmer.
    if (stemdict != null) {
        chain = new StemmerOverrideFilter(chain, stemdict);
    }

    final TokenStream stemmed =
        new SnowballFilter(chain, new org.tartarus.snowball.ext.DutchStemmer());
    return new TokenStreamComponents(tokenizer, stemmed);
}
/**
 * Creates a stemmer by handing a freshly constructed {@code DutchStemmer}
 * to the superclass constructor.
 */
public DutchSnowballStemmer() {
    super(new DutchStemmer());
}

/**
 * Reports the language of this stemmer.
 *
 * @return always {@link Language#DUTCH}
 */
public Language getLanguage() {
    return Language.DUTCH;
}
/**
 * Wraps the given stream in a {@link SnowballFilter} that uses a new
 * {@code DutchStemmer}.
 *
 * @param input the token stream to stem
 * @return the stemming filter around {@code input}
 */
@Override
public TokenStream apply(final TokenStream input) {
    // A new stemmer instance is created on every call, as in the original.
    final DutchStemmer dutchStemmer = new DutchStemmer();
    return new SnowballFilter(input, dutchStemmer);
}
};
/**
 * Wraps the given stream in a {@link SnowballFilter} that uses a new
 * {@code DutchStemmer}.
 *
 * @param tokenStream the token stream to stem
 * @param version     not used by this implementation
 * @return the stemming filter around {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
    final DutchStemmer dutchStemmer = new DutchStemmer();
    return new SnowballFilter(tokenStream, dutchStemmer);
}
},
/**
 * Builds a Dutch stemming chain on top of the given stream: a
 * {@link SetKeywordMarkerFilter} configured with {@code exclusions}, followed
 * by a {@link SnowballFilter} that uses a new {@code DutchStemmer}.
 *
 * @param tokenStream the token stream to stem
 * @return the stemming filter at the end of the chain
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // The keyword marker is applied first, so the stemmer receives the marked stream.
    final TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
    return new SnowballFilter(marked, new DutchStemmer());
}
}
/**
 * Builds a Dutch stemming chain on top of the given stream: a
 * {@link SetKeywordMarkerFilter} configured with {@code exclusions}, followed
 * by a {@link SnowballFilter} that uses a new {@code DutchStemmer}.
 *
 * @param tokenStream the token stream to stem
 * @return the stemming filter at the end of the chain
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // The keyword marker is applied first, so the stemmer receives the marked stream.
    final TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
    return new SnowballFilter(marked, new DutchStemmer());
}
}
/**
 * Builds a Dutch analysis chain on top of the given stream:
 * {@link StandardFilter}, then {@link LowerCaseFilter}, then a
 * {@link SnowballFilter} backed by the Snowball Dutch stemmer.
 *
 * @param result the token stream to wrap
 * @return the fully filtered token stream
 */
public static TokenStream dutch(TokenStream result) {
    // Normalization (standard filter, then lower-casing) runs before stemming.
    final TokenStream normalized = new LowerCaseFilter(new StandardFilter(result));
    return new SnowballFilter(normalized, new org.tartarus.snowball.ext.DutchStemmer());
}
@Override public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) { TokenStream stream = new StandardFilter(matchVersion, tokenizer); if (caseInsensitive) stream = new LowerCaseFilter(matchVersion, stream); if (useStopWords) stream = new StopFilter(matchVersion, stream, DutchAnalyzer.getDefaultStopSet()); if (useStem) { if (!stemExclusionSet.isEmpty()) stream = new SetKeywordMarkerFilter(stream, stemExclusionSet); // stream = new StemmerOverrideFilter(stream, DEFAULT_STEM_DICT); // TODO: Dafuq stream = new SnowballFilter(stream, new DutchStemmer()); } return stream; } }
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER))); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
return new SnowballFilter(tokenStream, new DutchStemmer()); } else if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new KpStemmer());
return new SnowballFilter(tokenStream, new DutchStemmer()); } else if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new KpStemmer());
if (!stemdict.isEmpty()) result = new StemmerOverrideFilter(matchVersion, result, stemdict); result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer()); return new TokenStreamComponents(source, result); } else {
/**
 * Returns a new Snowball stemmer for the given language code.
 *
 * <p>The code is matched case-insensitively. Recognized codes with a stemmer
 * are {@code EN}, {@code PT}, {@code ES}, {@code DE}, {@code FR}, {@code SV},
 * {@code IT}, {@code NL} and {@code RU}. The codes {@code AR}, {@code FA},
 * {@code ZH} and {@code KO} are listed explicitly but yield no stemmer.
 *
 * @param lang the language code, in any letter case
 * @return a new stemmer instance, or {@code null} when the code has no
 *         associated stemmer or is not recognized
 * @throws NullPointerException if {@code lang} is {@code null}
 */
public static SnowballProgram getStemmer(String lang) {
    // Upper-case with Locale.ROOT so matching does not depend on the JVM's
    // default locale; under a Turkish locale "it" would otherwise become
    // "İT" and fail to match "IT".
    switch (lang.toUpperCase(java.util.Locale.ROOT)) {
        case "EN":
            return new EnglishStemmer();
        case "PT":
            return new PortugueseStemmer();
        case "ES":
            return new SpanishStemmer();
        case "DE":
            return new GermanStemmer();
        case "FR":
            return new FrenchStemmer();
        case "SV":
            return new SwedishStemmer();
        case "IT":
            return new ItalianStemmer();
        case "NL":
            return new DutchStemmer();
        case "RU":
            return new RussianStemmer();
        // Listed explicitly, but no stemmer is returned for these codes.
        case "AR":
        case "FA":
        case "ZH":
        case "KO":
            return null;
        default:
            return null;
    }
}
}