Refine search
/** * Creates a the TokenStreamComponents used to analyze the stream. * * @param fieldName the field that this lucene analyzer will process * @return the token stream filter chain */ @Override protected TokenStreamComponents createComponents(String fieldName) { //final Tokenizer source = new AlphaNumericTokenizer(); final Tokenizer source = new WhitespaceTokenizer(); TokenStream stream = source; stream = new UrlTokenizingFilter(stream); stream = new AlphaNumericFilter(stream); stream = new WordDelimiterGraphFilter(stream, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | WordDelimiterGraphFilter.PRESERVE_ORIGINAL | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); stream = new LowerCaseFilter(stream); stream = new StopFilter(stream, stopWords); concatenatingFilter = new TokenPairConcatenatingFilter(stream); return new TokenStreamComponents(source, concatenatingFilter); }
/**
 * Returns the set of stop words being used.
 *
 * @return the set of stop words being used
 */
public static CharArraySet getStopWords() {
    // Start from the project-specific additions (case-insensitive matching),
    // then fold in Lucene's default English stop-word set.
    final CharArraySet stopWords = StopFilter.makeStopSet(ADDITIONAL_STOP_WORDS, true);
    stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return stopWords;
}
/**
 * Builds the Indonesian filter chain: standard filtering, optional
 * lower-casing, optional stop-word removal, and optional keyword marking
 * of the stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, IndonesianAnalyzer.getDefaultStopSet());
    }
    // NOTE(review): when useStem is set, exclusions are marked as keywords but
    // no stemming filter is appended here — confirm that is intentional.
    if (useStem && !stemExclusionSet.isEmpty()) {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    return chain;
}
}
/**
 * Builds the standard analysis chain: standard tokenizer (capped at
 * maxTokenLength), standard filter, lower-casing, then stop-word removal.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    tokenizer.setMaxTokenLength(maxTokenLength);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    return new TokenStreamComponents(tokenizer, chain) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            // Re-apply the analyzer's current max token length every time the
            // reader is swapped, since the setting may change between uses.
            tokenizer.setMaxTokenLength(ReaderStandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
/**
 * Builds an n-gram tokenizer (minNgram..maxNgram) followed by standard
 * filtering, lower-casing, and case-insensitive stop-word removal.
 */
@Override
protected TokenStreamComponents createComponents(final String field) {
    final Tokenizer ngrams = new NGramTokenizer(minNgram(), maxNgram());
    final CharArraySet stopSet = new CharArraySet(asList(stopWords()), true);
    final TokenStream filtered =
            new StopFilter(new LowerCaseFilter(new StandardFilter(ngrams)), stopSet);
    return new TokenStreamComponents(ngrams, filtered);
}
/**
 * Builds the Greek filter chain: standard filtering, optional Greek
 * lower-casing, optional stop-word removal, and optional stemming with
 * keyword protection for the stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new GreekLowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, GreekAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // Excluded terms are marked as keywords so the stemmer skips them.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new GreekStemFilter(chain);
    }
    return chain;
}
}
/** Wraps the incoming stream so the configured stop words are dropped. */
@Override
public TokenStream create(TokenStream input) {
    return new StopFilter(input, stopWords);
}
}
/**
 * Appends a case-sensitive stop filter for the field's configured stop
 * words; fields without an entry pass through unchanged.
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
    if (fieldStopWords == null) {
        return components;
    }
    final StopFilter filtered = new StopFilter(
            components.getTokenStream(), new CharArraySet(fieldStopWords, false));
    return new TokenStreamComponents(components.getTokenizer(), filtered);
}
/**
 * Builds a Jieba-based chain: "other" type uses OtherTokenizer, all other
 * types use SentenceTokenizer; stop-word filtering applies only to
 * non-"other" types with a non-empty stop set.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer =
            type.equals("other") ? new OtherTokenizer() : new SentenceTokenizer();
    TokenStream chain = new JiebaTokenFilter(type, tokenizer);
    if (!type.equals("other") && !stopWords.isEmpty()) {
        chain = new StopFilter(chain, stopWords);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
}
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Builds the Thai filter chain: standard filtering, optional lower-casing,
 * optional stop-word removal, and optional keyword marking of the
 * stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, ThaiAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // NOTE(review): only keyword marking happens here; no stemming filter
        // is appended — confirm that is intentional for Thai.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
    }
    return chain;
}
}
/** {@inheritDoc} */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, stopwords);
    // Snowball stemming for the configured language runs last.
    chain = new SnowballFilter(chain, language);
    return new TokenStreamComponents(tokenizer, chain);
}
}
/**
 * Builds the Irish filter chain: standard filtering, optional Irish
 * lower-casing, optional stop-word removal, and optional Snowball stemming
 * with keyword protection for the stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new IrishLowerCaseFilter(chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, IrishAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // Excluded terms are marked as keywords so the stemmer skips them.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new IrishStemmer());
    }
    return chain;
}
/**
 * Chooses between the standard stop filter and the suggest-oriented
 * variant based on the removeTrailing flag.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return removeTrailing
            ? new StopFilter(tokenStream, stopWords)
            : new SuggestStopFilter(tokenStream, stopWords);
}
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new ChineseTokenStream(reader); if (stopWordManager != null) { //走停止词过滤 CharArraySet stopWords = new CharArraySet(Version.LUCENE_CURRENT, stopWordManager.getStopWords(), true); TokenStream result = new StopFilter(Version.LUCENE_CURRENT, source, stopWords); return new TokenStreamComponents(source, result); } else { //走原始逻辑 return new TokenStreamComponents(source); } }
/**
 * Builds the Persian filter chain: standard filtering, optional
 * lower-casing, optional stop-word removal, and optional keyword marking
 * of the stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, PersianAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // NOTE(review): only keyword marking happens here; no stemming filter
        // is appended — confirm that is intentional for Persian.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
    }
    return chain;
}
}
/**
 * Builds the standard analysis chain: standard tokenizer (capped at
 * maxTokenLength), standard filter, lower-casing, then stop-word removal.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(reader);
    tokenizer.setMaxTokenLength(maxTokenLength);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, stopwords);
    return new TokenStreamComponents(tokenizer, chain) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            // Re-apply the analyzer's current max token length every time the
            // reader is swapped, since the setting may change between uses.
            tokenizer.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
}
/**
 * Builds the CJK filter chain: width filter, bigram filter, then optional
 * lower-casing and stop-word removal. The stem-exclusion set is unused.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new CJKWidthFilter(tokenizer);
    chain = new CJKBigramFilter(chain);
    if (caseInsensitive) {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, CJKAnalyzer.getDefaultStopSet());
    }
    return chain;
}
}
/**
 * Builds the Turkish filter chain: standard filtering, optional Turkish
 * lower-casing, optional stop-word removal, and optional Snowball stemming
 * with keyword protection for the stem-exclusion set.
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new TurkishLowerCaseFilter(chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, TurkishAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // Excluded terms are marked as keywords so the stemmer skips them.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new TurkishStemmer());
    }
    return chain;
}
}
/**
 * Chooses between the standard stop filter and the suggest-oriented
 * variant based on the removeTrailing flag.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return removeTrailing
            ? new StopFilter(tokenStream, stopWords)
            : new SuggestStopFilter(tokenStream, stopWords);
}