/**
 * Builds a {@link CharArraySet} containing the supplied stopwords.
 *
 * @param stopWords the words to place into the set
 * @param ignoreCase if true, the set matches case-insensitively
 * @return a set holding every supplied word
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
  final CharArraySet result = new CharArraySet(stopWords.length, ignoreCase);
  for (String word : stopWords) {
    result.add(word);
  }
  return result;
}
/**
 * Accepts only tokens whose term text is absent from the stopword set.
 */
@Override
protected boolean accept() {
  final char[] termText = termAtt.buffer();
  final int termLength = termAtt.length();
  return stopWords.contains(termText, 0, termLength) == false;
}
/**
 * Creates a new instance initialized with the given stopword set.
 *
 * @param stopwords the analyzer's stopword set; may be {@code null}, in which
 *        case the empty set is used
 */
protected StopwordAnalyzerBase(final CharArraySet stopwords) {
  // Defensive-copy and freeze the caller's set so it cannot change under us;
  // analyzers should use a char array set for stopwords.
  if (stopwords == null) {
    this.stopwords = CharArraySet.EMPTY_SET;
  } else {
    this.stopwords = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
  }
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link LatvianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new StandardTokenizer();
  TokenStream chain = new LowerCaseFilter(tokenizer);
  chain = new StopFilter(chain, stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Mark excluded words as keywords so the stemmer leaves them alone.
    chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
  }
  chain = new LatvianStemFilter(chain);
  return new TokenStreamComponents(tokenizer, chain);
}
/**
 * Creates a
 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link IrishLowerCaseFilter},
 *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion
 *         set is provided, and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
  TokenStream chain = new StandardFilter(matchVersion, tokenizer);

  // Strip hyphenation artifacts without consuming a position, so downstream
  // position-sensitive consumers see no gaps.
  final StopFilter hyphenFilter = new StopFilter(matchVersion, chain, HYPHENATIONS);
  hyphenFilter.setEnablePositionIncrements(false);
  chain = hyphenFilter;

  chain = new ElisionFilter(matchVersion, chain, DEFAULT_ARTICLES);
  chain = new IrishLowerCaseFilter(chain);
  chain = new StopFilter(matchVersion, chain, stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Mark excluded words as keywords so the stemmer leaves them alone.
    chain = new KeywordMarkerFilter(chain, stemExclusionSet);
  }
  chain = new SnowballFilter(chain, new IrishStemmer());
  return new TokenStreamComponents(tokenizer, chain);
} }
@Override public boolean incrementToken() throws IOException { while (input.incrementToken()) { final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); final int posIncrement = posIncAttribute.getPositionIncrement(); if (posIncrement > 0) { previous.clear(); } boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); // clone the term, and add to the set of seen terms. char saved[] = new char[length]; System.arraycopy(term, 0, saved, 0, length); previous.add(saved); if (!duplicate) { return true; } } return false; }
/**
 * Finds the unique stem(s) of the provided word.
 *
 * @param word Word to find the stems for
 * @param length length of the word
 * @return List of distinct stems for the word, in first-seen order
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> allStems = stem(word, length);
  // Zero or one stem cannot contain duplicates; skip the dedup pass.
  if (allStems.size() < 2) {
    return allStems;
  }
  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> distinct = new ArrayList<>();
  for (CharsRef candidate : allStems) {
    if (!seen.contains(candidate)) {
      distinct.add(candidate);
      seen.add(candidate);
    }
  }
  return distinct;
}
// NOTE(review): this fragment appears garbled or truncated — the braces do not
// balance, and several statements are unreachable as written (e.g. the
// back-to-back `return false; uniqueTerms.clear(); return false;` and the code
// after `return true;`). It looks like part of a "unique terms" TokenFilter's
// incrementToken() loop, but the original should be recovered from version
// control before any edit is attempted here.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (!uniqueTerms.contains(term, 0, length)) { if (uniqueTerms.size() > 0) { uniqueTerms.add(clonedLastTerm); outputTokenSize += length; if (uniqueTerms.size() < 1) { termAttribute.setEmpty(); return false; uniqueTerms.clear(); return false; if (uniqueTerms.size() == 1) { termAttribute.setEmpty().append(new String(clonedLastTerm)); uniqueTerms.clear(); return true; Object[] items = uniqueTerms.toArray(); uniqueTerms.clear(); return true;
/**
 * Wraps the delegate's token stream with a {@link StopFilter} built from the
 * per-field stopword list, or returns the components unchanged when no
 * stopwords are configured for the field.
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
  if (fieldStopWords == null) {
    // No per-field stopwords configured; leave the analysis chain untouched.
    return components;
  }
  final CharArraySet stopSet = new CharArraySet(fieldStopWords, false);
  final StopFilter filtered = new StopFilter(components.getTokenStream(), stopSet);
  return new TokenStreamComponents(components.getTokenizer(), filtered);
}
/**
 * Builds a {@link CharArraySet} from the given words, expanding any word that
 * names an entry in {@code namedWords} into that entry's full set.
 *
 * @param words the literal words and/or set names to resolve
 * @param namedWords maps a name to the set of words it stands for; may be null
 * @param ignoreCase whether the resulting set matches case-insensitively
 * @return the resolved set of words
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  // Without a name table, every entry is taken as a literal word.
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }
  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (String entry : words) {
    if (namedWords.containsKey(entry)) {
      resolved.addAll(namedWords.get(entry));
    } else {
      resolved.add(entry);
    }
  }
  return resolved;
}
/**
 * Returns a copy of the given set as a {@link CharArraySet}. If the given set
 * is itself a {@link CharArraySet}, its ignoreCase property is preserved in
 * the copy; otherwise the copy is case-sensitive.
 *
 * @param set a set to copy
 * @return a copy of the given set as a {@link CharArraySet}
 */
public static CharArraySet copy(final Set<?> set) {
  // The shared empty set is immutable, so it can be returned as-is.
  if (set == EMPTY_SET) {
    return EMPTY_SET;
  }
  if (set instanceof CharArraySet) {
    // Copy the backing map directly so ignoreCase carries over.
    final CharArraySet original = (CharArraySet) set;
    return new CharArraySet(CharArrayMap.copy(original.map));
  }
  return new CharArraySet(set, false);
}
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Creates a new HTMLStripCharFilter over the provided Reader, leaving the
 * specified tags unfiltered.
 *
 * @param source Reader to strip html tags from.
 * @param escapedTags Tags in this set (both start and end tags) will not be
 *        filtered out; may be null.
 */
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
  super(source);
  this.zzReader = source;
  if (null == escapedTags) {
    return;
  }
  for (String tag : escapedTags) {
    // BR/SCRIPT/STYLE have dedicated flags; everything else goes into the
    // general escaped-tag set.
    if (tag.equalsIgnoreCase("BR")) {
      escapeBR = true;
    } else if (tag.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
    } else if (tag.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
    } else {
      // Lazily allocate the case-insensitive set on first custom tag.
      if (null == this.escapedTags) {
        this.escapedTags = new CharArraySet(16, true);
      }
      this.escapedTags.add(tag);
    }
  }
}
// Tokenize the line on whitespace and keep only the non-empty words.
for (String word : line.split("\\s+")) {
  if (!word.isEmpty()) {
    result.add(word);
  }
}
/**
 * Creates a set containing the elements of the given collection.
 *
 * @param c a collection whose elements are to be placed into the set
 * @param ignoreCase <code>true</code> if the set should match
 *        case-insensitively; <code>false</code> for a case-sensitive set
 */
public CharArraySet(Collection<?> c, boolean ignoreCase) {
  this(c.size(), ignoreCase); // presize to the collection's size to avoid rehashing while adding
  addAll(c);
}
/**
 * Resets the filter state: delegates to the input chain, then clears the set
 * of previously seen terms so a reused stream starts fresh.
 */
@Override
public void reset() throws IOException {
  super.reset();
  previous.clear(); // forget terms remembered from the prior stream
}
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link GalicianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new StandardTokenizer();
  TokenStream chain = new StopFilter(new LowerCaseFilter(tokenizer), stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Keyword-marked tokens are skipped by the stemmer below.
    chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
  }
  return new TokenStreamComponents(tokenizer, new GalicianStemFilter(chain));
}
// NOTE(review): this fragment appears garbled or truncated — the braces do not
// balance, `option` is used without a visible declaration, and the loop body
// is incomplete. It looks like part of a suggestion-reduce step that skips
// duplicate surface forms; note also that `seenSurfaceForms` is null when
// `leader.skipDuplicates` is false and is only guarded by the assert on
// `skipDuplicates` — confirm against the original source before editing.
final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null; for (Suggest.Suggestion<Entry> suggestion : toReduce) { assert suggestion.getName().equals(name) : "name should be identical across all suggestions"; assert ((CompletionSuggestion) suggestion).skipDuplicates; String text = option.getText().string(); if (seenSurfaceForms.contains(text)) { continue; seenSurfaceForms.add(text);
/**
 * Reads lines from a Reader and adds every line as an entry to a new
 * CharArraySet (omitting leading and trailing whitespace). Every line of the
 * Reader should contain only one word. The words need to be in lowercase if
 * you make use of an Analyzer which uses LowerCaseFilter (like
 * StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist, one word per line
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
  // Delegate to the accumulating overload with a fresh, case-sensitive set.
  return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}