/**
 * Creates a {@link CharArraySet} copy of the given set.
 * <p>
 * The shared {@code EMPTY_SET} instance is returned as-is. If the given set is
 * itself a {@link CharArraySet}, the copy is built from a copy of its backing
 * map, so its ignoreCase property is preserved. Any other set is copied into a
 * new set created with {@code ignoreCase == false}.
 *
 * @param set
 *          the set to copy
 * @return a copy of the given set as a {@link CharArraySet}
 */
public static CharArraySet copy(final Set<?> set) {
  if (set == EMPTY_SET) {
    return EMPTY_SET;
  }
  if (!(set instanceof CharArraySet)) {
    // Arbitrary sets carry no case-sensitivity information: copy case-sensitively.
    return new CharArraySet(set, false);
  }
  return new CharArraySet(CharArrayMap.copy(((CharArraySet) set).map));
}
/**
 * Builds a stopword set from an array of words.
 *
 * @param stopWords the stopwords to put into the set
 * @param ignoreCase if true, all words are lower cased first
 * @return a {@link CharArraySet} containing the words
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
  final int initialCapacity = stopWords.length;
  final CharArraySet result = new CharArraySet(initialCapacity, ignoreCase);
  result.addAll(Arrays.asList(stopWords));
  return result;
}
/**
 * Builds a stopword set from a list of words.
 *
 * @param stopWords a List of Strings or char[] or any other toString()-able
 *                  objects representing the stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return a {@link CharArraySet} containing the words
 */
public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase) {
  final int initialCapacity = stopWords.size();
  final CharArraySet result = new CharArraySet(initialCapacity, ignoreCase);
  result.addAll(stopWords);
  return result;
}
/**
 * Reads a word list from a Reader, one word per line, into a new
 * case-sensitive {@link CharArraySet}. Leading and trailing whitespace of each
 * line is omitted. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @return a {@link CharArraySet} with the reader's words
 * @throws IOException declared for failures of the delegated read
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
  final CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
  return getWordSet(reader, target);
}
/**
 * Reads a word list from a Reader, one word per line, into a new
 * case-sensitive {@link CharArraySet}, skipping comment lines. Leading and
 * trailing whitespace of each line is omitted. The words need to be in
 * lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
 * StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @param comment the string representing a comment
 * @return a {@link CharArraySet} with the reader's words
 * @throws IOException declared for failures of the delegated read
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
  final CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
  return getWordSet(reader, comment, target);
}
/**
 * Reads a Snowball-format stopword list into a new case-sensitive
 * {@link CharArraySet}.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (&#124;).
 * <li>Lines may contain trailing comments.
 * </ul>
 *
 * @param reader Reader containing a Snowball stopword list
 * @return a {@link CharArraySet} with the reader's words
 * @throws IOException declared for failures of the delegated read
 */
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
  final CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
  return getSnowballWordSet(reader, target);
}
/**
 * Returns an unmodifiable {@link CharArraySet}. This makes it possible to hand
 * out unmodifiable views of internal sets for "read-only" use.
 * <p>
 * The given set is returned unchanged when it is the shared {@code EMPTY_SET}
 * or when its backing map is already an unmodifiable map; otherwise a new set
 * wrapping an unmodifiable view of the backing map is created.
 *
 * @param set
 *          a set for which the unmodifiable set is returned.
 * @return an unmodifiable {@link CharArraySet}.
 * @throws NullPointerException
 *           if the given set is <code>null</code>.
 */
public static CharArraySet unmodifiableSet(CharArraySet set) {
  if (set == null) {
    throw new NullPointerException("Given set is null");
  }
  final boolean alreadyReadOnly =
      set == EMPTY_SET || set.map instanceof CharArrayMap.UnmodifiableCharArrayMap;
  if (alreadyReadOnly) {
    return set;
  }
  return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
}
try { reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8); return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase)); } finally { IOUtils.close(reader);
/**
 * Creates a rule whose exceptions are kept in a case-sensitive
 * {@link CharArraySet}.
 *
 * @param suffix suffix passed to the superclass; every exception must end with it
 * @param min passed through to the superclass
 * @param replacement passed through to the superclass
 * @param exceptions words stored as this rule's exception set
 * @throws RuntimeException if an exception word does not end with {@code suffix}
 */
public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) {
  super(suffix, min, replacement);
  for (String candidate : exceptions) {
    if (candidate.endsWith(suffix)) {
      continue;
    }
    // A word lacking the suffix cannot be a meaningful exception: reject it.
    throw new RuntimeException("useless exception '" + candidate + "' does not end with '" + suffix + "'");
  }
  this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
/**
 * Builds a word set from {@code words}, expanding every entry that is a key of
 * {@code namedWords} into the set mapped to that key.
 *
 * @param words the words (or names of predefined sets) to resolve
 * @param namedWords predefined sets keyed by name; when {@code null} the words
 *                   are used verbatim
 * @param ignoreCase passed to the {@link CharArraySet} constructor
 * @return the resolved word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }
  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (String candidate : words) {
    if (!namedWords.containsKey(candidate)) {
      // Not the name of a predefined set: treat it as a literal word.
      resolved.add(candidate);
      continue;
    }
    resolved.addAll(namedWords.get(candidate));
  }
  return resolved;
}
/**
 * Sole constructor.
 *
 * Collects at most <code>num</code> completions
 * with corresponding document and weight.
 *
 * @param num maximum number of completions to collect; must be &gt; 0
 * @param skipDuplicates passed to the superclass; when true, the structures
 *                       for tracking seen surface forms and pending results
 *                       are allocated, otherwise they are left {@code null}
 * @throws IllegalArgumentException if {@code num} is not positive
 */
TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
  super(1, skipDuplicates);
  if (num <= 0) {
    throw new IllegalArgumentException("'num' must be > 0");
  }
  this.num = num;
  this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
  if (!skipDuplicates) {
    seenSurfaceForms = null;
    pendingResults = null;
  } else {
    seenSurfaceForms = new CharArraySet(num, false);
    pendingResults = new ArrayList<>();
  }
}
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) { String value = settings.get("stem_exclusion"); if ("_none_".equals(value)) { return CharArraySet.EMPTY_SET; } List<String> stemExclusion = settings.getAsList("stem_exclusion", null); if (stemExclusion != null) { // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)? return new CharArraySet(stemExclusion, false); } else { return defaultStemExclusion; } }
/**
 * Initializes {@code stopWords}: from the configured word files when present
 * (in the configured format), otherwise from the default English stop words.
 *
 * @param loader resource loader used to read the word files
 * @throws IOException declared for failures while loading the word files
 * @throws IllegalArgumentException if the format is unknown, or if a format is
 *         given without any word files
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (stopWordFiles == null) {
    // No explicit files: a format makes no sense, fall back to the defaults.
    if (format != null) {
      throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
    }
    stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    return;
  }
  if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
    stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
    return;
  }
  if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
    stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
    return;
  }
  throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
}
/**
 * Appends a {@link StopFilter} to the components of a field that has stop
 * words registered in {@code stopWordsPerField}; other fields are returned
 * untouched.
 *
 * @param fieldName name of the field being analyzed
 * @param components the components to wrap
 * @return the wrapped components, or {@code components} itself when the field
 *         has no stop words
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
  if (fieldStopWords != null) {
    final StopFilter filtered =
        new StopFilter(components.getTokenStream(), new CharArraySet(fieldStopWords, false));
    return new TokenStreamComponents(components.getTokenizer(), filtered);
  }
  return components;
}
/**
 * Creates a new HTMLStripCharFilter over the provided Reader
 * with the specified escaped tags.
 * <p>
 * {@code BR}, {@code SCRIPT} and {@code STYLE} (matched case-insensitively)
 * are recorded through dedicated flags; every other tag is collected in a
 * case-insensitive set that is only allocated once it is needed.
 *
 * @param source Reader to strip html tags from.
 * @param escapedTags Tags in this set (both start and end tags)
 *                    will not be filtered out; may be {@code null}.
 */
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
  super(source);
  this.zzReader = source;
  if (escapedTags == null) {
    return;
  }
  for (String tag : escapedTags) {
    if (tag.equalsIgnoreCase("BR")) {
      escapeBR = true;
      continue;
    }
    if (tag.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
      continue;
    }
    if (tag.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
      continue;
    }
    if (this.escapedTags == null) {
      this.escapedTags = new CharArraySet(16, true);
    }
    this.escapedTags.add(tag);
  }
}
/**
 * Loads the word list configured under {@code settingsPrefix} and wraps it in
 * a {@link CharArraySet}. Case sensitivity is read from the
 * {@code settingsPrefix + "_case"} setting and defaults to {@code false}.
 *
 * @param env environment passed on to {@code getWordList}
 * @param indexCreatedVersion version passed to the lenient boolean parsing
 * @param settings settings to read from
 * @param settingsPrefix prefix of the word-list settings
 * @return the word set, or {@code null} when no word list is configured
 */
public static CharArraySet getWordSet(Environment env, org.elasticsearch.Version indexCreatedVersion, Settings settings, String settingsPrefix) {
  final List<String> words = getWordList(env, settings, settingsPrefix);
  if (words == null) {
    return null;
  }
  final String caseSettingKey = settingsPrefix + "_case";
  final boolean ignoreCase =
      settings.getAsBooleanLenientForPreEs6Indices(indexCreatedVersion, caseSettingKey, false, deprecationLogger);
  return new CharArraySet(words, ignoreCase);
}
/**
 * Finds the unique stem(s) of the provided word.
 * <p>
 * Results with fewer than two stems are returned as-is; otherwise duplicates
 * are dropped (using the dictionary's ignoreCase setting for comparison) while
 * the order of first occurrence is kept.
 *
 * @param word Word to find the stems for
 * @param length length passed on to {@code stem} together with {@code word}
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> candidates = stem(word, length);
  if (candidates.size() < 2) {
    return candidates;
  }
  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (CharsRef candidate : candidates) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
/** same as {@link #getWordSet(ResourceLoader, String, boolean)}, * except the input is in snowball format. */ protected final CharArraySet getSnowballWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { InputStream stream = null; Reader reader = null; try { stream = loader.openResource(file.trim()); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); reader = new InputStreamReader(stream, decoder); WordlistLoader.getSnowballWordSet(reader, words); } finally { IOUtils.closeWhileHandlingException(reader, stream); } } } return words; }
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Creates a new CapitalizationFilterFactory.
 * <p>
 * Reads the keep-word set (with its ignore-case flag), the allowed prefixes
 * and the numeric/boolean options from {@code args}.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after all
 *         known parameters have been read
 */
public CapitalizationFilterFactory(Map<String, String> args) {
  super(args);
  final boolean ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);

  final Set<String> keepWords = getSet(args, KEEP);
  if (keepWords != null) {
    keep = new CharArraySet(10, ignoreCase);
    keep.addAll(keepWords);
  }

  final Set<String> prefixes = getSet(args, OK_PREFIX);
  if (prefixes != null) {
    okPrefix = new ArrayList<>();
    for (String prefix : prefixes) {
      okPrefix.add(prefix.toCharArray());
    }
  }

  minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
  maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
  maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
  onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
  forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);

  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}