// Break the line apart on runs of whitespace and keep only the non-empty pieces;
// split() produces a leading empty string when the line starts with whitespace.
String words[] = line.split("\\s+");
for (int i = 0; i < words.length; i++) {
  if (words[i].isEmpty()) {
    continue;
  }
  result.add(words[i]);
}
/**
 * Builds a {@link CharArraySet} from {@code words}, replacing every entry that is a key of
 * {@code namedWords} with the contents of the set mapped to that key.
 *
 * @param words entries to resolve; each one is either a literal word or a key of {@code namedWords}
 * @param namedWords named word sets keyed by name, or {@code null} to take every entry literally
 * @param ignoreCase handed to the {@link CharArraySet} constructor
 * @return a new set with the literal entries plus the members of every referenced named set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }

  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (final String entry : words) {
    if (!namedWords.containsKey(entry)) {
      // Not a known name: the entry itself is the word.
      resolved.add(entry);
      continue;
    }
    resolved.addAll(namedWords.get(entry));
  }
  return resolved;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet
 * (omitting leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 * <p>
 * A line is treated as a comment only when the raw line, before trimming, starts with
 * {@code comment}; a comment marker preceded by whitespace is therefore added as a word.
 * The BufferedReader obtained from {@code getBufferedReader(reader)} is closed before this
 * method returns, whether it completes normally or with an exception.
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @param result the {@link CharArraySet} to fill with the readers words
 * @return the given {@link CharArraySet} with the reader's words
 * @throws IOException if reading from or closing the reader fails
 */
public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
  // try-with-resources instead of closing in a finally block: if reading fails and the
  // close fails as well, the read failure stays the primary exception and the close
  // failure is attached as suppressed rather than replacing it.
  try (BufferedReader br = getBufferedReader(reader)) {
    String word;
    while ((word = br.readLine()) != null) {
      if (!word.startsWith(comment)) {
        result.add(word.trim());
      }
    }
  }
  return result;
}
/**
 * Builds a {@link CharArraySet} from {@code words}, replacing every entry that is a key of
 * {@code namedWords} with the contents of the set mapped to that key.
 *
 * @param words entries to resolve; each one is either a literal word or a key of {@code namedWords}
 * @param namedWords named word sets keyed by name, or {@code null} to take every entry literally
 * @param ignoreCase handed to the {@link CharArraySet} constructor
 * @return a new set with the literal entries plus the members of every referenced named set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }

  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (final String entry : words) {
    if (!namedWords.containsKey(entry)) {
      // Not a known name: the entry itself is the word.
      resolved.add(entry);
      continue;
    }
    resolved.addAll(namedWords.get(entry));
  }
  return resolved;
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 * <p>
 * The BufferedReader obtained from {@code getBufferedReader(reader)} is closed before this
 * method returns, whether it completes normally or with an exception.
 *
 * @param reader Reader containing the wordlist
 * @param result the {@link CharArraySet} to fill with the readers words
 * @return the given {@link CharArraySet} with the reader's words
 * @throws IOException if reading from or closing the reader fails
 */
public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
  // try-with-resources instead of closing in a finally block: if reading fails and the
  // close fails as well, the read failure stays the primary exception and the close
  // failure is attached as suppressed rather than replacing it.
  try (BufferedReader br = getBufferedReader(reader)) {
    String word;
    while ((word = br.readLine()) != null) {
      result.add(word.trim());
    }
  }
  return result;
}
/**
 * Builds a {@link CharArraySet} from {@code words}, replacing every entry that is a key of
 * {@code namedWords} with the contents of the set mapped to that key.
 *
 * @param words entries to resolve; each one is either a literal word or a key of {@code namedWords}
 * @param namedWords named word sets keyed by name, or {@code null} to take every entry literally
 * @param ignoreCase handed to the {@link CharArraySet} constructor
 * @return a new set with the literal entries plus the members of every referenced named set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }

  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (final String entry : words) {
    if (!namedWords.containsKey(entry)) {
      // Not a known name: the entry itself is the word.
      resolved.add(entry);
      continue;
    }
    resolved.addAll(namedWords.get(entry));
  }
  return resolved;
}
/**
 * Creates a new HTMLStripCharFilter over the provided Reader that leaves the
 * given tags in place.
 * <p>
 * The names BR, SCRIPT and STYLE (compared ignoring case) are recorded through
 * dedicated flags; every other name is collected in a case-insensitive set that
 * is only allocated once the first such name is seen.
 *
 * @param source Reader to strip html tags from.
 * @param escapedTags Tags in this set (both start and end tags)
 *   will not be filtered out. May be null, in which case nothing is escaped.
 */
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
  super(source);
  this.zzReader = source;
  if (escapedTags == null) {
    return;
  }
  for (final String name : escapedTags) {
    if (name.equalsIgnoreCase("BR")) {
      escapeBR = true;
      continue;
    }
    if (name.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
      continue;
    }
    if (name.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
      continue;
    }
    // Any other tag goes into the lazily created field (distinct from the parameter).
    if (this.escapedTags == null) {
      this.escapedTags = new CharArraySet(16, true);
    }
    this.escapedTags.add(name);
  }
}
/**
 * Builds a {@link CharArraySet} from {@code words}, replacing every entry that is a key of
 * {@code namedWords} with the contents of the set mapped to that key.
 *
 * @param words entries to resolve; each one is either a literal word or a key of {@code namedWords}
 * @param namedWords named word sets keyed by name, or {@code null} to take every entry literally
 * @param ignoreCase handed to the {@link CharArraySet} constructor
 * @return a new set with the literal entries plus the members of every referenced named set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }

  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (final String entry : words) {
    if (!namedWords.containsKey(entry)) {
      // Not a known name: the entry itself is the word.
      resolved.add(entry);
      continue;
    }
    resolved.addAll(namedWords.get(entry));
  }
  return resolved;
}
/**
 * Returns the stems of the provided word with repeated stems removed.
 * The first occurrence of each stem is kept, in the order {@code stem} produced it;
 * equality follows the dictionary's ignoreCase setting.
 *
 * @param word Word to find the stems for
 * @param length number of characters of {@code word} handed to {@code stem}
 * @return List of stems for the word, without duplicates
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> candidates = stem(word, length);
  if (candidates.size() < 2) {
    // Zero or one stem cannot contain a duplicate; hand the list back untouched.
    return candidates;
  }

  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (final CharsRef candidate : candidates) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
seenSurfaceForms.add(hit.key); hits.add(hit); if (hits.size() == num) {
@Override public boolean incrementToken() throws IOException { while (input.incrementToken()) { final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); final int posIncrement = posIncAttribute.getPositionIncrement(); if (posIncrement > 0) { previous.clear(); } boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); // clone the term, and add to the set of seen terms. char saved[] = new char[length]; System.arraycopy(term, 0, saved, 0, length); previous.add(saved); if (!duplicate) { return true; } } return false; }
continue; seenSurfaceForms.add(text);
// Record the cloned term in uniqueTerms and add its length to the running outputTokenSize.
// NOTE(review): presumably outputTokenSize tracks the total length of the output token; confirm against the enclosing method.
uniqueTerms.add(clonedLastTerm); outputTokenSize += length;
// Tokenize the input string with a StandardTokenizer.
TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_36, new StringReader(string));
// Start from the set returned by CharArraySet.copy(...) for StandardAnalyzer.STOP_WORD_SET
// (presumably so the shared default set itself is left unmodified) and add the custom stop words.
CharArraySet stopSet = CharArraySet.copy(Version.LUCENE_36, StandardAnalyzer.STOP_WORD_SET);
stopSet.add("add");
stopSet.add("your");
stopSet.add("stop");
stopSet.add("words");
// Wrap the tokenizer in a StopFilter that uses the extended stop set.
tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, stopSet);
// Or, if you just need the added stop words in a StandardAnalyzer, pass the stop set
// (not the StopFilter) to the StandardAnalyzer constructor:
//analyzer = new StandardAnalyzer(Version.LUCENE_36, stopSet);
/**
 * Add this String into the set by delegating to the {@code char[]} overload.
 *
 * @param text the String to add; must not be null
 * @return the result of the {@code char[]} overload
 */
public boolean add(String text) {
  final char[] chars = text.toCharArray();
  return add(chars);
}
/** Add this CharSequence into the set */ public boolean add(CharSequence text) { return add(text.toString()); // could be more efficient }
/** Add this CharSequence into the set */ public boolean add(CharSequence text) { return add(text.toString()); // could be more efficient }
/**
 * Add this String into the set by delegating to the {@code char[]} overload.
 *
 * @param text the String to add; must not be null
 * @return the result of the {@code char[]} overload
 */
public boolean add(String text) {
  final char[] chars = text.toCharArray();
  return add(chars);
}