/**
 * Builds a {@link CharArraySet} containing the supplied stopwords.
 *
 * @param stopWords the words to place into the set
 * @param ignoreCase if true, the set matches case-insensitively
 * @return a set holding every supplied word
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
  final CharArraySet result = new CharArraySet(stopWords.length, ignoreCase);
  for (String word : stopWords) {
    result.add(word);
  }
  return result;
}
/**
 * Accepts only tokens whose term text is absent from the stopword set.
 */
@Override
protected boolean accept() {
  final char[] termText = termAtt.buffer();
  final int termLength = termAtt.length();
  return stopWords.contains(termText, 0, termLength) == false;
}
/**
 * Creates a new instance initialized with the given stopword set.
 *
 * @param stopwords the analyzer's stopword set; may be {@code null}, in which
 *        case the empty set is used
 */
protected StopwordAnalyzerBase(final CharArraySet stopwords) {
  // Defensive-copy and freeze the caller's set so it cannot change under us;
  // analyzers should use a char array set for stopwords.
  if (stopwords == null) {
    this.stopwords = CharArraySet.EMPTY_SET;
  } else {
    this.stopwords = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
  }
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link LatvianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new StandardTokenizer();
  TokenStream chain = new LowerCaseFilter(tokenizer);
  chain = new StopFilter(chain, stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Mark excluded words as keywords so the stemmer leaves them alone.
    chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
  }
  chain = new LatvianStemFilter(chain);
  return new TokenStreamComponents(tokenizer, chain);
}
/**
 * Creates a
 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link IrishLowerCaseFilter},
 *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion
 *         set is provided, and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
  TokenStream chain = new StandardFilter(matchVersion, tokenizer);

  // Strip hyphenation artifacts without consuming a position, so downstream
  // position-sensitive consumers see no gaps.
  final StopFilter hyphenFilter = new StopFilter(matchVersion, chain, HYPHENATIONS);
  hyphenFilter.setEnablePositionIncrements(false);
  chain = hyphenFilter;

  chain = new ElisionFilter(matchVersion, chain, DEFAULT_ARTICLES);
  chain = new IrishLowerCaseFilter(chain);
  chain = new StopFilter(matchVersion, chain, stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Mark excluded words as keywords so the stemmer leaves them alone.
    chain = new KeywordMarkerFilter(chain, stemExclusionSet);
  }
  chain = new SnowballFilter(chain, new IrishStemmer());
  return new TokenStreamComponents(tokenizer, chain);
} }
@Override public boolean incrementToken() throws IOException { while (input.incrementToken()) { final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); final int posIncrement = posIncAttribute.getPositionIncrement(); if (posIncrement > 0) { previous.clear(); } boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); // clone the term, and add to the set of seen terms. char saved[] = new char[length]; System.arraycopy(term, 0, saved, 0, length); previous.add(saved); if (!duplicate) { return true; } } return false; }
/**
 * Finds the unique stem(s) of the provided word.
 *
 * @param word Word to find the stems for
 * @param length length of the word
 * @return List of distinct stems for the word, in first-seen order
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> allStems = stem(word, length);
  // Zero or one stem cannot contain duplicates; skip the dedup pass.
  if (allStems.size() < 2) {
    return allStems;
  }
  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> distinct = new ArrayList<>();
  for (CharsRef candidate : allStems) {
    if (!seen.contains(candidate)) {
      distinct.add(candidate);
      seen.add(candidate);
    }
  }
  return distinct;
}
// NOTE(review): this fragment appears garbled or truncated — the braces do not
// balance, and several statements are unreachable as written (e.g. the
// back-to-back `return false; uniqueTerms.clear(); return false;` and the code
// after `return true;`). It looks like part of a "unique terms" TokenFilter's
// incrementToken() loop, but the original should be recovered from version
// control before any edit is attempted here.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (!uniqueTerms.contains(term, 0, length)) { if (uniqueTerms.size() > 0) { uniqueTerms.add(clonedLastTerm); outputTokenSize += length; if (uniqueTerms.size() < 1) { termAttribute.setEmpty(); return false; uniqueTerms.clear(); return false; if (uniqueTerms.size() == 1) { termAttribute.setEmpty().append(new String(clonedLastTerm)); uniqueTerms.clear(); return true; Object[] items = uniqueTerms.toArray(); uniqueTerms.clear(); return true;
/**
 * Wraps the delegate's token stream with a {@link StopFilter} built from the
 * per-field stopword list, or returns the components unchanged when no
 * stopwords are configured for the field.
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
  if (fieldStopWords == null) {
    // No per-field stopwords configured; leave the analysis chain untouched.
    return components;
  }
  final CharArraySet stopSet = new CharArraySet(fieldStopWords, false);
  final StopFilter filtered = new StopFilter(components.getTokenStream(), stopSet);
  return new TokenStreamComponents(components.getTokenizer(), filtered);
}
/**
 * Builds a {@link CharArraySet} from the given words, expanding any word that
 * names an entry in {@code namedWords} into that entry's full set.
 *
 * @param words the literal words and/or set names to resolve
 * @param namedWords maps a name to the set of words it stands for; may be null
 * @param ignoreCase whether the resulting set matches case-insensitively
 * @return the resolved set of words
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  // Without a name table, every entry is taken as a literal word.
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }
  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (String entry : words) {
    if (namedWords.containsKey(entry)) {
      resolved.addAll(namedWords.get(entry));
    } else {
      resolved.add(entry);
    }
  }
  return resolved;
}
/**
 * Returns a copy of the given set as a {@link CharArraySet}. If the given set
 * is itself a {@link CharArraySet}, its ignoreCase property is preserved in
 * the copy; otherwise the copy is case-sensitive.
 *
 * @param set a set to copy
 * @return a copy of the given set as a {@link CharArraySet}
 */
public static CharArraySet copy(final Set<?> set) {
  // The shared empty set is immutable, so it can be returned as-is.
  if (set == EMPTY_SET) {
    return EMPTY_SET;
  }
  if (set instanceof CharArraySet) {
    // Copy the backing map directly so ignoreCase carries over.
    final CharArraySet original = (CharArraySet) set;
    return new CharArraySet(CharArrayMap.copy(original.map));
  }
  return new CharArraySet(set, false);
}
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Creates a new HTMLStripCharFilter over the provided Reader, leaving the
 * specified tags unfiltered.
 *
 * @param source Reader to strip html tags from.
 * @param escapedTags Tags in this set (both start and end tags) will not be
 *        filtered out; may be null.
 */
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
  super(source);
  this.zzReader = source;
  if (null == escapedTags) {
    return;
  }
  for (String tag : escapedTags) {
    // BR/SCRIPT/STYLE have dedicated flags; everything else goes into the
    // general escaped-tag set.
    if (tag.equalsIgnoreCase("BR")) {
      escapeBR = true;
    } else if (tag.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
    } else if (tag.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
    } else {
      // Lazily allocate the case-insensitive set on first custom tag.
      if (null == this.escapedTags) {
        this.escapedTags = new CharArraySet(16, true);
      }
      this.escapedTags.add(tag);
    }
  }
}
// Tokenize the line on whitespace and keep only the non-empty words.
for (String word : line.split("\\s+")) {
  if (!word.isEmpty()) {
    result.add(word);
  }
}
/**
 * Creates a set containing the elements of the given collection.
 *
 * @param c a collection whose elements are to be placed into the set
 * @param ignoreCase <code>true</code> if the set should match
 *        case-insensitively; <code>false</code> for a case-sensitive set
 */
public CharArraySet(Collection<?> c, boolean ignoreCase) {
  this(c.size(), ignoreCase); // presize to the collection's size to avoid rehashing while adding
  addAll(c);
}
/**
 * Resets the filter state: delegates to the input chain, then clears the set
 * of previously seen terms so a reused stream starts fresh.
 */
@Override
public void reset() throws IOException {
  super.reset();
  previous.clear(); // forget terms remembered from the prior stream
}
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link GalicianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new StandardTokenizer();
  TokenStream chain = new StopFilter(new LowerCaseFilter(tokenizer), stopwords);
  if (stemExclusionSet.isEmpty() == false) {
    // Keyword-marked tokens are skipped by the stemmer below.
    chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
  }
  return new TokenStreamComponents(tokenizer, new GalicianStemFilter(chain));
}
// NOTE(review): this fragment appears garbled or truncated — the braces do not
// balance, `option` is used without a visible declaration, and the loop body
// is incomplete. It looks like part of a suggestion-reduce step that skips
// duplicate surface forms; note also that `seenSurfaceForms` is null when
// `leader.skipDuplicates` is false and is only guarded by the assert on
// `skipDuplicates` — confirm against the original source before editing.
final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null; for (Suggest.Suggestion<Entry> suggestion : toReduce) { assert suggestion.getName().equals(name) : "name should be identical across all suggestions"; assert ((CompletionSuggestion) suggestion).skipDuplicates; String text = option.getText().string(); if (seenSurfaceForms.contains(text)) { continue; seenSurfaceForms.add(text);
/**
 * Reads lines from a Reader and adds every line as an entry to a new
 * CharArraySet (omitting leading and trailing whitespace). Every line of the
 * Reader should contain only one word. The words need to be in lowercase if
 * you make use of an Analyzer which uses LowerCaseFilter (like
 * StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist, one word per line
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
  // Delegate to the accumulating overload with a fresh, case-sensitive set.
  return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}