org.apache.lucene.analysis.hunspell Java code examples

 /**
  * Wraps the supplied stream in a {@link HunspellStemFilter} built from this factory's dictionary,
  * always requesting de-duplication and passing along the configured {@code longestOnly} setting.
  *
  * @param tokenStream stream whose tokens should be stemmed
  * @return the stemming filter wrapped around {@code tokenStream}
  */
 @Override
 public TokenStream create(TokenStream tokenStream) {
  final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
  return stemmed;
 }
}

/**
 * Looks up Hunspell word forms from the dictionary by delegating to {@code lookup} over {@code words}.
 *
 * @param word character array holding the word
 * @param offset first character of the word inside {@code word}
 * @param length number of characters that make up the word
 * @return whatever {@code lookup} yields for the given slice
 */
IntsRef lookupWord(char[] word, int offset, int length) {
 final IntsRef forms = lookup(words, word, offset, length);
 return forms;
}

/**
 * Builds a HunspellStemFilter that stems the tokens of the given TokenStream with a {@link Stemmer}
 * created from the provided Dictionary.
 *
 * @param input TokenStream whose tokens will be stemmed
 * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
 * @param dedup true if duplicate stems should be removed; has no effect when {@code longestOnly} is true
 * @param longestOnly true if only the longest term should be output.
 */
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup,  boolean longestOnly) {
 super(input);
 this.longestOnly = longestOnly;
 this.stemmer = new Stemmer(dictionary);
 // de-duplication would be wasted effort when only the longest stem is kept
 this.dedup = !longestOnly && dedup;
}

private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
 List<CharsRef> stems = new ArrayList<>();
 IntsRef forms = dictionary.lookupWord(word, 0, length);
 if (forms != null) {
  for (int i = 0; i < forms.length; i += formStep) {
   if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
    dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
    char wordFlags[] = Dictionary.decodeFlags(scratch);
    if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
     continue;
    if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
     continue;
    if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
     continue;
   stems.add(newStem(word, length, forms, i));
  boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
 } catch (IOException bogus) {
  throw new RuntimeException(bogus);

 scratchSegment.setLength(0);
 scratchSegment.append(word, 0, length);
 CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
 scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
 length = segment.length();
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
 caseFoldTitle(word, length);
 caseFoldLower(titleBuffer, length);
 List<CharsRef> list = doStem(word, length, false);
 list.addAll(doStem(titleBuffer, length, true));
 list.addAll(doStem(lowerBuffer, length, true));
 return list;
} else if (caseType == TITLE_CASE) {
 caseFoldLower(word, length);
 List<CharsRef> list = doStem(word, length, false);
 list.addAll(doStem(lowerBuffer, length, true));
 return list;
} else {
 return doStem(word, length, false);

if (buffer != null && !buffer.isEmpty()) {
 CharsRef nextStem = buffer.remove(0);
 restoreState(savedState);
 posIncAtt.setPositionIncrement(0);
 termAtt.setEmpty().append(nextStem);
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
} else {
 if (!buffer.isEmpty()) {
  savedState = captureState();

/** Creates a new HunspellStemFilterFactory */
public HunspellStemFilterFactory(Map<String,String> args) {
 super(args);
 dictionaryFiles = require(args, PARAM_DICTIONARY);
 affixFile = get(args, PARAM_AFFIX);
 ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
 longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
 // this isnt necessary: we properly load all dictionaries.
 // but recognize and ignore for back compat
 getBoolean(args, "strictAffixParsing", true);
 // this isn't necessary: multi-stage stripping is fixed and 
 // flags like COMPLEXPREFIXES in the data itself control this.
 // but recognize and ignore for back compat
 getInt(args, "recursionCap", 0);
 if (!args.isEmpty()) {
  throw new IllegalArgumentException("Unknown parameters: " + args);
 }
}

/**
 * Picks the {@link FlagParsingStrategy} matching the FLAG definition line of an affix file.
 * The line is split on whitespace and must consist of exactly two tokens; the second token
 * names the flag type.
 *
 * @param flagLine Line containing the flag information
 * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
 * @throws IllegalArgumentException if the line does not have exactly two tokens or names an unknown flag type
 */
static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
 final String[] tokens = flagLine.split("\\s+");
 if (tokens.length != 2) {
  throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
 }

 final String flagType = tokens[1];
 if (NUM_FLAG_TYPE.equals(flagType)) {
  return new NumFlagParsingStrategy();
 }
 if (UTF8_FLAG_TYPE.equals(flagType)) {
  return new SimpleFlagParsingStrategy();
 }
 if (LONG_FLAG_TYPE.equals(flagType)) {
  return new DoubleASCIIFlagParsingStrategy();
 }

 throw new IllegalArgumentException("Unknown flag type: " + flagType);
}

/**
 * Finds the stem(s) of the provided word by handing its characters to the
 * {@code char[]}-based overload.
 *
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> stem(String word) {
 final char[] chars = word.toCharArray();
 return stem(chars, chars.length);
}

/**
 * Writes the titlecase variant of {@code word} into {@code titleBuffer}: the first character is
 * copied unchanged and every following character is passed through {@code dictionary.caseFold}.
 */
private void caseFoldTitle(char[] word, int length) {
 titleBuffer = ArrayUtil.grow(titleBuffer, length);
 final char[] folded = titleBuffer;
 System.arraycopy(word, 0, folded, 0, length);
 int pos = 1; // index 0 keeps its original case
 while (pos < length) {
  folded[pos] = dictionary.caseFold(folded[pos]);
  pos++;
 }
}

/**
 * Locates the position in {@code line} where the trailing data begins. Candidate positions are
 * those reported by {@code indexOfSpaceOrTab}; a candidate is accepted when it is a tab, or when
 * it is followed by two letters and a colon (for example {@code " st:"}).
 *
 * @param line line to scan
 * @return index of the accepted separator, or {@code line.length()} when the search ends with -1
 */
static int morphBoundary(String line) {
 final int lineLength = line.length();
 int pos = indexOfSpaceOrTab(line, 0);
 while (pos >= 0 && pos < lineLength) {
  if (line.charAt(pos) == '\t') {
   return pos;
  }
  final boolean fieldFollows = pos + 3 < lineLength
    && Character.isLetter(line.charAt(pos + 1))
    && Character.isLetter(line.charAt(pos + 2))
    && line.charAt(pos + 3) == ':';
  if (fieldFollows) {
   return pos;
  }
  pos = indexOfSpaceOrTab(line, pos + 1);
 }
 // the search ran out of candidates: -1 means "no boundary", i.e. the whole line
 return pos == -1 ? lineLength : pos;
}

/**
 * Parses the given String into a single flag by running it through {@code parseFlags} and
 * insisting on exactly one result.
 *
 * @param rawFlag String to parse into a flag
 * @return Parsed flag
 * @throws IllegalArgumentException if {@code rawFlag} does not parse to exactly one flag
 */
char parseFlag(String rawFlag) {
 final char[] parsed = parseFlags(rawFlag);
 if (parsed.length == 1) {
  return parsed[0];
 }
 throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
}

/**
 * Rewrites a raw entry into the form {@code <word part> MORPH_SEPARATOR <remainder>}.
 * <p>
 * Up to the position reported by {@code morphBoundary}: a backslash followed by another character
 * emits that character literally, an unescaped {@code '/'} becomes {@code FLAG_SEPARATOR}, and any
 * raw {@code MORPH_SEPARATOR} / {@code FLAG_SEPARATOR} characters are dropped. From the boundary
 * onwards the text is copied as-is, again dropping raw separator characters.
 *
 * @param entry raw entry line
 * @return the rewritten entry
 */
String unescapeEntry(String entry) {
 final StringBuilder out = new StringBuilder();
 final int boundary = morphBoundary(entry);
 final int entryLength = entry.length();

 int pos = 0;
 while (pos < boundary) {
  final char current = entry.charAt(pos);
  if (current == '\\' && pos + 1 < entryLength) {
   // escaped character: emit it verbatim and step over both characters
   out.append(entry.charAt(pos + 1));
   pos += 2;
   continue;
  }
  if (current == '/') {
   out.append(FLAG_SEPARATOR);
  } else if (current != MORPH_SEPARATOR && current != FLAG_SEPARATOR) {
   out.append(current);
  }
  // else: a raw separator character inside the data (seen in some dictionaries) is discarded
  pos++;
 }

 out.append(MORPH_SEPARATOR);

 for (int tail = boundary; tail < entryLength; tail++) {
  final char current = entry.charAt(tail);
  // raw separator characters are discarded here as well
  if (current != FLAG_SEPARATOR && current != MORPH_SEPARATOR) {
   out.append(current);
  }
 }
 return out.toString();
}

/**
 * Retrieves the CharsetDecoder for the given encoding. {@code "ISO8859-14"} gets a dedicated
 * decoder; any other name is first mapped through {@code CHARSET_ALIASES} (when an alias exists)
 * and then resolved with {@link Charset#forName(String)}. Malformed input is replaced rather than
 * reported. Note, this isn't perfect, as ISCII-DEVANAGARI and MICROSOFT-CP1251 etc. are thought
 * to be allowed as well.
 *
 * @param encoding Encoding to retrieve the CharsetDecoder for
 * @return CharsetDecoder for the given encoding
 */
private CharsetDecoder getJavaEncoding(String encoding) {
 if ("ISO8859-14".equals(encoding)) {
  return new ISO8859_14Decoder();
 }
 final String alias = CHARSET_ALIASES.get(encoding);
 final String charsetName = (alias != null) ? alias : encoding;
 return Charset.forName(charsetName)
   .newDecoder()
   .onMalformedInput(CodingErrorAction.REPLACE);
}

/**
 * Wraps the supplied stream in a {@link HunspellStemFilter} configured with this object's
 * {@code dictionary}, {@code dedup} and {@code longestOnly} values.
 *
 * @param tokenStream stream whose tokens should be stemmed
 * @return the stemming filter wrapped around {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, dedup, longestOnly);
  return stemmed;
}

/**
 * Looks up the given character slice by delegating to {@code lookup} over {@code prefixes}.
 *
 * @param word character array holding the text
 * @param offset first character of the slice inside {@code word}
 * @param length number of characters in the slice
 * @return whatever {@code lookup} yields for the given slice
 */
IntsRef lookupPrefix(char[] word, int offset, int length) {
 final IntsRef match = lookup(prefixes, word, offset, length);
 return match;
}

/**
 * Find the unique stem(s) of the provided word. Stems are produced by {@code stem(word, length)}
 * and, when there is more than one, filtered through a {@code CharArraySet} created with
 * {@code dictionary.ignoreCase}; the first occurrence of each stem is kept, in original order.
 *
 * @param word Word to find the stems for
 * @param length number of characters of {@code word} to consider
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
 final List<CharsRef> allStems = stem(word, length);
 if (allStems.size() <= 1) {
  // zero or one stem cannot contain duplicates
  return allStems;
 }

 final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
 final List<CharsRef> unique = new ArrayList<>();
 for (CharsRef candidate : allStems) {
  if (seen.contains(candidate)) {
   continue;
  }
  unique.add(candidate);
  seen.add(candidate);
 }
 return unique;
}

/**
 * Writes the lowercase variant of a title-cased {@code word} into {@code lowerBuffer}: the word is
 * copied and only its first character is passed through {@code dictionary.caseFold}.
 */
private void caseFoldLower(char[] word, int length) {
 lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
 final char[] target = lowerBuffer;
 System.arraycopy(word, 0, target, 0, length);
 final char first = target[0];
 target[0] = dictionary.caseFold(first);
}

/**
 * Extracts the value of the {@code st:} field from a morphological data string.
 * <p>
 * When aliases are defined ({@code morphAliasCount > 0}) and the trimmed input parses as an
 * integer, it is treated as a 1-based index into {@code morphAliases} and replaced by that entry.
 * The {@code st:} field must be preceded by a space or a tab; its value runs up to the next
 * position reported by {@code indexOfSpaceOrTab}, or to the end of the string.
 *
 * @param morphData morphological data, or a numeric alias for it
 * @return the {@code st:} value, or {@code null} if no such field is present
 */
private String parseStemException(String morphData) {
 String data = morphData;
 if (morphAliasCount > 0) {
  try {
   data = morphAliases[Integer.parseInt(morphData.trim()) - 1];
  } catch (NumberFormatException notAnAlias) {
   // not a number, so the input is the morph data itself
  }
 }

 int marker = data.indexOf(" st:");
 if (marker < 0) {
  marker = data.indexOf("\tst:");
 }
 if (marker < 0) {
  return null;
 }

 // marker points at the leading whitespace; the value starts 4 characters later
 final int nextSeparator = indexOfSpaceOrTab(data, marker + 1);
 final int valueEnd = (nextSeparator < 0) ? data.length() : nextSeparator;
 return data.substring(marker + 4, valueEnd);
}

/**
 * Looks up the given character slice by delegating to {@code lookup} over {@code suffixes}.
 *
 * @param word character array holding the text
 * @param offset first character of the slice inside {@code word}
 * @param length number of characters in the slice
 * @return whatever {@code lookup} yields for the given slice
 */
IntsRef lookupSuffix(char[] word, int offset, int length) {
 final IntsRef match = lookup(suffixes, word, offset, length);
 return match;
}

Most used classes

HunspellStemFilter
TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word…
Dictionary
In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary.
Dictionary$DoubleASCIIFlagParsingStrategy
Implementation of FlagParsingStrategy that assumes each flag is encoded as two ASCII characters whose…
Dictionary$FlagParsingStrategy
Abstraction of the process of parsing flags taken from the affix and dic files
Dictionary$NumFlagParsingStrategy
Implementation of FlagParsingStrategy that assumes each flag is encoded in its numerical form. In the…

How to use org.apache.lucene.analysis.hunspell

Best Java code snippets using org.apache.lucene.analysis.hunspell (Showing top 20 results out of 315)