congrats Icon
New! Announcing Tabnine Chat Beta
Learn More
Tabnine Logo
org.apache.lucene.analysis.hunspell
Code IndexAdd Tabnine to your IDE (free)

How to use org.apache.lucene.analysis.hunspell

Best Java code snippets using org.apache.lucene.analysis.hunspell (Showing top 20 results out of 315)

origin: org.apache.lucene/lucene-analyzers-common

 @Override
 public TokenStream create(TokenStream tokenStream) {
  // Wrap the incoming stream in a Hunspell stemmer; dedup is hard-wired on here.
  TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
  return stemmed;
 }
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Looks up Hunspell word forms from the dictionary.
 *
 * @param word buffer containing the word characters
 * @param offset start of the word in the buffer
 * @param length number of characters in the word
 * @return forms recorded for the word (callers treat a null result as "not in dictionary")
 */
IntsRef lookupWord(char word[], int offset, int length) {
 IntsRef forms = lookup(words, word, offset, length);
 return forms;
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
 * Dictionary
 *
 * @param input TokenStream whose tokens will be stemmed
 * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
 * @param dedup true if duplicate stems for a token should be collapsed; ignored (treated as false) when
 *        {@code longestOnly} is set, since only one stem is emitted in that case
 * @param longestOnly true if only the longest term should be output.
 */
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup,  boolean longestOnly) {
 super(input);
 this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
 this.stemmer = new Stemmer(dictionary);
 this.longestOnly = longestOnly;
}
origin: org.apache.lucene/lucene-analyzers-common

// NOTE(review): this snippet is truncated by the page scraper — several closing
// braces and the opening `try` matching the `catch` below are missing. Do not
// copy it verbatim; consult the original Lucene source.
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
 List<CharsRef> stems = new ArrayList<>();
 IntsRef forms = dictionary.lookupWord(word, 0, length);
 if (forms != null) {
  // forms holds formStep-sized records; each record's first int indexes the flag data
  for (int i = 0; i < forms.length; i += formStep) {
   if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
    dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
    char wordFlags[] = Dictionary.decodeFlags(scratch);
    // skip entries whose flags exclude them from plain (non-compound, given-case) stemming
    if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
     continue;
    if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
     continue;
    if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
     continue;
   stems.add(newStem(word, length, forms, i));
  boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
 } catch (IOException bogus) {
  // presumably from FST/flag lookups; surfaced as unchecked — TODO confirm against original
  throw new RuntimeException(bogus);
origin: org.apache.lucene/lucene-analyzers-common

 // NOTE(review): fragment only — the enclosing method signature and final closing
 // braces were cut off by the scraper; indentation is inconsistent for that reason.
 scratchSegment.setLength(0);
 scratchSegment.append(word, 0, length);
 CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
 scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
 length = segment.length();
// stem the word itself plus case-folded variants, depending on its case pattern
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
 // ALL-CAPS input: also try Title-case and lower-case variants
 caseFoldTitle(word, length);
 caseFoldLower(titleBuffer, length);
 List<CharsRef> list = doStem(word, length, false);
 list.addAll(doStem(titleBuffer, length, true));
 list.addAll(doStem(lowerBuffer, length, true));
 return list;
} else if (caseType == TITLE_CASE) {
 // Title-case input: also try the lower-case variant
 caseFoldLower(word, length);
 List<CharsRef> list = doStem(word, length, false);
 list.addAll(doStem(lowerBuffer, length, true));
 return list;
} else {
 return doStem(word, length, false);
origin: org.apache.lucene/lucene-analyzers-common

// NOTE(review): scraper-scrambled excerpt of the token-stream advance logic —
// the line assigning `buffer` appears out of place and braces are unbalanced.
// Refer to the original HunspellStemFilter.incrementToken().
if (buffer != null && !buffer.isEmpty()) {
 // emit a buffered stem at the same position as the original token
 CharsRef nextStem = buffer.remove(0);
 restoreState(savedState);
 posIncAtt.setPositionIncrement(0);
 termAtt.setEmpty().append(nextStem);
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
} else {
 if (!buffer.isEmpty()) {
  // remember the current token state so remaining stems can be replayed
  savedState = captureState();
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Creates a new HunspellStemFilterFactory from the factory argument map.
 * Recognized keys: dictionary (required), affix, ignoreCase, longestOnly.
 *
 * @param args factory arguments; consumed keys are removed
 * @throws IllegalArgumentException if unrecognized keys remain
 */
public HunspellStemFilterFactory(Map<String,String> args) {
 super(args);
 dictionaryFiles = require(args, PARAM_DICTIONARY);
 affixFile = get(args, PARAM_AFFIX);
 ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
 longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
 // Legacy knobs: consume them so they don't trip the unknown-parameter check
 // below, but their values are deliberately ignored for back compat.
 getBoolean(args, "strictAffixParsing", true); // unnecessary: all dictionaries load properly now
 getInt(args, "recursionCap", 0);              // unnecessary: multi-stage stripping is fixed; data flags (e.g. COMPLEXPREFIXES) control this
 if (args.isEmpty() == false) {
  throw new IllegalArgumentException("Unknown parameters: " + args);
 }
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file.
 *
 * @param flagLine Line containing the flag information (e.g. {@code "FLAG long"})
 * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
 * @throws IllegalArgumentException if the line is malformed or names an unknown flag type
 */
static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
 String tokens[] = flagLine.split("\\s+");
 if (tokens.length != 2) {
  throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
 }
 String mode = tokens[1];
 if (LONG_FLAG_TYPE.equals(mode)) {
  return new DoubleASCIIFlagParsingStrategy();
 }
 if (NUM_FLAG_TYPE.equals(mode)) {
  return new NumFlagParsingStrategy();
 }
 if (UTF8_FLAG_TYPE.equals(mode)) {
  return new SimpleFlagParsingStrategy();
 }
 throw new IllegalArgumentException("Unknown flag type: " + mode);
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Find the stem(s) of the provided word.
 *
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> stem(String word) {
 char[] chars = word.toCharArray();
 return stem(chars, chars.length);
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Folds the titlecase variant of {@code word} into {@code titleBuffer}:
 * the first character is kept as-is, the rest are case-folded.
 */
private void caseFoldTitle(char word[], int length) {
 titleBuffer = ArrayUtil.grow(titleBuffer, length);
 System.arraycopy(word, 0, titleBuffer, 0, length);
 // position 0 stays untouched — that is what makes the result title-cased
 for (int pos = length - 1; pos >= 1; pos--) {
  titleBuffer[pos] = dictionary.caseFold(titleBuffer[pos]);
 }
}

origin: org.apache.lucene/lucene-analyzers-common

/**
 * Returns the index where the morphological data of a dictionary entry begins:
 * either at a tab, or at a space followed by a two-letter key and ':' (e.g. " st:").
 * Returns the line length when no such boundary exists.
 */
static int morphBoundary(String line) {
 int pos = indexOfSpaceOrTab(line, 0);
 if (pos == -1) {
  return line.length();
 }
 while (pos >= 0 && pos < line.length()) {
  boolean tabSeparator = line.charAt(pos) == '\t';
  boolean morphKey = pos + 3 < line.length()
    && Character.isLetter(line.charAt(pos + 1))
    && Character.isLetter(line.charAt(pos + 2))
    && line.charAt(pos + 3) == ':';
  if (tabSeparator || morphKey) {
   return pos;
  }
  pos = indexOfSpaceOrTab(line, pos + 1);
 }
 // no further space/tab found
 if (pos == -1) {
  return line.length();
 }
 return pos;
}

origin: org.apache.lucene/lucene-analyzers-common

/**
 * Parses the given String into a single flag.
 *
 * @param rawFlag String to parse into a flag
 * @return Parsed flag
 * @throws IllegalArgumentException if the String encodes more (or fewer) than one flag
 */
char parseFlag(String rawFlag) {
 char parsed[] = parseFlags(rawFlag);
 if (parsed.length == 1) {
  return parsed[0];
 }
 throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Normalizes a raw .dic entry: resolves backslash escapes, maps '/' to
 * FLAG_SEPARATOR, inserts a MORPH_SEPARATOR before the morphological data,
 * and strips any raw separator characters embedded in the entry.
 */
String unescapeEntry(String entry) {
 StringBuilder out = new StringBuilder();
 int boundary = morphBoundary(entry);
 int i = 0;
 while (i < boundary) {
  char ch = entry.charAt(i);
  if (ch == '\\' && i + 1 < entry.length()) {
   // escaped character: emit it literally and skip the backslash
   out.append(entry.charAt(i + 1));
   i += 2;
   continue;
  }
  if (ch == '/') {
   out.append(FLAG_SEPARATOR);
  } else if (ch != MORPH_SEPARATOR && ch != FLAG_SEPARATOR) {
   out.append(ch);
  }
  // raw separator chars are dropped: some dictionaries (e.g. Zulu)
  // embed binary junk in their entries
  i++;
 }
 out.append(MORPH_SEPARATOR);
 for (int j = boundary; j < entry.length(); j++) {
  char c = entry.charAt(j);
  if (c != FLAG_SEPARATOR && c != MORPH_SEPARATOR) {
   out.append(c);
  }
 }
 return out.toString();
}

origin: org.apache.lucene/lucene-analyzers-common

/**
 * Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
 * MICROSOFT-CP1251 etc are allowed...
 *
 * @param encoding Encoding to retrieve the CharsetDecoder for
 * @return CharSetDecoder for the given encoding
 */
private CharsetDecoder getJavaEncoding(String encoding) {
 if ("ISO8859-14".equals(encoding)) {
  // the JDK has no decoder for this charset; use our own
  return new ISO8859_14Decoder();
 }
 String alias = CHARSET_ALIASES.get(encoding);
 String name = (alias == null) ? encoding : alias;
 return Charset.forName(name).newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
}
origin: org.elasticsearch/elasticsearch

@Override
public TokenStream create(TokenStream tokenStream) {
  // delegate to the Lucene filter using the factory-configured options
  TokenStream result = new HunspellStemFilter(tokenStream, dictionary, dedup, longestOnly);
  return result;
}
origin: org.apache.lucene/lucene-analyzers-common

/** Looks up prefix affixes applicable to the given characters. */
IntsRef lookupPrefix(char word[], int offset, int length) {
 IntsRef hit = lookup(prefixes, word, offset, length);
 return hit;
}
origin: org.apache.lucene/lucene-analyzers-common

/**
 * Find the unique stem(s) of the provided word.
 *
 * @param word Word to find the stems for
 * @return List of stems for the word, with duplicates removed
 *         (respecting the dictionary's case sensitivity)
 */
public List<CharsRef> uniqueStems(char word[], int length) {
 List<CharsRef> all = stem(word, length);
 if (all.size() < 2) {
  return all; // zero or one stem: nothing to dedup
 }
 List<CharsRef> unique = new ArrayList<>();
 CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
 for (CharsRef candidate : all) {
  if (seen.contains(candidate)) {
   continue;
  }
  seen.add(candidate);
  unique.add(candidate);
 }
 return unique;
}

origin: org.apache.lucene/lucene-analyzers-common

/**
 * Folds the lowercase variant of {@code word} (assumed title-cased)
 * into {@code lowerBuffer}: only the first character needs folding.
 */
private void caseFoldLower(char word[], int length) {
 lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
 System.arraycopy(word, 0, lowerBuffer, 0, length);
 char first = lowerBuffer[0];
 lowerBuffer[0] = dictionary.caseFold(first);
}

origin: org.apache.lucene/lucene-analyzers-common

/**
 * Extracts the stem exception ("st:" value) from a morphological data string,
 * first resolving a numeric morph-alias reference if aliases are in use.
 *
 * @param morphData raw morph data, or a 1-based alias index
 * @return the stem exception, or null if none is present
 */
private String parseStemException(String morphData) {
 // a purely numeric entry is a 1-based index into morphAliases
 if (morphAliasCount > 0) {
  try {
   int alias = Integer.parseInt(morphData.trim());
   morphData = morphAliases[alias - 1];
  } catch (NumberFormatException notAnAlias) {
   // not numeric: treat as a literal morph entry
  }
 }
 // locate a " st:" or "\tst:" key and return its value up to the next space/tab
 int start = morphData.indexOf(" st:");
 if (start < 0) {
  start = morphData.indexOf("\tst:");
 }
 if (start < 0) {
  return null;
 }
 int stop = indexOfSpaceOrTab(morphData, start + 1);
 if (stop < 0) {
  stop = morphData.length();
 }
 return morphData.substring(start + 4, stop);
}
origin: org.apache.lucene/lucene-analyzers-common

/** Looks up suffix affixes applicable to the given characters. */
IntsRef lookupSuffix(char word[], int offset, int length) {
 IntsRef hit = lookup(suffixes, word, offset, length);
 return hit;
}

org.apache.lucene.analysis.hunspell

Most used classes

  • HunspellStemFilter
    TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word
  • Dictionary
    In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary.
  • Dictionary$DoubleASCIIFlagParsingStrategy
    Implementation of FlagParsingStrategy that assumes each flag is encoded as two ASCII characters whose
  • Dictionary$FlagParsingStrategy
    Abstraction of the process of parsing flags taken from the affix and dic files
  • Dictionary$NumFlagParsingStrategy
    Implementation of FlagParsingStrategy that assumes each flag is encoded in its numerical form. In th
  • HunspellDictionary,
  • HunspellStemFilterFactory,
  • ISO8859_14Decoder,
  • Stemmer,
  • HunspellAffix,
  • HunspellDictionary$DoubleASCIIFlagParsingStrategy,
  • HunspellDictionary$FlagParsingStrategy,
  • HunspellDictionary$NumFlagParsingStrategy,
  • HunspellDictionary$SimpleFlagParsingStrategy,
  • HunspellStemmer$Stem,
  • HunspellStemmer,
  • HunspellWord
Tabnine Logo
  • Products

    Search for Java codeSearch for JavaScript code
  • IDE Plugins

    IntelliJ IDEAWebStormVisual StudioAndroid StudioEclipseVisual Studio CodePyCharmSublime TextPhpStormVimGoLandRubyMineEmacsJupyter NotebookJupyter LabRiderDataGripAppCode
  • Company

    About UsContact UsCareers
  • Resources

    FAQBlogTabnine AcademyTerms of usePrivacy policyJava Code IndexJavascript Code Index
Get Tabnine for your IDE now