/**
 * Wraps the given stream in a new {@link HunspellStemFilter}, passing the {@code dictionary} and
 * {@code longestOnly} values in scope. The filter's {@code dedup} constructor argument is always
 * passed as {@code true} here.
 *
 * @param tokenStream stream to wrap
 * @return the newly created filter
 */
@Override public TokenStream create(TokenStream tokenStream) { return new HunspellStemFilter(tokenStream, dictionary, true, longestOnly); } }
/**
 * Looks up the Hunspell word forms recorded in the dictionary for a word.
 *
 * @param word buffer holding the characters to look up
 * @param offset offset into {@code word} handed to {@code lookup}
 * @param length length handed to {@code lookup}
 * @return the result of {@code lookup} against the {@code words} table
 */
IntsRef lookupWord(char[] word, int offset, int length) {
  final IntsRef forms = lookup(words, word, offset, length);
  return forms;
}
/**
 * Creates a new HunspellStemFilter that stems the tokens of the given TokenStream using the
 * affix rules of the provided Dictionary.
 *
 * @param input TokenStream whose tokens will be stemmed
 * @param dictionary Dictionary containing the affix rules and words used to stem the tokens
 * @param dedup requested de-duplication setting; it is stored as {@code false} whenever
 *        {@code longestOnly} is {@code true}
 * @param longestOnly true if only the longest term should be output
 */
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, boolean longestOnly) {
  super(input);
  this.stemmer = new Stemmer(dictionary);
  this.longestOnly = longestOnly;
  // don't waste time deduping if longestOnly is set
  this.dedup = !longestOnly && dedup;
}
/**
 * Collects stems for one spelling of a word.
 *
 * <p>The word is first looked up via {@code dictionary.lookupWord(word, 0, length)}. When forms are
 * found they are visited in steps of {@code formStep}; if any of {@code checkKeepCase},
 * {@code checkNeedAffix} or {@code checkOnlyInCompound} is set, the form's flags are decoded and a
 * form carrying the corresponding {@code keepcase}, {@code needaffix} or {@code onlyincompound}
 * flag is skipped. Remaining forms are added through {@code newStem}. The results of the
 * multi-argument {@code stem(...)} overload are appended as well.
 *
 * @param word buffer holding the word
 * @param length number of characters of {@code word} to use
 * @param caseVariant forwarded unchanged as the last argument of the {@code stem(...)} overload
 * @return the list of stems collected
 * @throws RuntimeException wrapping any {@link IOException} caught here
 */
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) { List<CharsRef> stems = new ArrayList<>(); IntsRef forms = dictionary.lookupWord(word, 0, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) { dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch); char wordFlags[] = Dictionary.decodeFlags(scratch); if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) { continue; if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) { continue; if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) { continue; stems.add(newStem(word, length, forms, i)); boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant)); } catch (IOException bogus) { throw new RuntimeException(bogus);
scratchSegment.setLength(0); scratchSegment.append(word, 0, length); CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment); scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length()); length = segment.length(); int caseType = caseOf(word, length); if (caseType == UPPER_CASE) { caseFoldTitle(word, length); caseFoldLower(titleBuffer, length); List<CharsRef> list = doStem(word, length, false); list.addAll(doStem(titleBuffer, length, true)); list.addAll(doStem(lowerBuffer, length, true)); return list; } else if (caseType == TITLE_CASE) { caseFoldLower(word, length); List<CharsRef> list = doStem(word, length, false); list.addAll(doStem(lowerBuffer, length, true)); return list; } else { return doStem(word, length, false);
if (buffer != null && !buffer.isEmpty()) { CharsRef nextStem = buffer.remove(0); restoreState(savedState); posIncAtt.setPositionIncrement(0); termAtt.setEmpty().append(nextStem); buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); } else { if (!buffer.isEmpty()) { savedState = captureState();
/**
 * Creates a new HunspellStemFilterFactory.
 *
 * @param args factory arguments; the dictionary parameter is required, the affix, ignore-case and
 *        longest-only parameters are optional
 * @throws IllegalArgumentException if {@code args} is not empty once all recognized parameters
 *         have been read
 */
public HunspellStemFilterFactory(Map<String,String> args) {
  super(args);
  dictionaryFiles = require(args, PARAM_DICTIONARY);
  affixFile = get(args, PARAM_AFFIX);
  ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
  longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);

  // Not needed any more: all dictionaries are loaded properly.
  // Still recognized (and ignored) for back compat.
  getBoolean(args, "strictAffixParsing", true);

  // Not needed any more: multi-stage stripping is fixed, and flags such as
  // COMPLEXPREFIXES in the data itself control this.
  // Still recognized (and ignored) for back compat.
  getInt(args, "recursionCap", 0);

  if (args.isEmpty()) {
    return;
  }
  throw new IllegalArgumentException("Unknown parameters: " + args);
}
/**
 * Selects the {@link FlagParsingStrategy} named by the FLAG definition line of the affix file.
 *
 * @param flagLine line containing the flag information; it must split on whitespace into
 *        exactly two parts, the second being the flag type
 * @return FlagParsingStrategy that parses flags the way the FLAG definition specifies
 * @throws IllegalArgumentException if the line does not have exactly two parts, or the flag type
 *         is not one of the known types
 */
static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
  final String[] tokens = flagLine.split("\\s+");
  if (tokens.length != 2) {
    throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
  }

  final String flagType = tokens[1];
  if (NUM_FLAG_TYPE.equals(flagType)) {
    return new NumFlagParsingStrategy();
  }
  if (UTF8_FLAG_TYPE.equals(flagType)) {
    return new SimpleFlagParsingStrategy();
  }
  if (LONG_FLAG_TYPE.equals(flagType)) {
    return new DoubleASCIIFlagParsingStrategy();
  }
  throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
/**
 * Finds the stem(s) of the provided word.
 *
 * <p>Convenience overload: the string is converted to a character array and handed to
 * {@code stem(char[], int)} with its full length.
 *
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> stem(String word) {
  final char[] chars = word.toCharArray();
  return stem(chars, chars.length);
}
/**
 * Writes the titlecase variant of {@code word} into {@code titleBuffer}: the first character is
 * copied unchanged and every following character is passed through {@code dictionary.caseFold}.
 *
 * @param word source characters
 * @param length number of characters of {@code word} to process
 */
private void caseFoldTitle(char[] word, int length) {
  titleBuffer = ArrayUtil.grow(titleBuffer, length);
  System.arraycopy(word, 0, titleBuffer, 0, length);
  int pos = 1;
  while (pos < length) {
    final char copied = titleBuffer[pos];
    titleBuffer[pos] = dictionary.caseFold(copied);
    pos++;
  }
}
/**
 * Finds where the leading part of a line ends, scanning the positions reported by
 * {@code indexOfSpaceOrTab}.
 *
 * <p>A position ends the scan when the character there is a tab, or when it is followed by two
 * letters and a colon. If {@code indexOfSpaceOrTab} reports {@code -1} before such a position is
 * found, the length of the line is returned.
 *
 * @param line line to scan
 * @return index of the boundary, or {@code line.length()} if there is none
 */
static int morphBoundary(String line) {
  final int len = line.length();
  int pos = indexOfSpaceOrTab(line, 0);
  while (pos >= 0 && pos < len) {
    if (line.charAt(pos) == '\t') {
      break;
    }
    final boolean startsField = pos + 3 < len
        && Character.isLetter(line.charAt(pos + 1))
        && Character.isLetter(line.charAt(pos + 2))
        && line.charAt(pos + 3) == ':';
    if (startsField) {
      break;
    }
    pos = indexOfSpaceOrTab(line, pos + 1);
  }
  return pos == -1 ? len : pos;
}
/**
 * Parses the given String into a single flag.
 *
 * @param rawFlag String to parse into a flag
 * @return the one flag produced by {@code parseFlags}
 * @throws IllegalArgumentException if {@code parseFlags} does not yield exactly one flag
 */
char parseFlag(String rawFlag) {
  final char[] parsed = parseFlags(rawFlag);
  if (parsed.length == 1) {
    return parsed[0];
  }
  throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
}
String unescapeEntry(String entry) { StringBuilder sb = new StringBuilder(); int end = morphBoundary(entry); for (int i = 0; i < end; i++) { char ch = entry.charAt(i); if (ch == '\\' && i+1 < entry.length()) { sb.append(entry.charAt(i+1)); i++; } else if (ch == '/') { sb.append(FLAG_SEPARATOR); } else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) { // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!! } else { sb.append(ch); } } sb.append(MORPH_SEPARATOR); if (end < entry.length()) { for (int i = end; i < entry.length(); i++) { char c = entry.charAt(i); if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) { // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!! } else { sb.append(c); } } } return sb.toString(); }
/**
 * Retrieves the CharsetDecoder for the given encoding.
 *
 * <p>{@code "ISO8859-14"} is served by a dedicated decoder. Any other name is first mapped through
 * {@code CHARSET_ALIASES} and then resolved with {@link Charset#forName(String)}; malformed input
 * is replaced rather than reported. Note: this isn't perfect, as names such as ISCII-DEVANAGARI
 * and MICROSOFT-CP1251 may also be allowed.
 *
 * @param encoding Encoding to retrieve the CharsetDecoder for
 * @return CharsetDecoder for the given encoding
 */
private CharsetDecoder getJavaEncoding(String encoding) {
  if ("ISO8859-14".equals(encoding)) {
    return new ISO8859_14Decoder();
  }
  final String alias = CHARSET_ALIASES.get(encoding);
  final String charsetName = (alias == null) ? encoding : alias;
  return Charset.forName(charsetName)
      .newDecoder()
      .onMalformedInput(CodingErrorAction.REPLACE);
}
/**
 * Wraps the given stream in a new {@link HunspellStemFilter} built from the {@code dictionary},
 * {@code dedup} and {@code longestOnly} values in scope.
 *
 * @param tokenStream stream to wrap
 * @return the newly created filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, dedup, longestOnly);
  return stemmed;
}
/**
 * Looks up the given characters in the {@code prefixes} table.
 *
 * @param word buffer holding the characters to look up
 * @param offset offset into {@code word} handed to {@code lookup}
 * @param length length handed to {@code lookup}
 * @return the result of {@code lookup} against the {@code prefixes} table
 */
IntsRef lookupPrefix(char[] word, int offset, int length) {
  final IntsRef matches = lookup(prefixes, word, offset, length);
  return matches;
}
/**
 * Finds the unique stem(s) of the provided word.
 *
 * <p>Stems are produced by {@code stem(char[], int)}; when there are two or more, duplicates are
 * removed while keeping first-occurrence order. Equality follows a {@code CharArraySet} created
 * with {@code dictionary.ignoreCase}.
 *
 * @param word Word to find the stems for
 * @param length number of characters of {@code word} to use
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  final List<CharsRef> all = stem(word, length);
  if (all.size() <= 1) {
    // nothing to de-duplicate
    return all;
  }

  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (CharsRef candidate : all) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
/**
 * Writes the lowercase variant of a title-cased {@code word} into {@code lowerBuffer}: the
 * characters are copied over and only the first one is passed through {@code dictionary.caseFold}.
 *
 * @param word source characters
 * @param length number of characters of {@code word} to copy
 */
private void caseFoldLower(char[] word, int length) {
  lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
  System.arraycopy(word, 0, lowerBuffer, 0, length);
  final char leading = lowerBuffer[0];
  lowerBuffer[0] = dictionary.caseFold(leading);
}
private String parseStemException(String morphData) { // first see if it's an alias if (morphAliasCount > 0) { try { int alias = Integer.parseInt(morphData.trim()); morphData = morphAliases[alias-1]; } catch (NumberFormatException e) { // fine } } // try to parse morph entry int index = morphData.indexOf(" st:"); if (index < 0) { index = morphData.indexOf("\tst:"); } if (index >= 0) { int endIndex = indexOfSpaceOrTab(morphData, index+1); if (endIndex < 0) { endIndex = morphData.length(); } return morphData.substring(index+4, endIndex); } return null; }
/**
 * Looks up the given characters in the {@code suffixes} table.
 *
 * @param word buffer holding the characters to look up
 * @param offset offset into {@code word} handed to {@code lookup}
 * @param length length handed to {@code lookup}
 * @return the result of {@code lookup} against the {@code suffixes} table
 */
IntsRef lookupSuffix(char[] word, int offset, int length) {
  final IntsRef matches = lookup(suffixes, word, offset, length);
  return matches;
}