return new Dictionary(tmp, "hunspell", affixStream, dicStreams, ignoreCase);
parseAlias(line); } else if (line.startsWith(MORPH_ALIAS_KEY)) { parseMorphAlias(line); } else if (line.startsWith(PREFIX_KEY)) { parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); } else if (line.startsWith(SUFFIX_KEY)) { parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); } else if (line.startsWith(FLAG_KEY)) { flagParsingStrategy = getFlagParsingStrategy(line); } else if (line.equals(COMPLEXPREFIXES_KEY)) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix FST<CharsRef> res = parseConversions(reader, num); if (type.equals("ICONV")) { iconv = res; this.prefixes = affixFST(prefixes); this.suffixes = affixFST(suffixes);
ch = caseFold(ch); applyMappings(iconv, reuse); } catch (IOException bogus) { throw new RuntimeException(bogus); reuse.setCharAt(i, caseFold(reuse.charAt(i)));
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) { List<CharsRef> stems = new ArrayList<>(); IntsRef forms = dictionary.lookupWord(word, 0, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) { dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch); char wordFlags[] = Dictionary.decodeFlags(scratch); if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) { continue; if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) { continue; if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) { continue;
int exceptionID = forms.ints[forms.offset + formID + 1]; if (exceptionID > 0) { exception = dictionary.getStemException(exceptionID); } else { exception = null; Dictionary.applyMappings(dictionary.oconv, scratchSegment); } catch (IOException bogus) { throw new RuntimeException(bogus);
continue; line = unescapeEntry(line); hasStemExceptions = parseStemException(line.substring(morphStart+1)) != null; CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); String stemException = parseStemException(line.substring(end+1)); if (stemException != null) { stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount+1); throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart)); condition = escapeDash(condition); encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString();
/**
 * Opens the configured affix file and every comma-separated entry of {@code dictionaryFiles}
 * through the given loader and builds {@code this.dictionary} from them.
 *
 * <p>The dictionary is built against a temporary directory created under
 * {@code Dictionary.getDefaultTempDir()}; {@code IOUtils.rm} is called on that path once
 * construction finishes or fails. All opened streams are closed before this method returns.
 *
 * @param loader resource loader used to open the affix and dictionary files
 * @throws IOException if a resource cannot be opened, or wrapping the {@code ParseException}
 *     raised while building the dictionary
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  String[] dicts = dictionaryFiles.split(",");

  InputStream affix = null;
  List<InputStream> dictionaries = new ArrayList<>(dicts.length);

  try {
    for (String file : dicts) {
      dictionaries.add(loader.openResource(file));
    }
    affix = loader.openResource(affixFile);

    Path tempPath = Files.createTempDirectory(Dictionary.getDefaultTempDir(), "Hunspell");
    try (Directory tempDir = FSDirectory.open(tempPath)) {
      this.dictionary = new Dictionary(tempDir, "hunspell", affix, dictionaries, ignoreCase);
    } finally {
      IOUtils.rm(tempPath);
    }
  } catch (ParseException e) {
    // Report the configured file names: concatenating the stream list would only print
    // object identities, which are useless for diagnosing a bad dictionary.
    throw new IOException(
        "Unable to load hunspell data! [dictionary=" + dictionaryFiles + ",affix=" + affixFile + "]",
        e);
  } finally {
    // Streams opened before a failure are still closed; affix may be null here.
    IOUtils.closeWhileHandlingException(affix);
    IOUtils.closeWhileHandlingException(dictionaries);
  }
}
char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, false); } else { char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix); } else {
/**
 * Fills {@code titleBuffer} with the title-case candidate of {@code word}: the first
 * {@code length} characters are copied, then every character after the first is replaced by
 * the result of {@code dictionary.caseFold}. The first character is left as given.
 */
private void caseFoldTitle(char[] word, int length) {
  titleBuffer = ArrayUtil.grow(titleBuffer, length);
  System.arraycopy(word, 0, titleBuffer, 0, length);

  // Position 0 keeps its original case; only the tail is folded.
  int pos = 1;
  while (pos < length) {
    char current = titleBuffer[pos];
    titleBuffer[pos] = dictionary.caseFold(current);
    pos++;
  }
}
scratchSegment.setLength(0); scratchSegment.append(word, 0, length); CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment); scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length()); length = segment.length();
continue; line = unescapeEntry(line); hasStemExceptions = parseStemException(line.substring(morphStart+1)) != null; CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); String stemException = parseStemException(line.substring(end+1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) {
IntsRef forms = dictionary.lookupWord(strippedWord, 0, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch); char wordFlags[] = Dictionary.decodeFlags(scratch); if (Dictionary.hasFlag(wordFlags, flag)) { if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char)prefixFlag)) { char appendFlags[] = Dictionary.decodeFlags(scratch); if (!hasCrossCheckedFlag((char)prefixFlag, appendFlags, false)) { continue; char appendFlags[] = Dictionary.decodeFlags(scratch); boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix); if (circumfix != suffixCircumfix) { continue; if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) { continue; if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) { continue; char appendFlags[] = Dictionary.decodeFlags(scratch); circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
flagPart = getAliasValue(Integer.parseInt(flagPart)); condition = escapeDash(condition); encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString();
int exceptionID = forms.ints[forms.offset + formID + 1]; if (exceptionID > 0) { exception = dictionary.getStemException(exceptionID); } else { exception = null; Dictionary.applyMappings(dictionary.oconv, scratchSegment); } catch (IOException bogus) { throw new RuntimeException(bogus);
char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, false); } else { char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix); } else {
/**
 * Fills {@code lowerBuffer} with the lowercase candidate of a title-cased {@code word}: the
 * first {@code length} characters are copied and only the leading character is passed through
 * {@code dictionary.caseFold}.
 */
private void caseFoldLower(char[] word, int length) {
  lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
  System.arraycopy(word, 0, lowerBuffer, 0, length);

  // Only the first character differs between the title-cased and lowercase variants.
  final char leading = lowerBuffer[0];
  lowerBuffer[0] = dictionary.caseFold(leading);
}
scratchSegment.setLength(0); scratchSegment.append(word, 0, length); CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment); scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length()); length = segment.length();
/**
 * Builds the Hunspell dictionary for the given language from its registered affix and
 * dictionary files.
 *
 * @param language language whose dictionary should be loaded
 * @return the loaded dictionary, or {@code null} if either file is not registered or does not
 *     exist, or if loading fails (the failure is logged)
 */
private static Dictionary initDict(Language language) {
    populateInstalledDicts();

    File affixFile;
    File dictionaryFile;
    // The file maps are read while holding the class lock.
    synchronized (HunspellTokenizer.class) {
        affixFile = affixFiles.get(language);
        dictionaryFile = dictionaryFiles.get(language);
    }

    if (affixFile == null || dictionaryFile == null
            || !affixFile.exists() || !dictionaryFile.exists()) {
        Log.logErrorRB("HUNSPELL_TOKENIZER_DICT_NOT_INSTALLED", language.getLocale());
        return null;
    }

    // Close both streams once the dictionary is built; previously they were opened inline and
    // never closed, leaking a file handle per stream (and the first one whenever opening the
    // second failed).
    // NOTE(review): assumes Dictionary consumes both streams inside its constructor and leaves
    // closing to the caller, as the Lucene Hunspell Dictionary API documents - confirm for the
    // Dictionary class used here.
    try (FileInputStream affixStream = new FileInputStream(affixFile);
            FileInputStream dictionaryStream = new FileInputStream(dictionaryFile)) {
        return new Dictionary(affixStream, dictionaryStream);
    } catch (Throwable t) {
        // Best effort: any failure is logged and reported to the caller as "no dictionary".
        Log.log(t);
        return null;
    }
}
parseAlias(line); } else if (line.startsWith(MORPH_ALIAS_KEY)) { parseMorphAlias(line); } else if (line.startsWith(PREFIX_KEY)) { parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); } else if (line.startsWith(SUFFIX_KEY)) { parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); } else if (line.startsWith(FLAG_KEY)) { flagParsingStrategy = getFlagParsingStrategy(line); } else if (line.equals(COMPLEXPREFIXES_KEY)) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix FST<CharsRef> res = parseConversions(reader, num); if (type.equals("ICONV")) { iconv = res; this.prefixes = affixFST(prefixes); this.suffixes = affixFST(suffixes);