/**
 * Opens a Reader for the given resource using a {@link CharsetDecoder}.
 * Unlike Java's defaults this reader will throw an exception if it detects
 * the read charset doesn't match the expected {@link Charset}.
 * <p>
 * Decoding readers are useful to load configuration files, stopword lists or synonym files
 * to detect character set problems. However, it's not recommended to use as a common purpose
 * reader.
 *
 * @param clazz the class used to locate the resource
 * @param resource the resource name to load
 * @param charSet the expected charset
 * @return a reader to read the given file
 * @throws IOException if the resource cannot be opened or decoded
 */
public static Reader getDecodingReader(Class<?> clazz, String resource, Charset charSet) throws IOException {
  InputStream stream = null;
  boolean success = false;
  try {
    stream = clazz.getResourceAsStream(resource);
    final Reader reader = getDecodingReader(stream, charSet);
    success = true;
    return reader;
  } finally {
    // Only close the stream if wrapping it in a reader failed; on success the
    // caller owns the returned reader (closing it closes the stream).
    if (!success) {
      IOUtils.close(stream);
    }
  }
}
boolean success = false; try { input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
Reader reader = null; try { reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8); return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase)); } finally {
/**
 * Creates a dictionary based on an inputstream.
 * <p>
 * NOTE: content is treated as UTF-8
 *
 * @param dictFile stream containing the dictionary words; ownership passes to
 *                 this dictionary (read via the buffered decoding reader)
 */
public PlainTextDictionary(InputStream dictFile) {
  in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
}
/**
 * Creates a dictionary based on an inputstream.
 * Using <code>fieldDelimiter</code> to separate out the
 * fields in a line.
 * <p>
 * NOTE: content is treated as UTF-8
 *
 * @param dictFile stream containing the dictionary entries
 * @param fieldDelimiter delimiter used to split each line into fields
 */
public FileDictionary(InputStream dictFile, String fieldDelimiter) {
  in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
  this.fieldDelimiter = fieldDelimiter;
}
/**
 * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords a UTF-8 encoded file with one stopword per line
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
 */
@Deprecated
public ArabicAnalyzer( Version matchVersion, File stopwords ) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), STOPWORDS_COMMENT, matchVersion));
}
/**
 * Builds an analyzer with the given stop words. Lines can be commented out
 * using {@link #STOPWORDS_COMMENT}
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords a UTF-8 encoded file with one stopword per line
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
 */
@Deprecated
public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), STOPWORDS_COMMENT, matchVersion));
}
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords a UTF-8 encoded file with one stopword per line
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
 */
@Deprecated
public FrenchAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
}
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords a UTF-8 encoded file with one stopword per line
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 */
@Deprecated
public BrazilianAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
}
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords a UTF-8 encoded file with one stopword per line
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 */
@Deprecated
public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
}
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion Lucene version to match See
 *        <a href="#version">above</a>
 * @param stopwords a file containing stopwords (UTF-8, one per line)
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 */
@Deprecated
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
  this(matchVersion, (Set<?>)WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
}
/**
 * Reads break-iterator rules from the named resource (UTF-8), skipping lines
 * that start with {@code #}, and compiles them into a
 * {@code RuleBasedBreakIterator}.
 *
 * @param filename name of the rules resource
 * @param loader resource loader used to open the rules stream
 * @return a break iterator built from the concatenated rule lines
 * @throws IOException if the resource cannot be opened or read
 */
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
  StringBuilder rules = new StringBuilder();
  InputStream rulesStream = loader.openResource(filename);
  // try-with-resources: the original called reader.close() after the loop,
  // leaking the stream whenever readLine() threw mid-read.
  try (BufferedReader reader = new BufferedReader(
      IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      if (!line.startsWith("#")) {
        rules.append(line);
      }
      // a newline is appended even for comment lines, matching the original
      rules.append('\n');
    }
  }
  return new RuleBasedBreakIterator(rules.toString());
}
/**
 * Builds an exclusionlist from the words contained in the given file.
 *
 * @param exclusionlist a UTF-8 encoded file with one exclusion word per line
 * @throws IOException if the file cannot be read
 * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 */
@Deprecated
public void setStemExclusionTable( File exclusionlist ) throws IOException {
  excltable = WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion);
  setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
 * Builds an exclusionlist from the words contained in the given file.
 *
 * @param exclusionlist a UTF-8 encoded file with one exclusion word per line
 * @throws IOException if the file cannot be read
 * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 */
@Deprecated
public void setStemExclusionTable(File exclusionlist) throws IOException {
  exclusionSet = WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion);
  setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
 * Builds an exclusionlist from the words contained in the given file.
 *
 * @param exclusionlist a UTF-8 encoded file with one exclusion word per line
 * @throws IOException if the file cannot be read
 * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
 */
@Deprecated
public void setStemExclusionTable(File exclusionlist) throws IOException {
  excltable = WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion);
  setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
 * Builds an exclusionlist from the words contained in the given file.
 *
 * @param exclusionlist a UTF-8 encoded file with one exclusion word per line
 * @throws RuntimeException wrapping any {@link IOException} while reading the
 *         file (the deprecated signature declares no checked exception)
 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
 */
@Deprecated
public void setStemExclusionTable(File exclusionlist) {
  try {
    excltable = WordlistLoader.getWordSet(
        IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion);
    setPreviousTokenStream(null); // force a new stemmer to be created
  } catch (IOException e) {
    // TODO: throw IOException — wrapped unchecked to preserve the signature
    throw new RuntimeException(e);
  }
}
/**
 * Loads the default stopword set from {@code DEFAULT_STOPWORD_FILE} (UTF-8,
 * comment lines marked by {@code STOPWORD_FILE_COMMENT}).
 *
 * @return an unmodifiable stopword set
 * @throws IOException if the bundled resource cannot be read
 */
static CharArraySet loadDefaultStopWordSet() throws IOException {
  // make sure it is unmodifiable as we expose it in the outer class
  // NOTE(review): the decoding reader is not closed here — presumably
  // WordlistLoader.getWordSet closes it; confirm, else it leaks on error.
  return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils
      .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
      STOPWORD_FILE_COMMENT));
}
}
/**
 * Loads the default stopword set from {@code DEFAULT_STOPWORD_FILE} (UTF-8,
 * comment lines marked by {@code STOPWORD_FILE_COMMENT}).
 *
 * @return an unmodifiable stopword set
 * @throws IOException if the bundled resource cannot be read
 */
static CharArraySet loadDefaultStopWordSet() throws IOException {
  // make sure it is unmodifiable as we expose it in the outer class
  // NOTE(review): the decoding reader is not closed here — presumably
  // WordlistLoader.getWordSet closes it; confirm, else it leaks on error.
  return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(JiebaAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
      STOPWORD_FILE_COMMENT));
}
}
/**
 * Creates the factory, loading the suffix-word set from the optional
 * {@code suffix_words_path} setting (UTF-8 file resolved against the config
 * directory). When the setting is absent an empty set is used.
 *
 * @throws IllegalArgumentException if the configured word file cannot be read
 */
public NumberConcatenationFilterFactory(final IndexSettings indexSettings, final Environment environment, final String name, final Settings settings) {
  super(indexSettings, name, settings);
  final String path = settings.get("suffix_words_path");
  if (path == null) {
    // no file configured: fall back to an empty, case-sensitive set
    suffixWords = new CharArraySet(0, false);
  } else {
    final File wordFile = environment.configFile().resolve(path).toFile();
    try (Reader decoded = IOUtils.getDecodingReader(new FileInputStream(wordFile), StandardCharsets.UTF_8)) {
      suffixWords = WordlistLoader.getWordSet(decoded);
    } catch (final IOException e) {
      throw new IllegalArgumentException("Could not load " + wordFile.getAbsolutePath(), e);
    }
  }
}
/**
 * Rebuilds {@code SPECIAL_TOKENIZATION_CASES} from the given UTF-8 Snowball
 * word list; a {@code null} stream leaves the current radix untouched.
 *
 * @param input word-list stream, or {@code null} to keep the existing cases
 * @return the (possibly replaced) tokenization-cases radix
 * @throws IOException if the stream cannot be read
 */
public DictRadix<Byte> setCustomTokenizationCases(InputStream input) throws IOException {
  if (input != null) {
    final CharArraySet words = WordlistLoader.getSnowballWordSet(
        IOUtils.getDecodingReader(input, StandardCharsets.UTF_8));
    final DictRadix<Byte> radix = new DictRadix<>(false);
    // CharArraySet iterates char[] entries; each becomes one radix node
    for (Object word : words) {
      radix.addNode((char[]) word, dummyData);
    }
    SPECIAL_TOKENIZATION_CASES = radix;
  }
  return SPECIAL_TOKENIZATION_CASES;
}