/**
 * Builds a {@link CharArraySet} from a line-oriented word list.
 *
 * <p>Each line of the reader should hold exactly one word; the word is added with its
 * leading and trailing whitespace omitted. When the consuming Analyzer uses a
 * LowerCaseFilter (as StandardAnalyzer does), the words need to be lowercase already.
 *
 * <p>The words are collected into a newly allocated set created with
 * {@code INITIAL_CAPACITY} and {@code false} as constructor arguments.
 *
 * @param reader Reader containing the wordlist
 * @return A {@link CharArraySet} with the reader's words
 * @throws IOException if the overload this method delegates to throws one
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
  final CharArraySet words = new CharArraySet(INITIAL_CAPACITY, false);
  return getWordSet(reader, words);
}
/**
 * Builds a {@link CharArraySet} from a line-oriented word list, skipping comment lines.
 *
 * <p>Each non-comment line of the reader should hold exactly one word; the word is added
 * with its leading and trailing whitespace omitted. When the consuming Analyzer uses a
 * LowerCaseFilter (as StandardAnalyzer does), the words need to be lowercase already.
 *
 * <p>The words are collected into a newly allocated set created with
 * {@code INITIAL_CAPACITY} and {@code false} as constructor arguments.
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A CharArraySet with the reader's words
 * @throws IOException if the overload this method delegates to throws one
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
  final CharArraySet words = new CharArraySet(INITIAL_CAPACITY, false);
  return getWordSet(reader, comment, words);
}
/**
 * Creates a CharArraySet from a path.
 *
 * <p>The file is opened as UTF-8 text and handed to
 * {@code WordlistLoader.getWordSet(Reader)}. The reader is managed by try-with-resources,
 * so it is closed on every exit path, and a failure while closing is recorded as a
 * suppressed exception instead of replacing an exception thrown while loading.
 *
 * @param stopwords
 *          the stopwords file to load
 * @return a CharArraySet containing the distinct stopwords from the given
 *         file
 * @throws IOException
 *           if opening the file or loading the stopwords throws an {@link IOException}
 */
protected static CharArraySet loadStopwordSet(Path stopwords) throws IOException {
  try (Reader reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8)) {
    return WordlistLoader.getWordSet(reader);
  }
}
/**
 * Creates a CharArraySet from a reader.
 *
 * <p>The given reader is always closed by this method, whether or not loading succeeds.
 * It is managed by try-with-resources, so a failure while closing is recorded as a
 * suppressed exception instead of replacing an exception thrown while loading.
 *
 * @param stopwords
 *          the stopwords reader to load; closed before this method returns
 * @return a CharArraySet containing the distinct stopwords from the given
 *         reader
 * @throws IOException
 *           if loading the stopwords throws an {@link IOException}
 */
protected static CharArraySet loadStopwordSet(Reader stopwords) throws IOException {
  // Bind the parameter to a resource variable so the Java 7+ form of
  // try-with-resources can close it.
  try (Reader in = stopwords) {
    return WordlistLoader.getWordSet(in);
  }
}
}
try { reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8); return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase)); } finally { IOUtils.close(reader);
/**
 * Builds an analyzer whose stop set is loaded from the given file.
 *
 * @param stopwords file handed to {@code WordlistLoader.getWordSet(File)}
 * @throws IOException if loading the word list throws one
 * @see WordlistLoader#getWordSet(File)
 */
public StandardAnalyzer(File stopwords) throws IOException {
  this.stopSet = WordlistLoader.getWordSet(stopwords);
}
/**
 * Builds an analyzer whose stop set is loaded from the given reader.
 *
 * @param stopwords reader handed to {@code WordlistLoader.getWordSet(Reader)}
 * @throws IOException if loading the word list throws one
 * @see WordlistLoader#getWordSet(Reader)
 */
public StandardAnalyzer(Reader stopwords) throws IOException {
  this.stopSet = WordlistLoader.getWordSet(stopwords);
}
/**
 * Builds an analyzer whose stop set is loaded from the given reader.
 *
 * @param stopwords reader handed to {@code WordlistLoader.getWordSet(Reader)}
 * @throws IOException if loading the word list throws one
 * @see WordlistLoader#getWordSet(Reader)
 */
public StandardAnalyzer(Reader stopwords) throws IOException {
  this.stopSet = WordlistLoader.getWordSet(stopwords);
}
/**
 * Builds an analyzer whose stop words are loaded from the given reader.
 *
 * @param stopwords reader handed to {@code WordlistLoader.getWordSet(Reader)}
 * @throws IOException if loading the word list throws one
 * @see WordlistLoader#getWordSet(Reader)
 */
public StopAnalyzer(Reader stopwords) throws IOException {
  this.stopWords = WordlistLoader.getWordSet(stopwords);
}
/**
 * Reloads {@code keywordSet} from the file at {@code keywordPath} (read as UTF-8) and then
 * records that file's last-modified time, in milliseconds, in {@code lastModifed}.
 *
 * <p>The keyword set is assigned before the timestamp is read, so a failure while reading
 * the timestamp leaves the freshly loaded set in place.
 *
 * @throws IllegalArgumentException wrapping any exception raised while reading the file
 *         or its modification time
 */
private void loadKeywordSet() {
  try (BufferedReader in = Files.newBufferedReader(keywordPath, Charset.forName("UTF-8"))) {
    keywordSet = WordlistLoader.getWordSet(in);
    final long modifiedMillis = Files.getLastModifiedTime(keywordPath).toMillis();
    lastModifed = modifiedMillis;
  } catch (Exception cause) {
    throw new IllegalArgumentException("Failed to read " + keywordPath, cause);
  }
}
/**
 * Builds an analyzer with the stop words read from the given file, decoded as UTF-8.
 * Lines can be commented out using {@link #STOPWORDS_COMMENT}.
 *
 * @param matchVersion version passed both to the word-list loader and to the delegated
 *        constructor
 * @param stopwords file containing the stop words
 * @throws IOException if opening or reading the file throws one
 * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
 */
@Deprecated
public ArabicAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8),
          STOPWORDS_COMMENT,
          matchVersion));
}
/**
 * Builds an analyzer with the stop words read from the given file, decoded as UTF-8.
 * Lines can be commented out using {@link #STOPWORDS_COMMENT}.
 *
 * @param matchVersion version passed both to the word-list loader and to the delegated
 *        constructor
 * @param stopwords file containing the stop words
 * @throws IOException if opening or reading the file throws one
 * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
 */
@Deprecated
public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8),
          STOPWORDS_COMMENT,
          matchVersion));
}
/**
 * Builds a {@link CharArraySet} from a line-oriented word list, skipping comment lines.
 *
 * <p>Each non-comment line of the reader should hold exactly one word; the word is added
 * with its leading and trailing whitespace omitted. When the consuming Analyzer uses a
 * LowerCaseFilter (as StandardAnalyzer does), the words need to be lowercase already.
 *
 * <p>The words are collected into a newly allocated set created with
 * {@code INITIAL_CAPACITY} and {@code false} as constructor arguments.
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A CharArraySet with the reader's words
 * @throws IOException if the overload this method delegates to throws one
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
  final CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
  return getWordSet(reader, comment, target);
}
/**
 * Builds an analyzer with the stop words read from the given file, decoded as UTF-8.
 *
 * @param matchVersion version passed both to the word-list loader and to the delegated
 *        constructor
 * @param stopwords file containing the stop words
 * @throws IOException if opening or reading the file throws one
 * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
 */
@Deprecated
public FrenchAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8),
          matchVersion));
}
/**
 * Builds an analyzer with the stop words read from the given file, decoded as UTF-8.
 *
 * @param matchVersion version passed both to the word-list loader and to the delegated
 *        constructor
 * @param stopwords file containing the stop words
 * @throws IOException if opening or reading the file throws one
 * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 */
@Deprecated
public BrazilianAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8),
          matchVersion));
}
/**
 * Builds an analyzer with the stop words read from the given file, decoded as UTF-8.
 *
 * @param matchVersion version passed both to the word-list loader and to the delegated
 *        constructor
 * @param stopwords file containing the stop words
 * @throws IOException if opening or reading the file throws one
 * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 */
@Deprecated
public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8),
          matchVersion));
}
/** * Builds an exclusionlist from the words contained in the given file. * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead */ @Deprecated public void setStemExclusionTable( File exclusionlist ) throws IOException { excltable = WordlistLoader.getWordSet( IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion); setPreviousTokenStream(null); // force a new stemmer to be created }
/** * Builds an exclusionlist from the words contained in the given file. * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead */ @Deprecated public void setStemExclusionTable(File exclusionlist) throws IOException { exclusionSet = WordlistLoader.getWordSet(IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion); setPreviousTokenStream(null); // force a new stemmer to be created }
/** * Builds an exclusionlist from the words contained in the given file. * @throws IOException * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead */ @Deprecated public void setStemExclusionTable(File exclusionlist) throws IOException { excltable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion); setPreviousTokenStream(null); // force a new stemmer to be created }
static CharArraySet loadDefaultStopWordSet() throws IOException { // make sure it is unmodifiable as we expose it in the outer class return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT)); } }