@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new LowerCaseFilter(src);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) {
            // So that if maxTokenLength was changed, the change takes
            // effect next time tokenStream is called:
            src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
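// Usage sketch (not part of the snippet above): drives the components built by
// createComponents() through Analyzer.tokenStream(). Assumes Lucene 7.x+ on the
// classpath; the field name "body" and the sample text are placeholders.
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzeDemo {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "The Quick Brown Fox")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                 // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                   // records the final offset state
        }
    }
}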
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;
    while (true) {
        int tokenType = scanner.getNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return false;
        }
        if (scanner.yylength() <= maxTokenLength) {
            posIncrAtt.setPositionIncrement(skippedPositions + 1);
            scanner.getText(termAtt);
            final int start = scanner.yychar();
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
            typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
            return true;
        } else {
            // When we skip a too-long term, we still increment the position increment
            skippedPositions++;
        }
    }
}
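// Sketch of the skipping behavior implemented above (assumes the standard
// Lucene attribute classes are imported): with maxTokenLength set to 5,
// "gigantictoken" is dropped and the next token's position increment grows
// to 2 to account for the gap.
StandardTokenizer tok = new StandardTokenizer();
tok.setMaxTokenLength(5);
tok.setReader(new StringReader("tiny gigantictoken end"));
CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = tok.addAttribute(PositionIncrementAttribute.class);
tok.reset();
while (tok.incrementToken()) {
    System.out.println(term + " +" + posIncr.getPositionIncrement());   // tiny +1, end +2
}
tok.end();
tok.close();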
@Override
public Tokenizer create() {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}
}
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Tokenizer tokenStream = new KeywordTokenizer(reader);
    Tokenizer tokenStream = new StandardTokenizer(Version.LUCENE_36, reader);
    TokenStream result = new StandardFilter(Version.LUCENE_36, tokenStream);
    result = new LowerCaseFilter(Version.LUCENE_36, result);
    return result;
}
}
/**
 * Creates {@link TokenStreamComponents} used to tokenize all the text in the
 * provided {@link Reader}.
 *
 * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
 *         filtered with {@link StandardFilter} and {@link GermanNormalizationFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream result = new StandardFilter(source);
    result = new GermanNormalizationFilter(result);
    return new TokenStreamComponents(source, result);
}
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
    return streams.filteredTokenStream;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    // Run the width filter before bigramming; it sometimes combines characters.
    TokenStream result = new CJKWidthFilter(source);
    result = new LowerCaseFilter(result);
    result = new CJKBigramFilter(result);
    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
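// Sketch of what the chain above emits (assumes the enclosing analyzer class
// is instantiated as `analyzer`; the field name "f" is arbitrary):
try (TokenStream ts = analyzer.tokenStream("f", "東京都")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);   // CJKBigramFilter emits 東京, then 京都
    }
    ts.end();
}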
/**
 * Creates a
 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link ElisionFilter}, {@link IrishLowerCaseFilter},
 *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set
 *         is provided, and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
    s.setEnablePositionIncrements(false);
    result = s;
    result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
    result = new IrishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
        result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new IrishStemmer());
    return new TokenStreamComponents(source, result);
}
}
/**
 * @param input raw text to analyze
 * @return a list of lower-case tokens with accents and punctuation stripped
 */
public static List<String> getTokensFromAnalyzer(String input) {
    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    CharTermAttribute charTermAttribute = result.addAttribute(CharTermAttribute.class);
    List<String> termList = new ArrayList<String>();
    try {
        result.reset();   // reset the whole filter chain, not just the tokenizer
        while (result.incrementToken()) {
            termList.add(charTermAttribute.toString());
        }
        result.end();
        result.close();
    } catch (IOException e) {
        LOGGER.debug(e.getMessage(), e);
    }
    return termList;
}
}
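// Hypothetical call to the helper above, illustrating the combined effect of
// LowerCaseFilter and ASCIIFoldingFilter:
List<String> tokens = getTokensFromAnalyzer("Résumé FILES!");
// tokens -> [resume, files]; the punctuation is discarded by StandardTokenizer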
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
final StringReader stringReader = new StringReader(sentence);
final List<Token> tokens = new ArrayList<>();
final StandardTokenizer tokenizer = new StandardTokenizer();
try {
    tokenizer.setReader(stringReader);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        final CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAttribute = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(Token.newBuilder()
                .withTokenTerm(charTermAttribute.toString())
    @Override
    protected void setReader(final Reader reader) {
        // So that if maxTokenLength was changed, the change takes
        // effect next time tokenStream is called:
        src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
        super.setReader(reader);
    }
};
final StandardTokenizer tokenizer = this.tokenizer.get();
try {
    tokenizer.close();
    tokenizer.setReader(stringReader);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        final CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAttribute = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(Token.newBuilder()
                .withTokenTerm(charTermAttribute.toString())
public void reset(Reader reader) throws IOException {
    input = reader;
    reset();
}
    public TokenStream tokenStream(final String arg0, final Reader reader) {
        return new WildcardFilter(
                new LowerCaseFilter(LuceneVersion.get(),
                        new StandardFilter(LuceneVersion.get(),
                                new StandardTokenizer(LuceneVersion.get(), reader))));
    }
};
/** {@inheritDoc} */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    result = new SnowballFilter(result, language);
    return new TokenStreamComponents(source, result);
}
}
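// Standalone sketch of the same chain; the `language` value is assumed to be
// "English" and the stopword set is ad hoc, neither comes from the snippet.
// StandardFilter is omitted here (it is a no-op in recent Lucene versions).
Tokenizer source = new StandardTokenizer();
source.setReader(new StringReader("The runners were running"));
TokenStream result = new LowerCaseFilter(source);
result = new StopFilter(result, new CharArraySet(Arrays.asList("the", "were"), true));
result = new SnowballFilter(result, "English");
CharTermAttribute term = result.addAttribute(CharTermAttribute.class);
result.reset();
while (result.incrementToken()) {
    System.out.println(term);   // runner, run
}
result.end();
result.close();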
@Override
public StandardTokenizer create(AttributeFactory factory) {
    StandardTokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}
}
@Override
protected Tokenizer create(Version version) {
    return new StandardTokenizer();
}
}
    @Override
    protected void setReader(final Reader reader) throws IOException {
        src.setMaxTokenLength(ReaderStandardAnalyzer.this.maxTokenLength);
        super.setReader(reader);
    }
};