/**
 * Creates a {@link PatternTokenizer} that splits the input using the
 * factory's configured pattern and capture group.
 *
 * @param factory the attribute factory to build the tokenizer's attributes with
 * @return a new tokenizer instance over the configured pattern/group
 */
@Override
public PatternTokenizer create(final AttributeFactory factory) {
    // Delegate straight to the tokenizer; pattern and group are instance config.
    final PatternTokenizer tokenizer = new PatternTokenizer(factory, pattern, group);
    return tokenizer;
}
}
@Override public boolean incrementToken() { if (index >= str.length()) return false; clearAttributes(); if (group >= 0) { if (index == endIndex) continue; termAtt.setEmpty().append(str, index, endIndex); offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); return true; offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start())); index = matcher.end(); return true; offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
/**
 * Resets the tokenizer for a new input: re-reads the reader into the internal
 * string buffer, rewinds the matcher over it, and restarts scanning at offset 0.
 *
 * @throws IOException if reading the input fails
 */
@Override
public void reset() throws IOException {
    super.reset();
    // Slurp the whole Reader into the internal string buffer (str).
    fillBuffer(input);
    // Point the precompiled matcher at the freshly buffered text.
    matcher.reset(str);
    index = 0;
}
/**
 * Finishes the token stream: after the parent bookkeeping, pins the offset
 * attribute to the (corrected) end of the buffered input.
 *
 * @throws IOException if the superclass end handling fails
 */
@Override
public void end() throws IOException {
    super.end();
    // Both start and end offsets point at the end of the input text.
    final int finalOffset = correctOffset(str.length());
    offsetAtt.setOffset(finalOffset, finalOffset);
}
/**
 * Ends the stream and records the final offset, which is the corrected
 * length of the buffered input string.
 *
 * @throws IOException if the superclass end handling fails
 */
@Override
public void end() throws IOException {
    super.end();
    final int endOffset = correctOffset(str.length());
    // Final offset: start == end == corrected input length.
    offsetAtt.setOffset(endOffset, endOffset);
}
/**
 * Builds a {@link PatternTokenizer} over the factory's configured
 * pattern and capture group.
 *
 * @return a new tokenizer instance
 */
@Override
public Tokenizer create() {
    final Tokenizer tokenizer = new PatternTokenizer(pattern, group);
    return tokenizer;
}
}
@Override public boolean incrementToken() { if (index >= str.length()) return false; clearAttributes(); if (group >= 0) { if (index == endIndex) continue; termAtt.setEmpty().append(str, index, endIndex); offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); return true; offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start())); index = matcher.end(); return true; offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
/**
 * Prepares the tokenizer for a fresh pass over a new input reader.
 *
 * @throws IOException if buffering the input fails
 */
@Override
public void reset() throws IOException {
    super.reset();
    // Read the entire input Reader into the string buffer.
    fillBuffer(input);
    // Re-anchor the matcher on the new buffer contents.
    matcher.reset(str);
    index = 0;
}
/**
 * Splits the input using the configured pattern: returns a new
 * {@link PatternTokenizer} built from the given attribute factory and
 * this factory's pattern/group settings.
 *
 * @param factory attribute factory for the new tokenizer
 * @return the configured tokenizer
 */
@Override
public PatternTokenizer create(final AttributeFactory factory) {
    return new PatternTokenizer(factory, pattern, group);
}
}
/**
 * Instantiates the pattern tokenizer for this factory's
 * configured pattern and group.
 *
 * @return a freshly constructed {@link PatternTokenizer}
 */
@Override
public Tokenizer create() {
    final Tokenizer result = new PatternTokenizer(pattern, group);
    return result;
}
}
/**
 * Factory hook: builds the tokenizer from the stored pattern
 * and capture-group index.
 *
 * @return a new {@link PatternTokenizer}
 */
@Override
public Tokenizer create() {
    // pattern/group are configured on this factory instance.
    return new PatternTokenizer(pattern, group);
}
}
/**
 * Builds the default "pattern" tokenizer: splits on runs of non-word
 * characters ({@code \W+}) with no capture group (group -1 = split mode).
 *
 * @param version the (unused here) version the tokenizer is created for
 * @return a new {@link PatternTokenizer}
 */
@Override
protected Tokenizer create(Version version) {
    return new PatternTokenizer(Regex.compile("\\W+", null), -1);
}
},
@Override protected TokenStreamComponents createComponents(final String field) { //Use default grouping final Tokenizer tokenizer = new PatternTokenizer(pattern,-1); final TokenStream filter = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, filter); } }
@Override protected TokenStreamComponents createComponents(final String field) { //Use default grouping final Tokenizer tokenizer = new PatternTokenizer(pattern,-1); final TokenStream filter = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, filter); } }
/**
 * Builds the analysis chain: pattern tokenizer (split mode, group -1),
 * optionally lower-cased, optionally stop-word filtered.
 *
 * @param s the field name (unused)
 * @return the assembled tokenizer/filter components
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer source = new PatternTokenizer(pattern, -1);
    TokenStream chain = source;
    if (lowercase) {
        chain = new LowerCaseFilter(chain);
    }
    if (stopWords != null) {
        chain = new StopFilter(chain, stopWords);
    }
    return new TokenStreamComponents(source, chain);
}
/**
 * Produces tokens for the query part of a URL.
 *
 * <p>When query tokenization is disabled, the whole query becomes a single
 * token spanning its raw extent; otherwise the query is split on the
 * separator pattern, with offsets re-based at the query's start index.
 *
 * @param url           the full URL (used to locate the part's offsets)
 * @param partStringRaw the raw (un-decoded) query text as it appears in the URL
 * @param partString    the query text to tokenize
 * @return the list of tokens for the query part
 * @throws IOException if tokenization fails
 */
private List<Token> getQueryTokens(String url, String partStringRaw, String partString) throws IOException {
    final int start = getStartIndex(url, partStringRaw);
    if (tokenizeQuery) {
        // Split on the separator; group -1 = String.split-style behavior.
        return tokenize(URLPart.QUERY, addReader(new PatternTokenizer(QUERY_SEPARATOR, -1), new StringReader(partString)), start);
    }
    // Single-token mode: the entire query is one token.
    final int end = getEndIndex(start, partStringRaw);
    return Collections.singletonList(new Token(partString, URLPart.QUERY, start, end));
}
/**
 * Creates the analyzer components: a split-mode pattern tokenizer with
 * optional lower-casing and stop-word removal layered on.
 *
 * @param s the field name (unused)
 * @return components pairing the tokenizer with its filter chain
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer src = new PatternTokenizer(pattern, -1);
    TokenStream result = src;
    if (lowercase) {
        result = new LowerCaseFilter(result);
    }
    if (stopWords != null) {
        result = new StopFilter(result, stopWords);
    }
    return new TokenStreamComponents(src, result);
}
/**
 * Wires up the token stream: pattern-based splitting (group -1), then
 * lower-casing if enabled, then stop-word filtering if configured.
 *
 * @param s the field name (ignored)
 * @return the tokenizer and the tail of its filter chain
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer base = new PatternTokenizer(pattern, -1);
    TokenStream tail = base;
    if (lowercase) {
        tail = new LowerCaseFilter(tail);
    }
    if (stopWords != null) {
        tail = new StopFilter(tail, stopWords);
    }
    return new TokenStreamComponents(base, tail);
}
/**
 * Builds this analyzer's stream: pattern tokenizer in split mode, with
 * optional lower-case and stop-word filters applied in that order.
 *
 * @param s the field name (unused)
 * @return the assembled components
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer root = new PatternTokenizer(pattern, -1);
    TokenStream stream = root;
    if (lowercase) {
        stream = new LowerCaseFilter(stream);
    }
    if (stopWords != null) {
        stream = new StopFilter(stream, stopWords);
    }
    return new TokenStreamComponents(root, stream);
}
}
// Registration of pre-configured tokenizers (anonymous class at the end
// continues past this view).
// "edge_ngram": edge n-grams with the library default min/max gram sizes.
tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram", () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
// "pattern": split on runs of non-word characters (\W+); group -1 = split mode.
tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
// "thai": Thai-language tokenizer, default construction.
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
// "lowercase": lower-casing tokenizer; the third argument supplies a
// TokenFilterFactory whose body is truncated beyond this view.
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {