// Factory hook: builds a front-edge n-gram tokenizer using the supplied
// attribute factory and this factory's configured gram-size range.
@Override
public Tokenizer create(AttributeFactory factory) {
    return new EdgeNGramTokenizer(factory, minGramSize, maxGramSize);
}
}
// Fragment of incrementToken(): emit the n-gram covering [start, start + gramSize)
// of the buffered input string.
clearAttributes();                // reset per-token attribute state before populating
int end = start + gramSize;
termAtt.setEmpty().append(inStr, start, end);
// correctOffset maps buffer positions back to offsets in the original input
// (accounts for any CharFilter in front of this tokenizer).
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
gramSize++;                       // next call emits the next-longer gram from the same start
return true;                      // a token was produced
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * @param input {@link Reader} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
    super(input);
    // Shared initialization used by all constructors; presumably validates the
    // side/min/max arguments — see init().
    init(side, minGram, maxGram);
}
@Override public void end() { // set final offset final int finalOffset = correctOffset(charsRead); this.offsetAtt.setOffset(finalOffset, finalOffset); }
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * @param source {@link AttributeSource} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
    super(source, input);
    // Shared initialization used by all constructors — see init().
    init(side, minGram, maxGram);
}
// Anonymous/enum factory variant: builds a tokenizer with the class defaults.
@Override
protected Tokenizer create(Version version) {
    // The version argument is not consulted here; defaults apply regardless.
    return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE,
        EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
}
},
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
    super(factory, input);
    // Shared initialization used by all constructors — see init().
    init(side, minGram, maxGram);
}
// Builds a tokenizer over the given reader using this factory's configured
// side and gram-size range.
public EdgeNGramTokenizer create(Reader input) {
    return new EdgeNGramTokenizer(input, side, minGramSize, maxGramSize);
}
}
/**
 * Builds an edge n-gram tokenizer; when a custom matcher is configured,
 * token-character classification is delegated to it via an anonymous subclass.
 */
@Override
public Tokenizer create() {
    // Guard clause: no matcher configured — use the stock tokenizer.
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    }
    return new EdgeNGramTokenizer(minGram, maxGram) {
        @Override
        protected boolean isTokenChar(int chr) {
            return matcher.isTokenChar(chr);
        }
    };
}
}
/**
 * Builds an edge n-gram tokenizer; when a custom matcher is configured,
 * token-character classification is delegated to it via an anonymous subclass.
 */
@Override
public Tokenizer create() {
    // Guard clause: no matcher configured — use the stock tokenizer.
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    }
    return new EdgeNGramTokenizer(minGram, maxGram) {
        @Override
        protected boolean isTokenChar(int chr) {
            return matcher.isTokenChar(chr);
        }
    };
}
}
/**
 * Builds an edge n-gram tokenizer; when a custom matcher is configured,
 * token-character classification is delegated to it via an anonymous subclass.
 */
@Override
public Tokenizer create() {
    // Guard clause: no matcher configured — use the stock tokenizer.
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    }
    return new EdgeNGramTokenizer(minGram, maxGram) {
        @Override
        protected boolean isTokenChar(int chr) {
            return matcher.isTokenChar(chr);
        }
    };
}
}
// Version-gated factory: from Lucene 4.4 onward the current tokenizer is used,
// older match versions fall back to the 4.3-compatible implementation.
// NOTE(review): the modern branch builds an EdgeNGramTokenizer but the fallback
// is Lucene43NGramTokenizer (not an Edge variant) — confirm this is intentional
// and not a copy/paste from NGramTokenizerFactory.
@Override
public Tokenizer create(AttributeFactory factory) {
    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
        return new EdgeNGramTokenizer(factory, minGramSize, maxGramSize);
    }
    return new Lucene43NGramTokenizer(factory, minGramSize, maxGramSize);
}
}
// Register the pre-configured (name-addressable) tokenizers.
tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
    () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
// "pattern" splits on non-word characters; -1 means emit the split tokens, not the matches.
tokenizers.add(PreConfiguredTokenizer.singleton("pattern",
    () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
// FIX(review): the original contained a dangling lambda fragment
// `() -> new EdgeNGramTokenizer(...), null));` with no enclosing add() call,
// which cannot compile. Reconstructed as the legacy camelCase "edgeNGram"
// alias registration it evidently belonged to — confirm alias name against
// upstream CommonAnalysisPlugin.
tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
    () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));