/**
 * Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link
 * AttributeFactory}.
 *
 * @param factory the {@link AttributeFactory} used to build the tokenizer's attributes
 * @return a new {@link NGramTokenizer} honoring the configured {@code minGramSize} and {@code
 *     maxGramSize}
 */
@Override
public Tokenizer create(AttributeFactory factory) {
  return new NGramTokenizer(factory, minGramSize, maxGramSize);
}
}
/**
 * Creates an {@link EdgeNGramTokenizer} over the configured gram-size range.
 *
 * @param factory the {@link AttributeFactory} used to build the tokenizer's attributes
 * @return a new edge n-gram tokenizer
 */
@Override
public Tokenizer create(AttributeFactory factory) {
  return new EdgeNGramTokenizer(factory, minGramSize, maxGramSize);
}
}
/**
 * Wraps the given stream in an {@link EdgeNGramTokenFilter} using the configured gram sizes and
 * preserve-original flag.
 *
 * @param input the upstream {@link TokenStream} to wrap
 * @return the wrapping edge n-gram filter
 */
@Override
public TokenFilter create(TokenStream input) {
  return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
// NOTE(review): this incrementToken() body is a garbled/incomplete fragment and cannot compile
// as written: every statement after the first `return false;` is unreachable, the `continue`
// has no enclosing loop, and `termContainsNonTokenChar` and `length` are referenced but never
// declared in what is visible here. The upstream implementation wraps the consume/emit logic in
// a `while (true)` loop with buffer-refill and end-of-gram checks that are missing from this
// fragment. Restore from the canonical source before shipping — do not attempt to patch locally.
@Override public final boolean incrementToken() throws IOException { clearAttributes(); return false; consume(); gramSize = minGram; updateLastNonTokenChar(); final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1; if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) { consume(); gramSize = minGram; continue; posIncAtt.setPositionIncrement(1); posLenAtt.setPositionLength(1); offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length)); ++gramSize; return true;
/**
 * Wraps the given stream in an {@link NGramTokenFilter} using the configured gram sizes and
 * preserve-original flag.
 *
 * @param input the upstream {@link TokenStream} to wrap
 * @return the wrapping n-gram filter
 */
@Override
public TokenFilter create(TokenStream input) {
  return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
/**
 * Package-private constructor taking an explicit {@link AttributeFactory}.
 *
 * @param factory attribute factory passed through to the superclass
 * @param minGram smallest n-gram size to generate
 * @param maxGram largest n-gram size to generate
 * @param edgesOnly whether only n-grams anchored at the token start are emitted
 */
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
  super(factory);
  // All range validation and state setup is centralized in init(...).
  init(minGram, maxGram, edgesOnly);
}
/** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map<String, String> args) { super(args); minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } }
/**
 * Creates a new EdgeNGramFilterFactory.
 *
 * <p>Recognized parameters: {@code minGramSize}, {@code maxGramSize}, and {@code
 * preserveOriginal}; unrecognized parameters are rejected.
 *
 * @param args factory arguments; consumed keys are removed
 * @throws IllegalArgumentException if unknown parameters remain after parsing
 */
public EdgeNGramFilterFactory(Map<String, String> args) {
  super(args);
  minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
  maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
  preserveOriginal =
      getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
@Override public final void end() throws IOException { super.end(); assert bufferStart <= bufferEnd; int endOffset = offset; for (int i = bufferStart; i < bufferEnd; ++i) { endOffset += Character.charCount(buffer[i]); } endOffset = correctOffset(endOffset); // set final offset offsetAtt.setOffset(endOffset, endOffset); }
/**
 * Advances the cached position of the most recent non-token character.
 *
 * <p>Scans only the newly visible tail of the current gram (positions after {@code
 * lastCheckedChar}), walking backwards so the rightmost non-token character wins, then records
 * how far the scan has progressed.
 */
private void updateLastNonTokenChar() {
  final int termEnd = bufferStart + gramSize - 1;
  if (termEnd <= lastCheckedChar) {
    // Nothing new to inspect; this span was already scanned.
    return;
  }
  for (int idx = termEnd; idx > lastCheckedChar; --idx) {
    if (!isTokenChar(buffer[idx])) {
      lastNonTokenChar = idx;
      break;
    }
  }
  lastCheckedChar = termEnd;
}
/**
 * Creates a new NGramTokenizerFactory.
 *
 * <p>Recognized parameters: {@code minGramSize} and {@code maxGramSize}; anything else is
 * rejected.
 *
 * @param args factory arguments; consumed keys are removed
 * @throws IllegalArgumentException if unknown parameters remain after parsing
 */
public NGramTokenizerFactory(Map<String, String> args) {
  super(args);
  minGramSize = getInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
  maxGramSize = getInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
/**
 * Creates a new EdgeNGramTokenizerFactory.
 *
 * <p>Recognized parameters: {@code minGramSize} and {@code maxGramSize}; anything else is
 * rejected.
 *
 * @param args factory arguments; consumed keys are removed
 * @throws IllegalArgumentException if unknown parameters remain after parsing
 */
public EdgeNGramTokenizerFactory(Map<String, String> args) {
  super(args);
  minGramSize = getInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
  maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
// NOTE(review): garbled fragment — this is not a complete definition. Everything after
// `return false;` is unreachable, `restoreState(state)` is called twice back-to-back, and the
// enclosing method signature and loop are missing from this view. It resembles the
// preserve-original tail of NGramTokenFilter.incrementToken (capture state, restore, emit the
// unchanged term at position increment 0). Restore from the canonical source before shipping.
return false; state = captureState(); restoreState(state); restoreState(state); posIncrAtt.setPositionIncrement(0); termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
// NOTE(review): garbled fragment — not a complete definition. Code after `return false;` is
// unreachable, `restoreState(state)` appears twice, and the computed code-point offsets
// `start`/`end` are never used before the full-term copy. It resembles the preserve-original
// tail of EdgeNGramTokenFilter.incrementToken, where `start`/`end` normally bound the gram
// copied into termAtt. Restore from the canonical source before shipping.
return false; state = captureState(); restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); restoreState(state); posIncrAtt.setPositionIncrement(0); termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * <p>Convenience overload that resolves the side by its label and delegates to the {@link Side}
 * based constructor.
 *
 * @param source {@link AttributeSource} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(
    AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
  this(source, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * @param input {@link Reader} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
  super(input);
  // Validation of side/gram sizes is centralized in init(...).
  init(side, minGram, maxGram);
}
/**
 * Wraps the analyzer chain's token stream in an {@link EdgeNGramTokenFilter} bounded by {@code
 * minChars}/{@code maxChars}, keeping the original tokenizer.
 *
 * @param fieldName the field being analyzed (unused here)
 * @param components the components produced by the wrapped analyzer
 * @return components whose stream emits edge n-grams
 */
@Override
protected TokenStreamComponents wrapComponents(
    String fieldName, TokenStreamComponents components) {
  return new TokenStreamComponents(
      components.getTokenizer(),
      new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars));
}
}
/**
 * Wraps the analyzer chain's token stream in an {@link NGramTokenFilter} bounded by {@code min}/
 * {@code max}, keeping the original tokenizer.
 *
 * @param fieldName the field being analyzed (unused here)
 * @param components the components produced by the wrapped analyzer
 * @return components whose stream emits n-grams
 */
@Override
protected TokenStreamComponents wrapComponents(
    String fieldName, TokenStreamComponents components) {
  TokenFilter grams = new NGramTokenFilter(components.getTokenStream(), this.min, this.max);
  return new TokenStreamComponents(components.getTokenizer(), grams);
}
}
/**
 * Package-private constructor using the default attribute factory.
 *
 * @param minGram smallest n-gram size to generate
 * @param maxGram largest n-gram size to generate
 * @param edgesOnly whether only n-grams anchored at the token start are emitted
 */
NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
  // All range validation and state setup is centralized in init(...).
  init(minGram, maxGram, edgesOnly);
}
/**
 * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
 *
 * <p>Convenience overload that resolves the side by its label and delegates to the {@link Side}
 * based constructor.
 *
 * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenizer(
    AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
  this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
}