/**
 * Creates a new PinyinTokenizer with the term attribute buffer pre-sized.
 *
 * @param bufferSize initial capacity (in chars) of the term buffer; must be positive
 * @throws IllegalArgumentException if {@code bufferSize} is not positive
 */
public PinyinTokenizer(int bufferSize) {
  super();
  // Validate up front, consistent with the other tokenizer constructors in this codebase
  // (KeywordTokenizer rejects non-positive sizes as well).
  if (bufferSize <= 0) {
    throw new IllegalArgumentException("bufferSize must be greater than 0 passed: " + bufferSize);
  }
  termAtt.resizeBuffer(bufferSize);
}
// Accumulate the chars just read; when the term buffer is exactly full, grow it by at
// least one slot (resizeBuffer returns the possibly-reallocated backing array) so the
// next read has room. NOTE(review): mid-method fragment — enclosing loop not visible here.
upto += length; if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
/**
 * Creates a KeywordTokenizer with a custom {@code AttributeFactory} and an explicit
 * initial term-buffer size.
 *
 * @param factory the attribute factory to use for this Tokenizer
 * @param bufferSize initial term-buffer size; must be in (0, MAX_TOKEN_LENGTH_LIMIT]
 * @throws IllegalArgumentException if {@code bufferSize} is out of range
 */
public KeywordTokenizer(AttributeFactory factory, int bufferSize) {
  super(factory);
  if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
    // Message fixed to match the check: a value equal to the limit is accepted,
    // so the bound is "less than or equal to", not "less than".
    throw new IllegalArgumentException(
        "maxTokenLen must be greater than 0 and less than or equal to "
            + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
  }
  termAtt.resizeBuffer(bufferSize);
}
/**
 * Creates a KeywordTokenizer with an explicit initial term-buffer size.
 *
 * @param bufferSize initial term-buffer size; must be in (0, MAX_TOKEN_LENGTH_LIMIT]
 * @throws IllegalArgumentException if {@code bufferSize} is out of range
 */
public KeywordTokenizer(int bufferSize) {
  if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
    // Message fixed to match the check: a value equal to the limit is accepted,
    // so the bound is "less than or equal to", not "less than".
    throw new IllegalArgumentException(
        "maxTokenLen must be greater than 0 and less than or equal to "
            + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
  }
  termAtt.resizeBuffer(bufferSize);
}
/**
 * Creates a PathHierarchyTokenizer.
 *
 * @param factory the attribute factory to use for this Tokenizer
 * @param bufferSize initial size hint for the term buffer and result builder; not negative
 * @param delimiter path separator character to split on
 * @param replacement character emitted in place of the delimiter
 * @param skip number of leading path parts to skip
 * @throws IllegalArgumentException if {@code bufferSize} or {@code skip} is negative
 */
public PathHierarchyTokenizer(AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) {
  super(factory);
  if (bufferSize < 0) {
    throw new IllegalArgumentException("bufferSize cannot be negative");
  }
  if (skip < 0) {
    throw new IllegalArgumentException("skip cannot be negative");
  }
  this.delimiter = delimiter;
  this.replacement = replacement;
  this.skip = skip;
  // pre-size both the term attribute and the accumulator to the same hint
  termAtt.resizeBuffer(bufferSize);
  resultToken = new StringBuilder(bufferSize);
}
/**
 * Creates a ReversePathHierarchyTokenizer.
 *
 * @param factory the attribute factory to use for this Tokenizer
 * @param bufferSize initial size hint for the internal buffers; not negative
 * @param delimiter path separator character to split on
 * @param replacement character emitted in place of the delimiter
 * @param skip number of trailing path parts to skip
 * @throws IllegalArgumentException if {@code bufferSize} or {@code skip} is negative
 */
public ReversePathHierarchyTokenizer(AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) {
  super(factory);
  if (bufferSize < 0) {
    throw new IllegalArgumentException("bufferSize cannot be negative");
  }
  if (skip < 0) {
    throw new IllegalArgumentException("skip cannot be negative");
  }
  this.delimiter = delimiter;
  this.replacement = replacement;
  this.skip = skip;
  // pre-size all working buffers from the same hint; the positions list is
  // sized with a rough one-delimiter-per-ten-chars estimate
  termAtt.resizeBuffer(bufferSize);
  resultToken = new StringBuilder(bufferSize);
  resultTokenBuffer = new char[bufferSize];
  delimiterPositions = new ArrayList<>(bufferSize / 10);
}
/**
 * Appends one char to the in-progress token held in the term attribute,
 * growing the term buffer by a single slot when it is exactly full.
 */
private void appendToToken(char ch) {
  char[] buf = termAtt.buffer();
  if (buf.length == tokenUpto) {
    // resizeBuffer returns the (possibly new) backing array
    buf = termAtt.resizeBuffer(tokenUpto + 1);
  }
  buf[tokenUpto] = ch;
  tokenUpto++;
}
/**
 * Appends one char to the in-progress token held in the term attribute,
 * growing the term buffer by a single slot when it is exactly full, and
 * advances the separator counter alongside the token position.
 */
private void appendToToken(char ch) {
  char[] buf = termAtt.buffer();
  if (buf.length == tokenUpto) {
    // resizeBuffer returns the (possibly new) backing array
    buf = termAtt.resizeBuffer(tokenUpto + 1);
  }
  buf[tokenUpto] = ch;
  tokenUpto++;
  sepUpto++;
}
private void init(int minGram, int maxGram, boolean edgesOnly) { if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new IllegalArgumentException("minGram must not be greater than maxGram"); } this.minGram = minGram; this.maxGram = maxGram; this.edgesOnly = edgesOnly; charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader buffer = new int[charBuffer.getBuffer().length]; // Make the term att large enough termAtt.resizeBuffer(2 * maxGram); }
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  char[] buf = termAtt.buffer();
  int len = termAtt.length();
  int start = 0;
  // an 'n' or 't' prefix directly before an uppercase vowel gets a hyphen
  // inserted after it (e.g. nA -> n-A) before lowercasing the rest
  if (len > 1 && (buf[0] == 'n' || buf[0] == 't') && isUpperVowel(buf[1])) {
    buf = termAtt.resizeBuffer(len + 1);
    // shift everything after the prefix one slot right to make room for '-'
    // (arraycopy is overlap-safe, equivalent to a descending copy loop)
    System.arraycopy(buf, 1, buf, 2, len - 1);
    buf[1] = '-';
    len = len + 1;
    termAtt.setLength(len);
    start = 2; // leave the prefix and the inserted hyphen untouched
  }
  for (int i = start; i < len; ) {
    i += Character.toChars(Character.toLowerCase(buf[i]), buf, i);
  }
  return true;
}
/** * Flushes a unigram token to output from our buffer. * This happens when we encounter isolated CJK characters, either the whole * CJK string is a single character, or we encounter a CJK character surrounded * by space, punctuation, english, etc, but not beside any other CJK. */ private void flushUnigram() { clearAttributes(); char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates) int len = Character.toChars(buffer[index], termBuffer, 0); termAtt.setLength(len); offsetAtt.setOffset(startOffset[index], endOffset[index]); typeAtt.setType(SINGLE_TYPE); index++; }
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { if (!keywordAttr.isKeyword()) { // this stemmer increases word length by 1: worst case '*ã' -> '*ão' final int len = termAtt.length(); final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len); termAtt.setLength(newlen); } return true; } else { return false; } } }
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { if (!keywordAttr.isKeyword()) { // this stemmer increases word length by 1: worst case '*çom' -> '*ción' final int len = termAtt.length(); final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len); termAtt.setLength(newlen); } return true; } else { return false; } } }
/** Restores the saved token state and writes the joined, unhyphenated term. */
private void unhyphenate() {
  restoreState(savedState);
  savedState = null;
  final int length = hyphenated.length();
  char[] term = termAttribute.buffer();
  if (termAttribute.length() < length) {
    term = termAttribute.resizeBuffer(length);
  }
  hyphenated.getChars(0, length, term, 0);
  termAttribute.setLength(length);
  // keep the original start offset; the end comes from the last joined part
  offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
  hyphenated.setLength(0);
}
}
/** Writes the accumulated concatenation into the token attributes. */
void write() {
  clearAttributes();
  final int len = buffer.length();
  // grow the term buffer first so getChars cannot overflow it
  if (termAttribute.length() < len) {
    termAttribute.resizeBuffer(len);
  }
  buffer.getChars(0, len, termAttribute.buffer(), 0);
  termAttribute.setLength(len);
  if (hasIllegalOffsets) {
    offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
  } else {
    offsetAttribute.setOffset(startOffset, endOffset);
  }
  posIncAttribute.setPositionIncrement(position(true));
  typeAttribute.setType(savedType);
  accumPosInc = 0;
}
/** * Flushes a bigram token to output from our buffer * This is the normal case, e.g. ABC -> AB BC */ private void flushBigram() { clearAttributes(); char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries) int len1 = Character.toChars(buffer[index], termBuffer, 0); int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1); termAtt.setLength(len2); offsetAtt.setOffset(startOffset[index], endOffset[index+1]); typeAtt.setType(DOUBLE_TYPE); // when outputting unigrams, all bigrams are synonyms that span two unigrams if (outputUnigrams) { posIncAtt.setPositionIncrement(0); posLengthAtt.setPositionLength(2); } index++; }
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  int len = termAtt.length();
  if (marker != NOMARKER) {
    // append the marker char before reversing so it ends up at the front;
    // re-fetch buffer() after resizeBuffer in case the array was reallocated
    len++;
    termAtt.resizeBuffer(len);
    termAtt.buffer()[len - 1] = marker;
  }
  reverse(termAtt.buffer(), 0, len);
  termAtt.setLength(len);
  return true;
}
/** * Constructs a compound token. */ private void gramToken() { buffer.append(termAttribute.buffer(), 0, termAttribute.length()); int endOffset = offsetAttribute.endOffset(); clearAttributes(); int length = buffer.length(); char termText[] = termAttribute.buffer(); if (length > termText.length) { termText = termAttribute.resizeBuffer(length); } buffer.getChars(0, length, termText, 0); termAttribute.setLength(length); posIncAttribute.setPositionIncrement(0); posLenAttribute.setPositionLength(2); // bigram offsetAttribute.setOffset(lastStartOffset, endOffset); typeAttribute.setType(GRAM_TYPE); buffer.setLength(0); } }
@Override
public final boolean incrementToken() throws IOException {
  if (done) {
    return false;
  }
  // emit the entire input as a single token, exactly once
  clearAttributes();
  done = true;
  char[] buffer = termAtt.buffer();
  int upto = 0;
  while (true) {
    final int read = input.read(buffer, upto, buffer.length - upto);
    if (read == -1) {
      break;
    }
    upto += read;
    if (upto == buffer.length) {
      // exactly full: grow so the next read has at least one free slot
      buffer = termAtt.resizeBuffer(1 + buffer.length);
    }
  }
  termAtt.setLength(upto);
  finalOffset = correctOffset(upto);
  offsetAtt.setOffset(correctOffset(0), finalOffset);
  return true;
}
// Fragment: on this branch the token end marker is reset; the else-if grows the term
// buffer when the current length nears its capacity so a surrogate pair (2 chars) fits.
// NOTE(review): the original multi-line code appears collapsed onto one line here,
// leaving the resizeBuffer call inside a trailing '//' comment — confirm against the
// un-flattened source file before relying on this line as-is.
end = start; } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer