public Token next() throws IOException { if (tokens == null || index >= tokens.size()) { if (length == 0) refill(); while (!incrementTokenBuffer()) { refill(); if (length <= 0) // no more bytes to read; return null; } } return tokens.get(index++); }
/**
 * Locates the last unambiguous break position in the buffered text.
 *
 * <p>The buffer is scanned backwards from its final valid character; the first
 * character accepted by {@code isSafeEnd} determines the result.
 *
 * @return the index just past the last safe-end character, or -1 if the
 *         buffered text contains no such character
 */
private int findSafeEnd() {
  // 'end' is the exclusive end position; the candidate character sits at end - 1.
  int end = length;
  while (end > 0) {
    if (isSafeEnd(buffer[end - 1])) {
      return end;
    }
    end--;
  }
  return -1;
}
/**
 * Switches this instance to a new input reader and resets its state.
 *
 * @param input the reader to consume from now on
 * @throws IOException if the no-argument {@code reset()} fails
 */
public void reset(Reader input) throws IOException {
  // Install the new reader before resetting, so reset() sees the new input.
  this.input = input;
  reset();
}
/**
 * Creates a tokenizer reading from {@code in}, using a string tagger obtained
 * from {@code SenFactory} for the given dictionary directory.
 *
 * @param in            the reader supplying the text to tokenize
 * @param filter        optional stream filter added to the string tagger;
 *                      skipped when {@code null}
 * @param dictionaryDir dictionary location passed to
 *                      {@code SenFactory.getStringTagger}
 */
public JapaneseTokenizer(Reader in, StreamFilter filter, String dictionaryDir) {
  super(in);

  final StringTagger baseTagger = SenFactory.getStringTagger(dictionaryDir);
  if (filter != null) {
    baseTagger.addFilter(filter);
  }

  // Wrap the string tagger so that it is driven from the reader.
  tagger = new StreamTagger2(baseTagger, in);
}
@Override public void end() throws IOException { // set final offset final int finalOffset = correctOffset(tagger.end()); offsetAtt.setOffset(finalOffset, finalOffset); } }
/**
 * Refill the buffer, accumulating the offset and setting usableLength to the
 * last unambiguous break position.
 *
 * <p>Characters beyond the previous {@code usableLength} are moved to the front
 * of the buffer and the remaining space is filled from the reader. A single
 * {@code Reader.read} call may return fewer characters than requested without
 * the stream being exhausted, so reading continues until the buffer is full or
 * the reader reports end of stream. Only a real end of stream makes the whole
 * buffer usable; otherwise the text is cut at the last safe break so that a
 * token straddling the buffer boundary is not split.
 *
 * @throws IOException if reading from the underlying reader fails
 */
private void refill() throws IOException {
  offset += usableLength;
  final int leftover = length - usableLength;
  System.arraycopy(buffer, usableLength, buffer, 0, leftover);

  final int requested = buffer.length - leftover;
  int returned = 0;
  boolean exhausted = false;
  // Read until the buffer is full or the reader signals end of stream; a short
  // read on its own says nothing about whether more data will follow.
  while (returned < requested) {
    final int count = input.read(buffer, leftover + returned, requested - returned);
    if (count < 0) {
      exhausted = true;
      break;
    }
    returned += count;
  }
  length = leftover + returned;

  if (exhausted) {
    /* reader has been emptied, process the rest */
    usableLength = length;
  } else {
    /* still more data to be read, find a safe-stopping place */
    usableLength = findSafeEnd();
    if (usableLength < 0) {
      /*
       * more than a full buffer of text without breaks,
       * gonna possibly truncate tokens
       */
      usableLength = length;
    }
  }

  iterator.setText(buffer, 0, Math.max(0, usableLength));
  breaker.setText(iterator);
}
@Override public boolean incrementToken() throws IOException { Token token = tagger.next(); if (token == null) { return false; } else { clearAttributes(); final Morpheme m = token.getMorpheme(); // note, unlike the previous implementation, we set the surface form termAtt.setEmpty().append(token.getSurface()); final int cost = token.getCost(); if (token.isSentenceStart()) { accumulatedCost = 0; sentenceAtt.setSentenceStart(true); } costAtt.setCost(cost - accumulatedCost); accumulatedCost = cost; basicFormAtt.setMorpheme(m); conjugationAtt.setMorpheme(m); partOfSpeechAtt.setMorpheme(m); pronunciationsAtt.setMorpheme(m); readingsAtt.setMorpheme(m); offsetAtt.setOffset(correctOffset(token.getStart()), correctOffset(token.end())); return true; } }
/**
 * Re-targets this tokenizer and its tagger at a new reader and clears the
 * accumulated cost.
 *
 * @param in the reader to tokenize from now on
 * @throws IOException if resetting the superclass or the tagger fails
 */
@Override
public void reset(Reader in) throws IOException {
  super.reset(in);
  tagger.reset(in);
  // Cost deltas are computed against this value, so start again from zero.
  accumulatedCost = 0;
}