private void fillCache() throws IOException {
  while (input.incrementToken()) {
    cache.add(captureState());
  }
  // capture final state
  input.end();
  finalState = captureState();
}
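This is essentially the caching pattern used by Lucene's CachingTokenFilter: every token's attribute state is captured, then end() is called once so the end-of-stream state can be replayed later. For context, a minimal sketch of the replay side, assuming the same cache and finalState fields; the iterator field is hypothetical:

private Iterator<AttributeSource.State> iterator;

@Override
public boolean incrementToken() {
  if (iterator == null) {
    iterator = cache.iterator();    // start replaying the captured states
  }
  if (!iterator.hasNext()) {
    return false;                   // cached tokens exhausted
  }
  restoreState(iterator.next());    // replay one captured token state
  return true;
}

@Override
public void end() {
  if (finalState != null) {
    restoreState(finalState);       // restore the state captured after input.end()
  }
}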
TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
  System.out.println(cattr.toString());
}
stream.end();
stream.close();
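Since TokenStream implements Closeable, the same consume cycle (reset, incrementToken loop, end, close) is usually wrapped in try-with-resources so close() runs even when tokenization throws. A self-contained sketch; the StandardAnalyzer and field name are illustrative choices:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrintTokens {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // The String overload of tokenStream() wraps the text in a reader internally.
    try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
      CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
      stream.reset();                      // required before the first incrementToken()
      while (stream.incrementToken()) {
        System.out.println(cattr.toString());
      }
      stream.end();                        // records end-of-stream offsets
    }                                      // close() handled by try-with-resources
  }
}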
public static List<String> keywords(String source) {
  List<String> keywords = new ArrayList<String>();
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream("keywords", new StringReader(source));
    ts.reset();
    while (ts.incrementToken()) {
      keywords.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.end();
  } catch (IOException e) {
    logger.error("Error getting keywords ", e);
  } finally {
    // Guard against an NPE when tokenStream() itself threw and ts was never assigned.
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException ignored) {}
    }
  }
  return keywords;
}
private SToken[] getTokens(String text) throws IOException {
  // FIXME: somehow integrate the cycle below into getSummary to save the cloning and memory;
  // also, creating Tokens is suboptimal with 3.0.0; this whole class could be replaced by the highlighter
  ArrayList<SToken> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("full", text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
      result.add(t);
    }
    ts.end();
  }
  return result.toArray(new SToken[result.size()]);
}
final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
ts.reset();
if (ts.incrementToken() == false) {
  throw new IllegalStateException("The normalization token stream is "
      + "expected to produce exactly 1 token, but got 0 for analyzer "
      + this + " and input \"" + text + "\"");
}
final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (ts.incrementToken()) {
  throw new IllegalStateException("The normalization token stream is "
      + "expected to produce exactly 1 token, but got 2+ for analyzer "
      + this + " and input \"" + text + "\"");
}
ts.end();
return term;
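This fragment appears to come from Lucene's Analyzer#normalize, which runs only the normalization chain (e.g. lowercasing) over a single term and insists on exactly one output token. A hedged usage sketch, assuming Lucene 7+ where normalize(String, String) is public:

// Normalize a raw query term the same way the index side would (illustrative values).
Analyzer analyzer = new StandardAnalyzer();
BytesRef normalized = analyzer.normalize("body", "QUICK");
String termText = normalized.utf8ToString();  // "quick" after StandardAnalyzer's lowercasing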
int prevIncr = 1;
int state = -1;
while (in.incrementToken()) {
  int currentIncr = posIncAtt.getPositionIncrement();
  if (pos == -1 && currentIncr < 1) {
    // ... (lines elided in this search hit)
  }
}
// ...
in.end();
// ...
if (state != -1) {
  builder.setAccept(state, true);
}
termsHashPerField.start(field, first);
// ... (lines elided in this search hit)
while (stream.incrementToken()) {
  // ...
}
// ...
stream.end();
Position posData = null;
int maxOffset = 0;
while (in.incrementToken()) {
  int posInc = posIncAtt.getPositionIncrement();
  if (preservePositionIncrements == false && posInc > 1) {
    // ... (lines elided in this search hit)
  }
}
// ...
in.end();
/**
 * @param text the text to analyze
 * @return the distinct tokens produced by the analyzer, in order of first occurrence
 */
public Set<String> getToken(String text) {
  Set<String> list = new LinkedHashSet<>();
  if (CommonUtils.notEmpty(text)) {
    try (StringReader stringReader = new StringReader(text);
        TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        list.add(charTermAttribute.toString());
      }
      tokenStream.end();
      return list;
    } catch (IOException e) {
      return list;
    }
  }
  return list;
}
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  skipTerms.add(new Term(fieldName, termAtt.toString()));
}
ts.end();
while (ts.incrementToken()) {
  String word = termAtt.toString();
  tokenCount++;
  // ... (lines elided in this search hit)
}
ts.end();
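The hit above elides the rest of the loop body; a self-contained sketch of the counting pattern it appears to implement (the helper name and the eventual use of word are assumptions; Lucene imports as in the earlier example):

// Hypothetical helper: count the tokens an analyzer produces for some text.
static int countTokens(Analyzer analyzer, String field, String text) throws IOException {
  int tokenCount = 0;
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();  // the original snippet presumably consumes this
      tokenCount++;
    }
    ts.end();
  }
  return tokenCount;
}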
@Override
public boolean incrementToken() throws IOException {
  // Advance past any exhausted sources, ending each one and accumulating
  // its final offset so the next source's offsets can be rebased.
  while (sources[currentSource].incrementToken() == false) {
    if (currentSource >= sources.length - 1) {
      return false;
    }
    sources[currentSource].end();
    OffsetAttribute att = sourceOffsets[currentSource];
    if (att != null) {
      offsetIncrement += att.endOffset();
    }
    currentSource++;
  }
  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement,
      offsetAtt.endOffset() + offsetIncrement);
  return true;
}
map.put(field, tokenMap);
while (ts.incrementToken()) {
  String token = termAtt.toString();
  MutableInt cnt = tokenMap.get(token);
  // ... (lines elided in this search hit)
}
ts.end();
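The elided loop body evidently inserts a new counter or increments an existing one; a self-contained sketch of that term-frequency pattern, assuming MutableInt is org.apache.commons.lang3.mutable.MutableInt (the helper name is illustrative):

// Hypothetical helper: per-term frequencies for one field's text.
static Map<String, MutableInt> termFrequencies(Analyzer analyzer, String field, String text)
    throws IOException {
  Map<String, MutableInt> tokenMap = new HashMap<>();
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      MutableInt cnt = tokenMap.get(token);
      if (cnt == null) {
        tokenMap.put(token, new MutableInt(1));  // first occurrence of this term
      } else {
        cnt.increment();                         // repeat occurrence
      }
    }
    ts.end();
  }
  return tokenMap;
}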
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
    String fieldName, String contents) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
    if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
      // Can't split on term boundaries without offsets
      return -1;
    }
    int end = -1;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
      if (attr.endOffset() >= noMatchSize) {
        // Jump to the end of this token if it wouldn't put us past the boundary
        if (attr.endOffset() == noMatchSize) {
          end = noMatchSize;
        }
        return end;
      }
      end = attr.endOffset();
    }
    tokenStream.end();
    // We've exhausted the token stream so we should just highlight everything.
    return end;
  }
}
@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();
    }
    // ... (remainder of method elided in this search hit)
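The hit is cut off mid-method; a sketch of the complete Guava AbstractIterator pattern it follows, where endOfData() signals exhaustion (the AToken(String) constructor and termAtt field are assumptions):

@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();                    // tells AbstractIterator we are done
    }
    return new AToken(termAtt.toString());   // hypothetical: wrap the current term
  } catch (IOException e) {
    throw new UncheckedIOException(e);       // computeNext() cannot throw IOException
  }
}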
@Test
public void testCJKFilter() throws Exception {
  String s = "then quickbrownfoxjumpedoverthelazy dogss dog 2000 普林斯顿大学";
  Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = analyzer.tokenStream(FIELD, s);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  Map<String, Integer> tokens = new HashMap<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    Integer count = tokens.get(t);
    count = (count == null) ? 0 : count;
    count++;
    tokens.put(t, count);
  }
  ts.end();
  ts.close();
  assertEquals(7, tokens.size());
  assertEquals(new Integer(1), tokens.get("林斯"));
}
@Test
public void testCommon() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
  Analyzer common = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = common.tokenStream("f", "the 5,000.12 5000 and dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray(), t.length()) && t.contains("5")) {
      fail("Shouldn't have found a numeric");
    }
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  assertTrue(seen.contains("dirty"));
  assertFalse(seen.contains("the"));
}
@Test
public void testGeneral() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
  Analyzer general = analyzerManager.getGeneralAnalyzer();
  TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  assertTrue(seen.contains("the"));
  assertTrue(seen.contains("and"));
  assertTrue(seen.contains("dog"));
}
if (input.incrementToken()) {
  // ... (lines elided in this search hit)
}
// ...
input.end();
finalPosInc = posIncAtt.getPositionIncrement();
finalOffset = offsetAtt.endOffset();
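This last hit also splices non-adjacent lines: end() may only be called once incrementToken() has returned false, after which trailing position increments (e.g. from removed stopwords) and the final offset become visible. A sketch of that contract, keeping the fragment's field names as assumptions:

// Drain the wrapped stream, then capture the end-of-stream attributes.
while (input.incrementToken()) {
  // ... consume or buffer each token ...
}
input.end();                                        // legal only after exhaustion
int finalPosInc = posIncAtt.getPositionIncrement(); // increments left over after the last token
int finalOffset = offsetAtt.endOffset();            // one past the last consumed character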