// Tokenize the reader's contents, reading each token's term text and character offsets.
// The Lucene TokenStream consumer contract requires reset() before the first
// incrementToken(), then end() after the last token, then close(); the original
// snippet omitted all three, which fails on modern tokenizers and leaks the stream.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
    }
    tokenStream.end(); // records final offset state
} finally {
    tokenStream.close(); // release resources so the analyzer can be reused
}
public final class LuceneUtils { public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) { List<String> result = new ArrayList<String>(); TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords)); try { while(stream.incrementToken()) { result.add(stream.getAttribute(TermAttribute.class).term()); } } catch(IOException e) { // not thrown b/c we're using a string reader... } return result; } }
@Override protected Word nextWord() { try { if(this.stream.incrementToken()) { return new AnalysisWord(this.stream.getAttribute(CharTermAttribute.class)); } } catch (IOException e) { throw new TokenizerException(e); } return null; } }
@Override protected Word nextWord() { try { if(this.stream.incrementToken()) { return new AnalysisWord(this.stream.getAttribute(CharTermAttribute.class)); } } catch (IOException e) { throw new TokenizerException(e); } return null; } }
public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
/**
 * Creates simple boolean query from the cached tokenstream contents.
 * Every token becomes a term of a single synonym query over {@code field}.
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    List<Term> collected = new ArrayList<>();
    while (stream.incrementToken()) {
        // Term copies the (reused) BytesRef, so collecting across iterations is safe.
        Term term = new Term(field, termAtt.getBytesRef());
        collected.add(term);
    }
    Term[] asArray = collected.toArray(new Term[0]);
    return newSynonymQuery(asArray);
}
/**
 * Creates simple term query from the cached tokenstream contents.
 * The stream is expected to hold exactly one token; absence is a programming error.
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    boolean hasToken = stream.incrementToken();
    if (!hasToken) {
        // Caller guarantees a token is present; an empty stream indicates a bug upstream.
        throw new AssertionError();
    }
    Term term = new Term(field, termAtt.getBytesRef());
    return newTermQuery(term);
}
// Fragment: iterate the analyzed tokens (the loop body continues beyond this view).
// getAttribute returns the single CharTermAttribute instance that the stream
// overwrites on every incrementToken(); reset() must precede the first incrementToken().
CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) {
/** * Creates a span query from the tokenstream. In the case of a single token, a simple <code>SpanTermQuery</code> is * returned. When multiple tokens, an ordered <code>SpanNearQuery</code> with slop 0 is returned. */
// NOTE(review): unlike the sibling analyze* methods, this one never calls in.reset()
// before incrementToken() — presumably the caller resets/owns the stream; confirm,
// otherwise modern TokenStreams throw IllegalStateException here.
// A null TermToBytesRefAttribute means the stream produces no indexable terms, so null is returned.
protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); if (termAtt == null) { return null; } List<SpanTermQuery> terms = new ArrayList<>(); while (in.incrementToken()) { terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); } if (terms.isEmpty()) { return null; } else if (terms.size() == 1) { return terms.get(0); } else { return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); } }
// Fragment: standard Lucene consume pattern (loop body not visible in this chunk).
// The attribute is fetched once up front — it is the same reused instance for the
// stream's lifetime — and reset() is correctly called before the first incrementToken().
CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) {
public static List<String> keywords( String source ) { List<String> keywords = new ArrayList<String>(); TokenStream ts = null; try { ts = analyzer.tokenStream( "keywords", new StringReader( source ) ); ts.reset(); while ( ts.incrementToken() ) { keywords.add( ts.getAttribute( CharTermAttribute.class ).toString() ); } ts.end(); } catch ( IOException e ) { logger.error( "Error getting keywords ", e ); } finally { try { ts.close(); } catch (IOException ignored) {} } return keywords; } }
/**
 * Creates complex boolean query from the cached tokenstream contents.
 * Tokens at the same position (increment 0) are grouped as synonyms; each group
 * is added to the boolean query under {@code operator}.
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder builder = newBooleanQuery();
    List<Term> synonyms = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        int increment = posIncrAtt.getPositionIncrement();
        if (increment != 0) {
            // New position: flush the synonym group accumulated so far.
            add(builder, synonyms, operator);
            synonyms.clear();
        }
        synonyms.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the trailing group (no-op when the stream was empty).
    add(builder, synonyms, operator);
    return builder.build();
}
// Fragment (surrounding method not visible): configure the multi-phrase query
// builder's slop and grab the stream's term/position attributes; `position` starts
// at -1 so the first token's position increment lands it at index 0.
mpqb.setSlop(slop); TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); int position = -1;
/**
 * Creates simple phrase query from the cached tokenstream contents.
 * Token positions honor position increments when {@code enablePositionIncrements}
 * is set; otherwise tokens are packed at consecutive positions.
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
    PhraseQuery.Builder phrase = new PhraseQuery.Builder();
    phrase.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    // Start at -1 so the first token's increment places it at position 0.
    int position = -1;
    while (stream.incrementToken()) {
        position += enablePositionIncrements ? posIncrAtt.getPositionIncrement() : 1;
        phrase.add(new Term(field, termAtt.getBytesRef()), position);
    }
    return phrase.build();
}
@Override public Set<String> segment(String text) { Set<String> result = InsertionOrderUtil.newSet(); Reader reader = new StringReader(text); try (TokenStream tokenStream = ANALYZER.tokenStream("text", reader)) { tokenStream.reset(); CharTermAttribute term = null; while (tokenStream.incrementToken()) { term = tokenStream.getAttribute(CharTermAttribute.class); result.add(term.toString()); } } catch (Exception e) { throw new HugeException("SmartCN segment text '%s' failed", e, text); } return result; } }
/**
 * Builds a phrase query over {@code field} from the stream's tokens.
 * When {@code enablePosIncrements} is true, gaps reported by the stream are
 * preserved in the phrase positions; otherwise positions are consecutive.
 */
@Override
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    // -1 so the first token's increment yields position 0.
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        if (enablePosIncrements) {
            position += posIncrAtt.getPositionIncrement();
        } else {
            position++;
        }
        Term term = new Term(field, termAtt.getBytesRef());
        builder.add(term, position);
    }
    return builder.build();
}
// Finalizes the wrapped stream once it is exhausted: end() must run first so the
// input's attributes carry their final (end-of-stream) state before we snapshot them.
private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream: carry the inner stream's final
    // position increment onto the synthetic end token.
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    // Copy the inner stream's final offsets onto the end token as well.
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
}
/**
 * Verifies the common-tokens analyzer's CJK handling: the sample text should
 * produce 7 distinct tokens, including the CJK bigram "林斯" exactly once.
 */
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog 2000 普林斯顿大学";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        // Straightforward count-up; replaces the original's convoluted
        // "count = (count == null) ? count = 0 : count;" nested assignment.
        Integer count = tokens.get(t);
        tokens.put(t, count == null ? 1 : count + 1);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    // Integer.valueOf instead of the deprecated new Integer(int) constructor.
    assertEquals(Integer.valueOf(1), tokens.get("林斯"));
}
/**
 * Verifies the general analyzer truncates output at the configured token limit:
 * 1,001,000 input words must yield exactly 1,000,000 tokens.
 */
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
    // Presize: 1,001,000 repetitions of "the " = 4,004,000 chars.
    StringBuilder sb = new StringBuilder(4_004_000);
    for (int i = 0; i < 1001000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    // Complete the TokenStream contract (the original leaked the stream and
    // left an unused CharTermAttribute local).
    ts.end();
    ts.close();
    assertEquals(1000000, tokens);
}
/**
 * Verifies the general analyzer lowercases input and keeps stop-like words:
 * "the", "and" and "dog" must all appear among the emitted terms.
 */
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();
    for (String expected : new String[] {"the", "and", "dog"}) {
        assertTrue(seen.contains(expected));
    }
}