public final class LuceneUtils {

    public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
        List<String> result = new ArrayList<String>();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
        try {
            stream.reset(); // the TokenStream contract requires reset() before incrementToken()
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(TermAttribute.class).term());
            }
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
        }
        return result;
    }
}
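For comparison, here is a minimal sketch of the same helper against the current attribute API, assuming Lucene 4+ (where TermAttribute was removed in favor of CharTermAttribute and the reset()/end()/close() lifecycle is enforced):

public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // not expected: the input is an in-memory string
    }
    return result;
}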
/**
 * Closes the CPE Index.
 */
@Override
public synchronized void close() {
    // decrement the reference count; only tear the index down once the last user closes it
    // (the original read usageCount.get() - 1 without ever decrementing, so the index leaked)
    final int count = INSTANCE.usageCount.decrementAndGet();
    if (count <= 0) {
        INSTANCE.usageCount.set(0);
        if (searchingAnalyzer != null) {
            searchingAnalyzer.close();
            searchingAnalyzer = null;
        }
        if (indexReader != null) {
            try {
                indexReader.close();
            } catch (IOException ex) {
                LOGGER.trace("", ex);
            }
            indexReader = null;
        }
        queryParser = null;
        indexSearcher = null;
        if (index != null) {
            index.close();
            index = null;
        }
    }
}
tokReader = new StringReader(field.stringValue());
tokens = analyzer.reusableTokenStream(field.name(), tokReader);

if (position > 0) {
    position += analyzer.getPositionIncrementGap(field.name());
}

tokens.reset(); // reset the TokenStream to the first token

offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);

while (tokens.incrementToken()) {
    Term term = new Term(field.name(), termAttribute.toString());
    ThriftTerm tterm = new ThriftTerm(term.field())
            .setText(ByteBuffer.wrap(term.text().getBytes("UTF-8")))
            .setIs_binary(false);

    position += (posIncrAttribute.getPositionIncrement() - 1);
    offsetVector.add(lastOffset + offsetAttribute.startOffset());
    offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
    stream.reset();
    invertState.setAttributeSource(stream);
    termsHashPerField.start(field, first);

    while (stream.incrementToken()) {
        int posIncr = invertState.posIncrAttribute.getPositionIncrement();
        invertState.position += posIncr;
        if (invertState.position < invertState.lastPosition) {
            throw new IllegalArgumentException("position increments (and gaps) must be >= 0 (got " + posIncr
                    + ") for field '" + field.name() + "'");
        }

        int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
        int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
        if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
            throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                    + "and offsets must not go backwards: startOffset=" + startOffset + ", endOffset=" + endOffset);
        }
        // ... write the token to the index ...
    }

    // account for the trailing position increment / final offset reported by end()
    stream.end();
    invertState.position += invertState.posIncrAttribute.getPositionIncrement();
    invertState.offset += invertState.offsetAttribute.endOffset();
}

// add the analyzer-defined gaps between multiple values of the same field
invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
/**
 * Tokenizes the given text with the DAO's analyzer.
 *
 * @param text the text to analyze
 * @return the distinct tokens, in order of first occurrence
 */
public Set<String> getToken(String text) {
    Set<String> list = new LinkedHashSet<>();
    if (CommonUtils.notEmpty(text)) {
        try (StringReader stringReader = new StringReader(text);
                TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            tokenStream.end();
            return list;
        } catch (IOException e) {
            return list;
        }
    }
    return list;
}
@Test
public void testParentPathSearchingTokenization() throws Exception {
    try {
        TokenStream ts = parentPathSearchingAnalyzer.tokenStream("text", new StringReader("/jcr:a/b/jcr:c"));
        assertTokenStreamContents(ts, new String[]{"/jcr:a/b"});
    } finally {
        parentPathSearchingAnalyzer.close();
    }
}
testM.invoke(testC, testA.tokenStream("refs", new StringReader(input)),
        output, null, null, null, null, null, input.length(), true);

System.out.println("Testing full with " + name);
testM.invoke(testC, testA.tokenStream("full", new StringReader(input)),
        output, null, null, null, null, null, input.length(), true);
@SuppressWarnings("MismatchedQueryAndUpdateOfCollection") @Test public void testLemmatization() throws Exception { final TokenStream ts = analyzer.tokenStream("foo", new StringReader("מינהל")); ts.reset(); Set<String> terms = new HashSet<>(); while (ts.incrementToken()) { CharTermAttribute att = ts.getAttribute(CharTermAttribute.class); terms.add(new String(att.buffer(), 0, att.length())); //System.out.println(new String(att.buffer(), 0, att.length())); } }
public static void main(String[] args) throws IOException {
    String TEST_STR = "Hé jij И! раскази и повѣсти. Ст]' Дѣдо Нисторъ. Ива";

    try (Analyzer a = new BLStandardAnalyzer()) {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken()
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }

        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        ts2.reset();
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
    }
}
public class Tokens {

    private static void printTokens(String string, Analyzer analyzer) throws IOException {
        System.out.println("Using " + analyzer.getClass().getName());
        TokenStream ts = analyzer.tokenStream("default", new StringReader(string));
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken()
        while (ts.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println(term + " (" + startOffset + " " + endOffset + ")");
        }
        System.out.println();
    }

    // QueryParser.parse can throw ParseException, so main must declare it
    public static void main(String[] args) throws IOException, ParseException {
        printTokens("foo-bar 1-2-3", new StandardAnalyzer(Version.LUCENE_40));
        printTokens("foo-bar 1-2-3", new ClassicAnalyzer(Version.LUCENE_40));

        QueryParser standardQP = new QueryParser(Version.LUCENE_40, "", new StandardAnalyzer(Version.LUCENE_40));
        BooleanQuery q1 = (BooleanQuery) standardQP.parse("someField:(foo\\-bar\\ 1\\-2\\-3)");
        System.out.println(q1.toString() + " # of clauses:" + q1.getClauses().length);
    }
}
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);
    // ...
private Token[] parseText(String text) throws IOException {
    if (text == null || text.trim().equals(""))
        return new Token[0];

    final ArrayList result = new ArrayList();
    final TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text));
    // pre-2.9 consumption style: next() returns a fresh Token until the stream is exhausted
    for (Token token = ts.next(); token != null; token = ts.next()) {
        result.add(token);
    }
    return (Token[]) result.toArray(new Token[result.size()]);
}
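The Token/next() API above was removed in Lucene 3.0. On any current Lucene the same method would be written against the attribute API; a minimal sketch, assuming Lucene 4+ and the same surrounding class (Token is no longer a consumer-facing type, so this version returns plain term strings):

private String[] parseText(String text) throws IOException {
    if (text == null || text.trim().isEmpty()) {
        return new String[0];
    }
    final List<String> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            result.add(termAtt.toString());
        }
        ts.end();
    }
    return result.toArray(new String[0]);
}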
public static List<String> tokenizedTermValues(Analyzer analyzer, String field, String text) throws IOException {
    final List<String> tokenList = new ArrayList<String>();
    final TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    try {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String s = new String(term.buffer(), 0, term.length());
            tokenList.add(s);
        }
        stream.end();
    } finally {
        stream.close();
    }
    return tokenList;
}
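A quick usage sketch for the helper above; the analyzer choice, field name, and input are illustrative rather than from the original source:

try (Analyzer analyzer = new StandardAnalyzer()) {
    List<String> terms = tokenizedTermValues(analyzer, "title", "Quick Brown Fox");
    System.out.println(terms); // [quick, brown, fox]
}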
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        skipTerms.add(new Term(fieldName, termAtt.toString()));
    }
    ts.end();
}

// alternative path: the terms are read back from the index rather than re-analyzed
BytesRef text;
while ((text = termsEnum.next()) != null) {
    skipTerms.add(new Term(fieldName, text.utf8ToString()));
}
private SToken[] getTokens(String text) throws IOException {
    //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
    //also creating Tokens is suboptimal with 3.0.0, this whole class could be replaced by highlighter
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
@Override
@JsonIgnore
final public Query getQuery(final QueryContext queryContext) throws IOException {
    final BooleanQuery.Builder builder = new BooleanQuery.Builder();
    final String resolvedField = resolveField(queryContext.getFieldMap());
    try (final TokenStream tokenStream = queryContext.getQueryAnalyzer().tokenStream(resolvedField, query_string)) {
        final CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncrAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos = 0;
        while (tokenStream.incrementToken()) {
            final String charTerm = charTermAttribute.toString();
            int start = pos - distance;
            if (start < 0)
                start = 0;
            final int end = pos + distance + 1;
            for (int i = start; i < end; i++) {
                // the boost decays with how far the candidate position i is from the token's own position
                final float dist = Math.abs(i - pos) + 1;
                final float boost = 1 / dist;
                final SpanTermQuery spanTermQuery = new SpanTermQuery(new Term(resolvedField, charTerm));
                Query query = new BoostQuery(new SpanPositionRangeQuery(spanTermQuery, i, i + 1), boost);
                builder.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
            }
            pos += posIncrAttribute.getPositionIncrement();
        }
        return builder.build();
    }
}
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length);
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException {
    ArrayList<Data> results = new ArrayList<>(50);
    TokenStream ts = analyzer1.tokenStream("foo", text);
    ts.reset();
    while (ts.incrementToken()) {
        Data data = new Data();
        OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class);
        data.startOffset = offsetAttribute.startOffset();
        data.endOffset = offsetAttribute.endOffset();
        data.positionLength = ts.getAttribute(PositionLengthAttribute.class).getPositionLength();
        data.positionIncGap = ts.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        data.tokenType = ts.getAttribute(HebrewTokenTypeAttribute.class).getType().toString();
        data.term = ts.getAttribute(CharTermAttribute.class).toString();
        // hasAttribute is the supported way to test for an optional attribute;
        // getAttribute throws rather than returning null when the attribute is absent
        if (ts.hasAttribute(KeywordAttribute.class))
            data.isKeyword = ts.getAttribute(KeywordAttribute.class).isKeyword();
        // System.out.println(data.term + " " + data.tokenType);
        results.add(data);
    }
    ts.close();
    return results;
}
@Override
protected boolean doProcess(Record record) {
    try {
        List outputValues = record.get(outputFieldName);
        for (Object value : record.get(inputFieldName)) {
            reader.setValue(value.toString());
            TokenStream tokenStream = analyzer.tokenStream("content", reader);
            CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                if (token.length() > 0) { // incrementToken() updates the token!
                    String tokenStr = new String(token.buffer(), 0, token.length());
                    outputValues.add(tokenStr);
                }
            }
            tokenStream.end();
            tokenStream.close();
        }
    } catch (IOException e) {
        throw new MorphlineRuntimeException(e);
    }

    // pass record to next command in chain:
    return super.doProcess(record);
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName,
        String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
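A hypothetical call site, to make the contract concrete: the method returns the last token boundary at or before noMatchSize, or -1 when offsets are unavailable or the very first token already crosses the budget. The analyzer, field name, 100-character budget, and fallback below are illustrative, not from the original source:

try (Analyzer analyzer = new StandardAnalyzer()) {
    String contents = "Lucene is a search library. It indexes text and answers queries quickly.";
    int cut = findGoodEndForNoHighlightExcerpt(100, analyzer, "body", contents);
    // fall back to an empty excerpt if no clean token boundary was found
    String excerpt = cut < 0 ? "" : contents.substring(0, cut);
    System.out.println(excerpt);
}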