Refine search
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); iter = terms.iterator(); // init uid iterator } while (iter != null && iter.term() != null) { //if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { LOGGER.warning(iter.term().utf8ToString()); } BytesRef next = iter.next(); if (next==null) {iter=null;} } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e); } } } }
/** Returns the total number of occurrences of <code>term</code> across
 * all documents (the sum of the term's frequency in each document that
 * contains it). This method returns 0 if the term or field does not
 * exist. This method does not take into account deleted documents that
 * have not yet been merged away. */
@Override
public final long totalTermFreq(Term term) throws IOException {
    final Terms terms = terms(term.field());
    if (terms == null) {
        return 0;
    }
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
        return termsEnum.totalTermFreq();
    } else {
        return 0;
    }
}
/** Returns {@link PostingsEnum} for the specified field and
 * term, with control over whether freqs are required.
 * Some codecs may be able to optimize their
 * implementation when freqs are not required. This will
 * return null if the field or term does not exist. See {@link
 * TermsEnum#postings(PostingsEnum,int)}.*/
public static PostingsEnum getTermDocsEnum(IndexReader r, String field, BytesRef term, int flags) throws IOException {
    assert field != null;
    assert term != null;
    final Terms terms = getTerms(r, field);
    if (terms == null) {
        return null;
    }
    final TermsEnum termsEnum = terms.iterator();
    // Only materialize postings once the term is confirmed to exist.
    return termsEnum.seekExact(term) ? termsEnum.postings(null, flags) : null;
}
/** Returns an expected cost in simple operations * of processing the occurrences of a term * in a document that contains the term. * This is for use by {@link TwoPhaseIterator#matchCost} implementations. * <br>This may be inaccurate when {@link TermsEnum#totalTermFreq()} is not available. * @param termsEnum The term is the term at which this TermsEnum is positioned. */ static float termPositionsCost(TermsEnum termsEnum) throws IOException { int docFreq = termsEnum.docFreq(); assert docFreq > 0; long totalTermFreq = termsEnum.totalTermFreq(); // -1 when not available float expOccurrencesInMatchingDoc = (totalTermFreq < docFreq) ? 1 : (totalTermFreq / (float) docFreq); return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS; }
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException { // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually. List<LeafReaderContext> leaves = reader.leaves(); for (LeafReaderContext leaf : leaves) { Terms _terms = leaf.reader().terms(field); if (_terms == null) { continue; } TermsEnum termsEnum = _terms.iterator(); TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes()); if (TermsEnum.SeekStatus.END == seekStatus) { continue; } for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) { if (!StringHelper.startsWith(term, prefix.bytes())) { break; } terms.add(new Term(field, BytesRef.deepCopyOf(term))); if (terms.size() >= maxExpansions) { return; } } } }
final BytesRef term = termsEnum.next(); if (term == null) { break; final int docFreq = termsEnum.docFreq(); if (docFreq <= 0) { throw new RuntimeException("docfreq: " + docFreq + " is out of bounds"); postings = termsEnum.postings(postings, PostingsEnum.ALL); if (termsEnum.totalTermFreq() != -1) { throw new RuntimeException("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.totalTermFreq() + " (should be -1)"); long ord = -1; try { ord = termsEnum.ord(); } catch (UnsupportedOperationException uoe) { hasOrd = false; final long totalTermFreq2 = termsEnum.totalTermFreq(); final boolean hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1; for(int idx=0;idx<7;idx++) { final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8); postings = termsEnum.postings(postings, PostingsEnum.ALL); for(int idx=0;idx<7;idx++) { final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8); postings = termsEnum.postings(postings, PostingsEnum.NONE);
int numDocs = reader.numDocs(); if (numDocs > 0) { uidIter = terms.iterator(); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); //init uid if (stat == TermsEnum.SeekStatus.END) { uidIter = null; while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { BytesRef next = uidIter.next(); if (next == null) { uidIter=null; reader.close();
final Terms terms = context.reader().terms(query.field); if (terms == null) { final TermContext termContext = new TermContext(searcher.getTopReaderContext()); termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq); bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD); DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms); if (collectedTerms.isEmpty() == false) { TermsEnum termsEnum2 = terms.iterator(); for (TermAndState t : collectedTerms) { termsEnum2.seekExact(t.term, t.state); docs = termsEnum2.postings(docs, PostingsEnum.NONE); builder.add(docs); docs = termsEnum.postings(docs, PostingsEnum.NONE); builder.add(docs); } while (termsEnum.next() != null);
final boolean hasPositions = terms.hasPositions(); final boolean hasOffsets = terms.hasOffsets(); final boolean hasPayloads = terms.hasPayloads(); assert !hasPayloads || hasPositions; termsEnum = terms.iterator(); while(termsEnum.next() != null) { numTerms++; while(termsEnum.next() != null) { termCount++; final int freq = (int) termsEnum.totalTermFreq(); startTerm(termsEnum.term(), freq); docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS); assert docsAndPositionsEnum != null; final int docID = docsAndPositionsEnum.nextDoc(); assert docID != DocIdSetIterator.NO_MORE_DOCS; assert docsAndPositionsEnum.freq() == freq; final int pos = docsAndPositionsEnum.nextPosition(); final int startOffset = docsAndPositionsEnum.startOffset(); final int endOffset = docsAndPositionsEnum.endOffset();
/**
 * Loads into the bucket every term in the index that begins with the given
 * term's bytes (prefix expansion), paired with that term's document frequency.
 *
 * @param reader Index reader to use.
 * @param bucket Receives each matching term together with its docFreq.
 * @param term   The prefix term (field plus leading bytes) to expand.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    // NOTE(review): passing term.bytes() as the startTerm positions the
    // intersect enumeration relative to that exact term — confirm whether the
    // exact prefix term itself is intentionally excluded from the results.
    TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        // deepCopyOf: the enum reuses its BytesRef, so a copy is required
        // before the bytes are stored in a Term.
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
    }
}
for ( LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves() ) Fields fields = leafReaderContext.reader().fields(); for ( String field : fields ) while ( (termsRef = terms.next()) != null ) if ( terms.docFreq() > 1 ) collector.init( terms.docFreq() ); searcher.search( new TermQuery( new Term( field, termsRef ) ), collector );
/**
 * Builds an in-memory term vector for the "content" field of the given stored
 * document and prints each term's text, docFreq, and totalTermFreq within the
 * single-document {@link MemoryIndex}.
 *
 * @param docid id (within {@code reader}) of the document to analyze
 * @throws IOException if the stored document cannot be loaded
 */
public void buildTermVector(int docid) throws IOException {
    Set<String> fieldList = new HashSet<>();
    fieldList.add("content");
    Document doc = reader.document(docid, fieldList);
    MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer());
    // try-with-resources: the transient reader was previously leaked.
    try (IndexReader mr = mi.createSearcher().getIndexReader()) {
        if (mr.leaves().isEmpty()) {
            return; // nothing was indexed for this document
        }
        Terms t = mr.leaves().get(0).reader().terms("content");
        if ((t != null) && (t.size() > 0)) {
            TermsEnum te = t.iterator();
            System.out.println(t.size());
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println("BytesRef: " + term.utf8ToString());
                System.out.println("docFreq: " + te.docFreq());
                System.out.println("totalTermFreq: " + te.totalTermFreq());
            }
        }
    }
}
/**
 * Verifies index uniqueness: for every field used for uniqueness
 * verification, any term that occurs in more than one document triggers a
 * duplicate search with that term's query.
 */
@Override
public void verify( NodePropertyAccessor accessor, int[] propKeyIds ) throws IndexEntryConflictException, IOException
{
    for ( String field : allFields() )
    {
        if ( !LuceneDocumentStructure.useFieldForUniquenessVerification( field ) )
        {
            continue;
        }
        TermsEnum terms = LuceneDocumentStructure.originalTerms( termsForField( field ), field );
        for ( BytesRef termsRef = terms.next(); termsRef != null; termsRef = terms.next() )
        {
            int frequency = terms.docFreq();
            if ( frequency > 1 )
            {
                TermQuery query = new TermQuery( new Term( field, termsRef ) );
                searchForDuplicates( query, accessor, propKeyIds, frequency );
            }
        }
    }
}
/**
 * Remove a stale file (uidIter.term().text()) from the index database and
 * history cache, and queue the removal of xref.
 *
 * @param removeHistory if false, do not remove history cache for this file
 * @throws java.io.IOException if an error occurs
 */
private void removeFile(boolean removeHistory) throws IOException {
    final String path = Util.uid2url(uidIter.term().utf8ToString());

    // Announce the pending removal before touching the index.
    for (IndexChangedListener listener : listeners) {
        listener.fileRemove(path);
    }

    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
    removeXrefFile(path);
    if (removeHistory) {
        removeHistoryFile(path);
    }
    setDirty();

    // Announce completion of the removal.
    for (IndexChangedListener listener : listeners) {
        listener.fileRemoved(path);
    }
}
/**
 * Explains this feature's score for one document. Produces a no-match
 * explanation when the field is missing, the feature term is absent, or no
 * posting exists for the document; otherwise delegates to the scoring
 * function with the posting's frequency.
 */
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
    final String desc = "weight(" + getQuery() + " in " + doc + ") [" + function + "]";
    final Terms terms = context.reader().terms(fieldName);
    if (terms == null) {
        return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist.");
    }
    final TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(featureName))) {
        return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist.");
    }
    final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    if (postings.advance(doc) != doc) {
        return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set.");
    }
    return function.explain(fieldName, featureName, boost, doc, postings.freq());
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException { // we build an artificial TermContext that will give an overall df and ttf // equal to 1 TermContext context = new TermContext(reader.getContext()); for (LeafReaderContext leafContext : reader.leaves()) { Terms terms = leafContext.reader().terms(term.field()); if (terms != null) { TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(term.bytes())) { int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1 context.register(termsEnum.termState(), leafContext.ord, freq, freq); } } } return new TermQuery(term, context); }
/**
 * Creates the {@link Spans} for this term over one leaf reader, using the
 * {@link TermState} precomputed in the top-level term context. Returns
 * {@code null} when the term is absent from this leaf. Throws
 * {@link IllegalStateException} when the field lacks position data, which
 * span queries require.
 */
@Override
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
    // The cached term context must have been built against this leaf's
    // top-level reader, or the cached term states would be invalid.
    assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
    final TermState state = termContext.get(context.ord);
    if (state == null) { // term is not present in that reader
        assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
        return null;
    }
    final Terms terms = context.reader().terms(term.field());
    if (terms == null)
        return null;
    if (terms.hasPositions() == false)
        throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
    final TermsEnum termsEnum = terms.iterator();
    // Seek using the cached state — avoids a second dictionary lookup.
    termsEnum.seekExact(term.bytes(), state);
    final PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings());
    float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST;
    return new TermSpans(getSimScorer(context), postings, term, positionsCost);
}
}
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j); TermsEnum postingsTermsEnum = postingsTerms.iterator(); final boolean hasProx = terms.hasOffsets() || terms.hasPositions(); BytesRef term = null; while ((term = termsEnum.next()) != null) { postings = termsEnum.postings(postings, PostingsEnum.ALL); assert postings != null; if (!postingsTermsEnum.seekExact(term)) { throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j); postingsDocs = postingsTermsEnum.postings(postingsDocs, PostingsEnum.ALL); assert postingsDocs != null; final int advanceDoc = postingsDocs.advance(j); if (advanceDoc != j) { throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")"); final int doc = postings.nextDoc(); final int tf = postings.freq(); if (postingsHasFreq && postingsDocs.freq() != tf) { throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs.freq());