org.apache.lucene.index.MultiFields java code examples

Refine search

 /**
  * Computes the {@link CollectionStatistics} for the given field.
  *
  * Subclasses may override this, for example, to report a field's statistics
  * across a distributed collection.
  * @lucene.experimental
  */
 public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;

  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  // When no Terms are available for the field, every statistic is reported as zero.
  final boolean hasTerms = fieldTerms != null;
  final int docCount = hasTerms ? fieldTerms.getDocCount() : 0;
  final long sumTotalTermFreq = hasTerms ? fieldTerms.getSumTotalTermFreq() : 0;
  final long sumDocFreq = hasTerms ? fieldTerms.getSumDocFreq() : 0;

  return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
 }
}

/**
 * Logs, at WARNING level, every term of the {@code QueryBuilder.DEFS} field that
 * occurs in more than 16 documents and whose text is longer than {@code freq}
 * characters.
 *
 * @param freq length threshold (exclusive); despite its name it is compared
 *             against the length of the term text, not against a frequency
 * @throws IOException if the index cannot be opened or read
 */
public void listTokens(int freq) throws IOException {
  IndexReader ireader = null;
  try {
    ireader = DirectoryReader.open(indexDirectory);
    if (ireader.numDocs() > 0) {
      Fields uFields = MultiFields.getFields(ireader);
      // Either lookup may yield null when the index has no postings for the field.
      Terms terms = uFields == null ? null : uFields.terms(QueryBuilder.DEFS);
      if (terms != null) {
        TermsEnum iter = terms.iterator();
        // A TermsEnum is unpositioned until next() is called, so advance first
        // and only then read the current term and its document frequency.
        BytesRef term;
        while ((term = iter.next()) != null) {
          if (iter.docFreq() > 16) {
            String text = term.utf8ToString();
            if (text.length() > freq) {
              LOGGER.warning(text);
            }
          }
        }
      }
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        // Best effort: a failure to close must not mask an exception from the try block.
        LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}

  /**
   * Builds an iterator over the document ids of {@code reader}, from 0 up to
   * {@code reader.maxDoc()}. If a live-docs bit set is available, only ids whose
   * bit is set are let through; otherwise the plain iterator is returned.
   */
  private DocIdSetIterator iterateAllDocs()
  {
    final Bits live = MultiFields.getLiveDocs( reader );
    final DocIdSetIterator everyDoc = DocIdSetIterator.all( reader.maxDoc() );
    if ( live != null )
    {
      // Wrap the full range so that ids without a set live bit are skipped.
      return new FilteredDocIdSetIterator( everyDoc )
      {
        @Override
        protected boolean match( int doc )
        {
          return live.get( doc );
        }
      };
    }

    return everyDoc;
  }
}

// Scan the index document by document, computing the distance between each
// stored document and the byte representation of the query feature.
Bits liveDocs = MultiFields.getLiveDocs(reader);
Document d;
double tmpDistance;
// NOTE(review): the loop bound below is numDocs(), yet deleted entries are skipped
// inside the loop. If document ids run up to maxDoc(), ids beyond numDocs() would
// never be visited once deletions exist — confirm whether maxDoc() was intended.
int docs = reader.numDocs();
byte[] histogram = globalFeature.getByteArrayRepresentation();
for (int i = 0; i < docs; i++) {
  // liveDocs is only consulted when the reader reports deletions.
  if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
  d = reader.document(i);
  tmpDistance = getDistance(d, histogram);
  // Distances are expected to be non-negative.
  assert (tmpDistance >= 0);

// Only set up the uid term iterator when the index holds at least one document.
int numDocs = reader.numDocs();
if (numDocs > 0) {
  Fields uFields = MultiFields.getFields(reader);//reader.getTermVectors(0);
  terms = uFields.terms(QueryBuilder.U);
    uidIter = terms.iterator();
    // Seek towards startuid; the resulting status decides whether iteration happens at all.
    TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); //init uid
    // When the seek reports END the iterator is dropped, so the loop below is skipped.
    if (stat == TermsEnum.SeekStatus.END) {
      uidIter = null;
  // Continue only while the current term's text starts with startuid.
  while (uidIter != null && uidIter.term() != null
    && uidIter.term().utf8ToString().startsWith(startuid)) {
  reader.close();

/**
 * Computes an IDF value for every term of a field using a custom similarity.
 * Use this if you want to supply your own IDF(docFreq, numDocs) implementation.
 *
 * @param reader   index reader supplying the terms and the document count
 * @param field    name of the field whose terms are enumerated
 * @param tfidfSIM similarity whose {@code idf} method computes each value
 * @return map from term text to its IDF value; empty when no terms are
 *         available for the field
 * @throws IOException if the index cannot be read
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field,
    TFIDFSimilarity tfidfSIM) throws IOException {
  Map<String, Float> idfByTerm = new HashMap<>();
  org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    // Nothing to enumerate: return an empty map instead of failing with an NPE.
    return idfByTerm;
  }
  // The document count does not change while iterating, so read it once.
  final int numDocs = reader.numDocs();
  TermsEnum termEnum = terms.iterator();
  BytesRef bytesRef;
  // next() already positions the enum on the returned term; no extra seek is needed.
  while ((bytesRef = termEnum.next()) != null) {
    idfByTerm.put(bytesRef.utf8ToString(), tfidfSIM.idf(termEnum.docFreq(), numDocs));
  }
  return idfByTerm;
}

/**
 * Collects, for the index identified by {@code label} and {@code key}, every term
 * of the "string" field together with its document frequency.
 *
 * @return map from term text to document frequency; empty when the "string"
 *         field has no terms
 * @throws RuntimeException wrapping any failure raised while reading the index
 */
private Map<String, Integer> distinctTermsCount(@Name("label") String label, @Name("key") String key) {
  try {
    SortedIndexReader sortedIndexReader = getSortedIndexReader(label, key, 0, Sort.INDEXORDER);
    Fields fields = MultiFields.getFields(sortedIndexReader.getIndexSearcher().getIndexReader());
    Map<String, Integer> docFreqByTerm = new HashMap<>();
    Terms stringTerms = fields.terms("string");
    if (stringTerms == null) {
      return docFreqByTerm;
    }
    TermsEnum cursor = stringTerms.iterator();
    // Advance first, then read the term and frequency at the current position.
    while (cursor.next() != null) {
      String text = cursor.term().utf8ToString();
      docFreqByTerm.put(text, cursor.docFreq());
    }
    return docFreqByTerm;
  } catch (Exception e) {
    throw new RuntimeException("Error collecting distinct terms of label: " + label + " and key: " + key, e);
  }
}

  /**
   * Collects the text of every term of {@code fieldName} in the given reader.
   *
   * @param reader the index reader to enumerate
   * @return a {@code List<String>} of term texts in enumeration order; empty
   *         when no terms are available for the field
   * @throws IOException if the index cannot be read
   */
  public Object perform(IndexReader reader) throws IOException
  {
    // Parameterized instead of a raw List so the element type is checked at compile time.
    List<String> values = new ArrayList<String>();
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null)
    {
      TermsEnum termsEnum = terms.iterator(null);
      while (termsEnum.next() != null)
      {
        values.add(termsEnum.term().utf8ToString());
      }
    }
    return values;
  }
});

 /**
  * Hands this term to the visitor only when seeking to its text in the given
  * field reports an exact match; otherwise the visitor is not called.
  */
 @Override
 public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
 {
  /* check term presence in index here for symmetry with other SimpleTerm's */
  final Terms fieldTerms = MultiFields.getTerms(reader, fieldName);
  if (fieldTerms == null) {
   return;
  }

  final TermsEnum cursor = fieldTerms.iterator();
  final BytesRef wanted = new BytesRef(getTermText());
  if (cursor.seekCeil(wanted) == TermsEnum.SeekStatus.FOUND) {
   mtv.visitMatchingTerm(getLuceneTerm(fieldName));
  }
 }
}

/**
 * Indexes the test collection with the given feature, then runs one search for
 * every non-deleted document whose identifier is a key of {@code queries}, and
 * prints the feature's simple class name followed by the elapsed milliseconds
 * divided by the number of queries.
 *
 * @param featureClass the global feature to index and search with
 * @throws IOException if indexing, opening the index or searching fails
 */
private void testSearchSpeed(Class<? extends GlobalFeature> featureClass) throws IOException {
  ParallelIndexer parallelIndexer = new ParallelIndexer(DocumentBuilder.NUM_OF_THREADS, indexPath, testExtensive, true);
  parallelIndexer.addExtractor(featureClass);
  parallelIndexer.run();
  double queryCount = 0d;
  long ms;
  // try-with-resources: the reader is released even when a search throws.
  try (IndexReader reader = DirectoryReader.open(new RAMDirectory(FSDirectory.open(Paths.get(indexPath)), IOContext.READONCE))) {
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    ImageSearcher searcher = new GenericFastImageSearcher(100, featureClass);
    ms = System.currentTimeMillis();
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      // Load the stored document once and reuse it as the query document.
      Document candidate = reader.document(i);
      String fileName = getIDfromFileName(candidate.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
      if (queries.keySet().contains(fileName)) {
        queryCount += 1d;
        // ok, we've got a query here for a document ...
        // Only the elapsed time matters here; the hits themselves are not inspected.
        searcher.search(candidate, reader);
      }
    }
    ms = System.currentTimeMillis() - ms;
  }
  // Floating-point division: with zero queries this prints NaN or Infinity rather than failing.
  System.out.printf("%s \t %3.1f \n", featureClass.getName().substring(featureClass.getName().lastIndexOf('.') + 1), (double) ms / queryCount);
}

/** Looks up the {@link PostingsEnum} of a single term in a single field.
 *  The {@code flags} argument controls whether freqs are required; some
 *  codecs may be able to optimize their implementation when they are not.
 *  Returns null if the field or term does not exist.  See {@link
 *  TermsEnum#postings(PostingsEnum,int)}.*/
public static PostingsEnum getTermDocsEnum(IndexReader r, String field, BytesRef term, int flags) throws IOException {
 assert field != null;
 assert term != null;
 final Terms fieldTerms = getTerms(r, field);
 if (fieldTerms == null) {
  return null;
 }
 final TermsEnum cursor = fieldTerms.iterator();
 // Postings are only handed out when the enum lands exactly on the requested term.
 return cursor.seekExact(term) ? cursor.postings(null, flags) : null;
}

/**
 * Returns the list of terms for the specified field.
 *
 * <p>Terms are returned in the order the underlying {@link TermsEnum} yields
 * them. Each returned {@link Term} owns a private copy of its bytes, because
 * the enumeration may reuse the {@link BytesRef} it hands out.
 *
 * @param reader The index reader
 * @param field  The field
 *
 * @return the list of terms for this field; empty if the field has no terms
 *
 * @throws IOException should any IO error occur while reading the terms from the index.
 */
@Beta public static List<Term> terms(IndexReader reader, String field) throws IOException {
 LOGGER.debug("Loading terms for field {}", field);
 org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
 if (terms == null) {
  return Collections.emptyList();
 }
 TermsEnum termsEnum = terms.iterator();
 if (termsEnum == TermsEnum.EMPTY) {
  return Collections.emptyList();
 }
 // Append straight to a list: no intermediate map keyed on the enum's reusable
 // BytesRef is needed, and enumeration order is kept.
 List<Term> termsList = new ArrayList<>();
 BytesRef t;
 while ((t = termsEnum.next()) != null) {
  termsList.add(new Term(field, BytesRef.deepCopyOf(t)));
 }
 return termsList;
}

// Add to stopWords every term of the field whose document frequency exceeds maxDocFreq.
Terms terms = MultiFields.getTerms(indexReader, field);
// Scratch builder reused across iterations to turn term bytes into a String.
CharsRefBuilder spare = new CharsRefBuilder();
// A null Terms means there is nothing to scan for this field.
if (terms != null) {
 TermsEnum te = terms.iterator();
 BytesRef text;
 while ((text = te.next()) != null) {
  if (te.docFreq() > maxDocFreq) {
   // Copy the term's UTF-8 bytes into the scratch builder, then store the resulting String.
   spare.copyUTF8Bytes(text);
   stopWords.add(spare.toString());

/**
 * Counts the terms that the query's terms enum yields for its field, asserting
 * along the way that they arrive in strictly increasing order.
 *
 * @return the number of terms enumerated, or 0 when the field has no terms
 */
private int countTerms(final MultiNodeTermQuery q) throws Exception {
 final Terms fieldTerms = MultiFields.getTerms(index.reader, q.getField());
 if (fieldTerms == null) {
  return 0;
 }
 final TermsEnum termEnum = q.getTermsEnum(fieldTerms);
 assertNotNull(termEnum);
 int seen = 0;
 BytesRef previous = null;
 for (BytesRef current = termEnum.next(); current != null; current = termEnum.next()) {
  seen++;
  if (previous != null) {
   assertTrue(previous.compareTo(current) < 0);
  }
  // Keep a private copy so the next comparison does not depend on the enum's returned reference.
  previous = BytesRef.deepCopyOf(current);
 }
 // LUCENE-3314: the results after next() already returned null are undefined,
 // assertNull(termEnum.next());
 return seen;
}

/**
 * Returns placeholder statistics for this field: null when the reader's merged
 * field infos do not contain it, otherwise a {@link FieldStats.Text} carrying
 * only the reader's maxDoc and the searchable/aggregatable flags.
 */
@Override
public FieldStats stats(IndexReader reader) throws IOException {
  final int maxDoc = reader.maxDoc();
  final boolean fieldMissing =
      org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(name()) == null;
  if (fieldMissing) {
    return null;
  }
  // We don't have a specific type for geo_shape, so an otherwise empty FieldStats.Text is used.
  // TODO: we should maybe support a new type that knows how to (de)encode the min/max information
  return new FieldStats.Text(maxDoc, -1, -1, -1, isSearchable(), isAggregatable());
}

HighFrequencyIterator() throws IOException {
 Terms terms = MultiFields.getTerms(reader, field);
 if (terms != null) {
  termsEnum = terms.iterator();
 } else {
  termsEnum = null;
 }
 minNumDocs = (int)(thresh * (float)reader.numDocs());
}

/**
 * Loads all the terms of the term's field that start with the term's text
 * into the given list of values.
 *
 * <p>A term that is exactly equal to the prefix is included.
 *
 * @param reader  Index reader to use.
 * @param values  The list of values to load; matching term texts are appended to it.
 * @param term    The term providing the field and the prefix.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
 Fields fields = MultiFields.getFields(reader);
 org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
 if (terms == null) return;
 // No start term: intersect() only returns terms strictly greater than a supplied
 // start term, which would drop a term exactly equal to the prefix.
 TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), null);
 BytesRef val;
 while ((val = prefixes.next()) != null) {
  values.add(val.utf8ToString());
 }
}

/**
 * Loads all the terms of the term's field that start with the term's text
 * into the given bucket, each with its document frequency.
 *
 * <p>A term that is exactly equal to the prefix is included.
 *
 * @param reader  Index reader to use.
 * @param bucket  The bucket to load; each matching term is added with its document frequency.
 * @param term    The term providing the field and the prefix.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
 Fields fields = MultiFields.getFields(reader);
 org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
 if (terms == null) return;
 // No start term: intersect() only returns terms strictly greater than a supplied
 // start term, which would drop a term exactly equal to the prefix.
 TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), null);
 BytesRef val;
 while ((val = prefixes.next()) != null) {
  // Copy the bytes: the Term must not share the BytesRef handed out by the enumeration.
  Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
  bucket.add(t, reader.docFreq(t));
 }
}

  /**
   * Builds geo-point statistics for this field: null when the reader's merged
   * field infos do not contain it, count-only statistics when it has no terms,
   * and otherwise statistics that include the decoded minimum and maximum terms.
   */
  @Override
  public FieldStats.GeoPoint stats(IndexReader reader) throws IOException {
    final String fieldName = name();
    if (org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(fieldName) == null) {
      return null;
    }
    final Terms fieldTerms = org.apache.lucene.index.MultiFields.getTerms(reader, fieldName);
    if (fieldTerms != null) {
      return new FieldStats.GeoPoint(reader.maxDoc(), fieldTerms.getDocCount(), -1L,
        fieldTerms.getSumTotalTermFreq(), isSearchable(), isAggregatable(),
        prefixCodedToGeoPoint(fieldTerms.getMin(), numericEncoded),
        prefixCodedToGeoPoint(fieldTerms.getMax(), numericEncoded));
    }
    // The field is known but has no terms: report a zero doc count and no min/max.
    return new FieldStats.GeoPoint(reader.maxDoc(), 0L, -1L, -1L, isSearchable(), isAggregatable());
  }
}

// Fetch the postings for the term built from the category path's string form in the Consts.FULL field (flags = 0).
PostingsEnum docs = MultiFields.getTermDocsEnum(indexReader, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
// A null enum or an immediately exhausted one means no document carries the term.
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
 // Use the id of the first document found.
 ret = docs.docID();

Javadoc

Exposes the flex API, merged from the flex API of sub-segments. This is useful when you're interacting with an IndexReader implementation that consists of sequential sub-readers (e.g. DirectoryReader or MultiReader).

NOTE: for composite readers, you'll get better performance by gathering the sub readers using IndexReader#getContext() to get the atomic leaves and then operate per-LeafReader, instead of using this class.

Most used methods

getTerms
This method may return null if the field does not exist or if it has no terms.
getFields
Returns a single Fields instance for this reader, merging fields/terms/docs/positions on the fly. This method will return null if the reader has no postings.
getLiveDocs
Returns a single Bits instance for this reader, merging live Documents on the fly. This method will return null if the reader has no deletions.
getIndexedFields
Call this to get the (merged) FieldInfos representing the set of indexed fields only for a composite reader.
getMergedFieldInfos
Call this to get the (merged) FieldInfos for a composite reader. NOTE: the returned field numbers will likely not correspond to the actual field numbers in the underlying readers.
getTermDocsEnum
Returns PostingsEnum for the specified field and term, with control over whether freqs are required.
getTermPositionsEnum
Returns PostingsEnum for the specified field and term, with control over whether offsets and payloads are required.
<init>
Expert: construct a new MultiFields instance directly.

Popular in Java

Start an intent from android
getExternalFilesDir (Context)
onRequestPermissionsResult (Fragment)
requestLocationUpdates (LocationManager)
FileInputStream (java.io)
An input stream that reads bytes from a file. File file = ...finally if (in != null) in.clos
String (java.lang)
Iterator (java.util)
An iterator over a sequence of objects, such as a collection.If a collection has been changed since
TimeUnit (java.util.concurrent)
A TimeUnit represents time durations at a given unit of granularity and provides utility methods to
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
Cipher (javax.crypto)
This class provides access to implementations of cryptographic ciphers for encryption and decryption
Best IntelliJ plugins

How to use MultiFields in org.apache.lucene.index

Best Java code snippets using org.apache.lucene.index.MultiFields (Showing top 20 results out of 468)

Refine search

How to use
MultiFields
in
org.apache.lucene.index