Refine search
/**
 * Builds the {@link CollectionStatistics} of a single field.
 *
 * Subclasses may override this, for example to report a field's statistics
 * across a distributed collection.
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;

  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  final boolean hasTerms = fieldTerms != null;

  // A field for which no Terms instance is available reports all-zero statistics.
  final int docCount = hasTerms ? fieldTerms.getDocCount() : 0;
  final long sumTotalTermFreq = hasTerms ? fieldTerms.getSumTotalTermFreq() : 0;
  final long sumDocFreq = hasTerms ? fieldTerms.getSumDocFreq() : 0;

  return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
}
}
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); iter = terms.iterator(); // init uid iterator } while (iter != null && iter.term() != null) { //if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { LOGGER.warning(iter.term().utf8ToString()); } BytesRef next = iter.next(); if (next==null) {iter=null;} } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e); } } } }
/**
 * Creates an iterator over the document ids {@code 0 .. reader.maxDoc() - 1}, restricted to
 * the ids whose bit is set in the reader's live-docs bit set.
 *
 * @return the unfiltered iterator when no live-docs bit set is available, otherwise a
 *         filtering wrapper around it
 */
private DocIdSetIterator iterateAllDocs() {
	final Bits liveDocs = MultiFields.getLiveDocs( reader );
	final DocIdSetIterator everyDocId = DocIdSetIterator.all( reader.maxDoc() );

	if ( liveDocs != null ) {
		// Only let through the ids flagged in the live-docs bit set.
		return new FilteredDocIdSetIterator( everyDocId ) {
			@Override
			protected boolean match( int doc ) {
				return liveDocs.get( doc );
			}
		};
	}

	return everyDocId;
}
}
/*
 * Fragment (truncated; the enclosing method and the end of the loop are not visible here).
 * As written before its line breaks were lost, it walks document ids 0 .. numDocs()-1,
 * skips ids whose live-docs bit is cleared when the reader reports deletions, and computes
 * the distance between each remaining document and the query feature's byte representation.
 * NOTE(review): the loop bound is reader.numDocs() while liveDocs is indexed by document id.
 * If ids can range up to reader.maxDoc() once deletions exist, trailing documents would never
 * be visited - confirm whether maxDoc() was intended as the bound.
 */
Bits liveDocs = MultiFields.getLiveDocs(reader); Document d; double tmpDistance; int docs = reader.numDocs(); byte[] histogram = globalFeature.getByteArrayRepresentation(); for (int i = 0; i < docs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. d = reader.document(i); tmpDistance = getDistance(d, histogram); assert (tmpDistance >= 0);
/*
 * Fragment (truncated and missing interior lines; braces do not balance in this view).
 * As written before its line breaks were lost, it obtains the terms of the QueryBuilder.U
 * field when the reader holds at least one document, positions uidIter with
 * seekCeil(startuid), and clears uidIter when the seek reports SeekStatus.END. The
 * following loop runs while the current term's text starts with startuid.
 * NOTE(review): the result of uFields.terms(...) is dereferenced without a null check -
 * confirm the field is guaranteed to exist.
 */
int numDocs = reader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(reader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.U); uidIter = terms.iterator(); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); //init uid if (stat == TermsEnum.SeekStatus.END) { uidIter = null; while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { reader.close();
/**
 * Computes the inverse document frequency of every term of a field with a custom
 * similarity; use this when you want to supply your own IDF(numDocs, docFreq).
 *
 * @param reader   index reader supplying the field's terms and the document count
 * @param field    name of the field whose terms are enumerated
 * @param tfidfSIM similarity whose {@code idf(docFreq, numDocs)} is applied to each term
 * @return map from term text to its idf value; empty when the field has no terms
 * @throws IOException if reading the index fails
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM) throws IOException {
    Map<String, Float> docFrequencies = new HashMap<>();

    // getTerms yields null for a field that is not indexed; report "no terms" instead of failing.
    org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null) {
        return docFrequencies;
    }

    int numDocs = reader.numDocs(); // loop-invariant
    TermsEnum termEnum = terms.iterator();
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        // next() already leaves the enum positioned on the term, so docFreq() can be read directly.
        docFrequencies.put(bytesRef.utf8ToString(), tfidfSIM.idf(termEnum.docFreq(), numDocs));
    }
    return docFrequencies;
}
/**
 * Maps every term of the {@code "string"} field, in the index resolved for the given label
 * and key, to that term's document frequency.
 *
 * @param label label used to resolve the index reader
 * @param key   key used to resolve the index reader
 * @return term text to document frequency; empty when the field has no terms
 * @throws RuntimeException wrapping any failure raised while reading the index
 */
private Map<String, Integer> distinctTermsCount(@Name("label") String label, @Name("key") String key) {
    try {
        final SortedIndexReader indexReader = getSortedIndexReader(label, key, 0, Sort.INDEXORDER);
        final Fields allFields = MultiFields.getFields(indexReader.getIndexSearcher().getIndexReader());
        final Map<String, Integer> countsByTerm = new HashMap<>();

        final Terms stringTerms = allFields.terms("string");
        if (stringTerms == null) {
            return countsByTerm;
        }

        final TermsEnum cursor = stringTerms.iterator();
        while (cursor.next() != null) {
            countsByTerm.put(cursor.term().utf8ToString(), cursor.docFreq());
        }
        return countsByTerm;
    } catch (Exception e) {
        throw new RuntimeException("Error collecting distinct terms of label: " + label + " and key: " + key, e);
    }
}
/**
 * Collects the text of every term of {@code fieldName} found in the given reader.
 *
 * @param reader index reader to enumerate
 * @return a {@code List<String>} of term texts; empty when the field has no terms
 * @throws IOException if reading the index fails
 */
public Object perform(IndexReader reader) throws IOException {
    // Parameterised instead of raw so the element type is checked by the compiler.
    final List<String> values = new ArrayList<String>();
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        while (termsEnum.next() != null) {
            values.add(termsEnum.term().utf8ToString());
        }
    }
    return values;
}
});
/**
 * Reports this term to the visitor, but only when the term is actually present in the
 * given field of the index.
 */
@Override
public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException {
  /* check term presence in index here for symmetry with other SimpleTerm's */
  final Terms fieldTerms = MultiFields.getTerms(reader, fieldName);
  if (fieldTerms == null) {
    return;
  }

  final TermsEnum cursor = fieldTerms.iterator();
  final boolean present =
      cursor.seekCeil(new BytesRef(getTermText())) == TermsEnum.SeekStatus.FOUND;
  if (present) {
    mtv.visitMatchingTerm(getLuceneTerm(fieldName));
  }
}
}
private void testSearchSpeed(Class<? extends GlobalFeature> featureClass) throws IOException { ParallelIndexer parallelIndexer = new ParallelIndexer(DocumentBuilder.NUM_OF_THREADS, indexPath, testExtensive, true); parallelIndexer.addExtractor(featureClass); parallelIndexer.run(); IndexReader reader = DirectoryReader.open(new RAMDirectory(FSDirectory.open(Paths.get(indexPath)), IOContext.READONCE)); Bits liveDocs = MultiFields.getLiveDocs(reader); double queryCount = 0d; ImageSearcher searcher = new GenericFastImageSearcher(100, featureClass); long ms = System.currentTimeMillis(); String fileName; Document queryDoc; ImageSearchHits hits; for (int i = 0; i < reader.maxDoc(); i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. fileName = getIDfromFileName(reader.document(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]); if (queries.keySet().contains(fileName)) { queryCount += 1d; // ok, we've got a query here for a document ... queryDoc = reader.document(i); hits = searcher.search(queryDoc, reader); } } ms = System.currentTimeMillis() - ms; System.out.printf("%s \t %3.1f \n", featureClass.getName().substring(featureClass.getName().lastIndexOf('.') + 1), (double) ms / queryCount); }
/**
 * Looks up the {@link PostingsEnum} of one term in one field, with the caller choosing
 * through {@code flags} whether freqs are required. Some codecs may be able to optimize
 * their implementation when freqs are not required. See
 * {@link TermsEnum#postings(PostingsEnum,int)}.
 *
 * @return the postings, or {@code null} if the field or the term does not exist
 */
public static PostingsEnum getTermDocsEnum(IndexReader r, String field, BytesRef term, int flags) throws IOException {
  assert field != null;
  assert term != null;

  final Terms terms = getTerms(r, field);
  if (terms == null) {
    return null;
  }

  final TermsEnum cursor = terms.iterator();
  return cursor.seekExact(term) ? cursor.postings(null, flags) : null;
}
/** * Returns the list of terms for the specified field. * * @param reader The index reader * @param field The field * * @return the list of terms for this field * * @throws IOException should any IO error be reported by the {@link IndexReader#terms(Term)} method. */ @Beta public static List<Term> terms(IndexReader reader, String field) throws IOException { LOGGER.debug("Loading terms for field {}", field); org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field); if (terms == null) return Collections.emptyList(); TermsEnum termsEnum = terms.iterator(); if (termsEnum == TermsEnum.EMPTY) return Collections.emptyList(); Map<BytesRef, Term> termsList = new HashMap<BytesRef, Term>(); // TODO use map with byte as key while (termsEnum.next() != null) { BytesRef t = termsEnum.term(); if (t == null) break; termsList.put(t, new Term(field, BytesRef.deepCopyOf(t))); } return new ArrayList<>(termsList.values()); }
// Fragment (truncated; the enclosing method and the closing braces are not visible here).
// Adds to stopWords the text of every term of `field` whose document frequency exceeds
// maxDocFreq.
Terms terms = MultiFields.getTerms(indexReader, field);
// Scratch buffer reused for the UTF-8 to chars conversion of each selected term.
CharsRefBuilder spare = new CharsRefBuilder();
if (terms != null) {
  TermsEnum te = terms.iterator();
  BytesRef text;
  while ((text = te.next()) != null) {
    if (te.docFreq() > maxDocFreq) {
      spare.copyUTF8Bytes(text);
      stopWords.add(spare.toString());
private int countTerms(final MultiNodeTermQuery q) throws Exception { final Terms terms = MultiFields.getTerms(index.reader, q.getField()); if (terms == null) return 0; final TermsEnum termEnum = q.getTermsEnum(terms); assertNotNull(termEnum); int count = 0; BytesRef cur, last = null; while ((cur = termEnum.next()) != null) { count++; if (last != null) { assertTrue(last.compareTo(cur) < 0); } last = BytesRef.deepCopyOf(cur); } // LUCENE-3314: the results after next() already returned null are undefined, // assertNull(termEnum.next()); return count; }
/**
 * Produces placeholder statistics for this field.
 *
 * @return {@code null} when the reader has no field info for this field, otherwise an
 *         empty {@link FieldStats.Text} carrying only the reader's {@code maxDoc}
 */
@Override
public FieldStats stats(IndexReader reader) throws IOException {
    final int maxDoc = reader.maxDoc();
    final FieldInfo fieldInfo =
        org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(name());

    // We don't have a specific type for geo_shape, so an empty FieldStats.Text is used.
    // TODO: we should maybe support a new type that knows how to (de)encode the min/max information
    return fieldInfo == null
        ? null
        : new FieldStats.Text(maxDoc, -1, -1, -1, isSearchable(), isAggregatable());
}
/**
 * Appends to {@code values} the text of every term, in the term's field, that is accepted
 * by the prefix automaton built from the term's bytes.
 *
 * @param reader Index reader to use.
 * @param values The list receiving the matching term texts.
 * @param term   The term supplying both the field and the prefix.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
  final Fields fields = MultiFields.getFields(reader);
  if (fields == null) {
    return;
  }
  final org.apache.lucene.index.Terms fieldTerms = fields.terms(term.field());
  if (fieldTerms == null) {
    return;
  }

  final CompiledAutomaton prefixAutomaton = new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes()));
  // NOTE(review): the prefix bytes are also passed as intersect's start term; if that bound
  // is exclusive, a term exactly equal to the prefix is skipped - confirm this is intended.
  final TermsEnum matches = fieldTerms.intersect(prefixAutomaton, term.bytes());
  for (BytesRef match = matches.next(); match != null; match = matches.next()) {
    values.add(match.utf8ToString());
  }
}
/**
 * Adds to the bucket every term, in the term's field, that is accepted by the prefix
 * automaton built from the term's bytes, together with its document frequency.
 *
 * @param reader Index reader to use.
 * @param bucket The bucket receiving each matching term and its document frequency.
 * @param term   The term supplying both the field and the prefix.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
  Fields fields = MultiFields.getFields(reader);
  org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
  if (terms == null) return;
  TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
  BytesRef val;
  while ((val = prefixes.next()) != null) {
    // The enum is positioned on the term, so its document frequency is read from it
    // directly rather than looking the term up again through the reader.
    int docFreq = prefixes.docFreq();
    // Deep copy: the BytesRef returned by next() may be reused by the enum.
    bucket.add(new Term(term.field(), BytesRef.deepCopyOf(val)), docFreq);
  }
}
/**
 * Builds geo-point field statistics from the reader.
 *
 * @return {@code null} when the reader has no field info for this field; statistics with a
 *         zero document count when the field has no terms; otherwise statistics carrying
 *         the document count, the sum of total term frequencies and the decoded
 *         minimum and maximum terms
 */
@Override
public FieldStats.GeoPoint stats(IndexReader reader) throws IOException {
    final String field = name();

    final FieldInfo fieldInfo =
        org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(field);
    if (fieldInfo == null) {
        return null;
    }

    final Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, field);
    if (terms != null) {
        return new FieldStats.GeoPoint(
            reader.maxDoc(),
            terms.getDocCount(),
            -1L,
            terms.getSumTotalTermFreq(),
            isSearchable(),
            isAggregatable(),
            prefixCodedToGeoPoint(terms.getMin(), numericEncoded),
            prefixCodedToGeoPoint(terms.getMax(), numericEncoded));
    }

    // No terms for the field: report a zero document count and no min/max.
    return new FieldStats.GeoPoint(reader.maxDoc(), 0L, -1L, -1L, isSearchable(), isAggregatable());
}
}
// Fragment (truncated; the enclosing method and the closing brace are not visible here).
// Fetches the postings of the term built from the category path's components in the
// Consts.FULL field.
PostingsEnum docs = MultiFields.getTermDocsEnum(indexReader, Consts.FULL,
    new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
// docs is null-checked before use; when it yields at least one document, that first
// document's id becomes the result.
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  ret = docs.docID();