/** This method may return null if the field does not exist or if it has no terms. */
public static Terms getTerms(IndexReader r, String field) throws IOException {
  final List<LeafReaderContext> leaves = r.leaves();
  if (leaves.size() == 1) {
    return leaves.get(0).reader().terms(field);
  }
  final List<Terms> termsPerLeaf = new ArrayList<>(leaves.size());
  final List<ReaderSlice> slicePerLeaf = new ArrayList<>(leaves.size());
  for (int leafIdx = 0; leafIdx < leaves.size(); leafIdx++) {
    LeafReaderContext ctx = leaves.get(leafIdx);
    Terms subTerms = ctx.reader().terms(field);
    if (subTerms != null) {
      termsPerLeaf.add(subTerms);
      // the slice index must identify this leaf; leafIdx - 1 would be off by one
      slicePerLeaf.add(new ReaderSlice(ctx.docBase, r.maxDoc(), leafIdx));
    }
  }
  if (termsPerLeaf.size() == 0) {
    return null;
  } else {
    return new MultiTerms(termsPerLeaf.toArray(Terms.EMPTY_ARRAY),
        slicePerLeaf.toArray(ReaderSlice.EMPTY_ARRAY));
  }
}
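// Usage sketch (not part of the original source; `reader` and the "body" field are
// assumed for illustration): iterate every unique term of a field through the
// merged, segment-spanning view that getTerms returns.
Terms merged = getTerms(reader, "body");
if (merged != null) {
  TermsEnum it = merged.iterator();
  for (BytesRef term = it.next(); term != null; term = it.next()) {
    System.out.println(term.utf8ToString() + " docFreq=" + it.docFreq());
  }
}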
@Override
public boolean test(LeafReaderContext context) {
  final int maxDoc = context.reader().maxDoc();
  if (maxDoc < minSize) {
    return false;
  }
  final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(context);
  final float sizeRatio = (float) maxDoc / topLevelContext.reader().maxDoc();
  return sizeRatio >= minSizeRatio;
}
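// Usage sketch (hypothetical `sizePredicate` instance of the class above): filter
// leaves before running a per-segment operation, keeping only segments above both
// the absolute floor (minSize) and the relative floor (minSizeRatio).
for (LeafReaderContext leaf : reader.leaves()) {
  if (sizePredicate.test(leaf)) {
    // segment has >= minSize docs and makes up >= minSizeRatio of the index
  }
}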
/**
 * @return the {@code NumericDocValues} for the given field
 * @throws IllegalStateException if the field is not indexed with numeric doc values
 */
public NumericDocValues readDocValues( String field )
{
    try
    {
        NumericDocValues dv = context.reader().getNumericDocValues( field );
        if ( dv == null )
        {
            FieldInfo fi = context.reader().getFieldInfos().fieldInfo( field );
            DocValuesType actual = fi == null ? null : fi.getDocValuesType();
            throw new IllegalStateException(
                    "The field '" + field + "' is not indexed properly, expected NumericDV, but got '" + actual + "'" );
        }
        return dv;
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
}
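// Usage sketch (the "price" field is a hypothetical example): NumericDocValues is a
// forward-only DocIdSetIterator, so advance it document by document and read
// longValue() at each position; documents without a value are simply skipped.
NumericDocValues dv = readDocValues( "price" );
for ( int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc() )
{
    long value = dv.longValue();
    // consume (doc, value)
}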
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
final DocIdSetIterator iterator;
if (fieldInfo != null) {
  switch (fieldInfo.getDocValuesType()) {
    case NONE:
      iterator = null;
      break;
    case NUMERIC:
      iterator = reader.getNumericDocValues(field);
      break;
    case BINARY:
      iterator = reader.getBinaryDocValues(field);
      break;
    case SORTED:
      iterator = reader.getSortedDocValues(field);
      break;
    case SORTED_NUMERIC:
      iterator = reader.getSortedNumericDocValues(field);
      break;
    case SORTED_SET:
      iterator = reader.getSortedSetDocValues(field);
      break;
    default:
      throw new AssertionError("unknown doc values type " + fieldInfo.getDocValuesType());
  }
} else {
  iterator = null;
}
private DocMap[] buildDeletionDocMaps(List<CodecReader> readers) {
  int totalDocs = 0;
  int numReaders = readers.size();
  DocMap[] docMaps = new DocMap[numReaders];
  for (int i = 0; i < numReaders; i++) {
    LeafReader reader = readers.get(i);
    Bits liveDocs = reader.getLiveDocs();
    final PackedLongValues delDocMap;
    if (liveDocs != null) {
      delDocMap = removeDeletes(reader.maxDoc(), liveDocs); // compact old-to-new mapping over live docs
    } else {
      delDocMap = null; // no deletes: the mapping is the identity plus the doc base
    }
    final int docBase = totalDocs;
    docMaps[i] = new DocMap() {
      @Override
      public int get(int docID) {
        if (liveDocs == null) {
          return docBase + docID;
        } else if (liveDocs.get(docID)) {
          return docBase + (int) delDocMap.get(docID);
        } else {
          return -1; // deleted docs have no position in the merged segment
        }
      }
    };
    totalDocs += reader.numDocs();
  }
  return docMaps;
}
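// Minimal sketch of what removeDeletes(...) above is expected to produce (an
// assumption for illustration, not the library implementation): a monotonic map
// from old docID to its position among the surviving docs; the DocMap above only
// ever reads entries for live documents.
private static PackedLongValues removeDeletes(int maxDoc, Bits liveDocs) {
  PackedLongValues.Builder builder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  int deletes = 0;
  for (int i = 0; i < maxDoc; i++) {
    builder.add(i - deletes); // new position; the value stored for a deleted doc is never read
    if (liveDocs.get(i) == false) {
      deletes++;
    }
  }
  return builder.build();
}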
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
int maxLen = 1000000;
int len = 0;
while (line != null) {
  len += line.length();
  Document document = new Document();
  document.add(new TextField(FIELD, line, Field.Store.NO));
  docs.add(document);
  if (len > maxLen) {
    writer.flush();
    try (IndexReader reader = DirectoryReader.open(directory)) {
      LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
      Terms terms = wrappedReader.terms(FIELD);
      TermsEnum termsEnum = terms.iterator();
      BytesRef bytesRef = termsEnum.next();
      int docsWThisField = wrappedReader.getDocCount(FIELD);
      while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
          // below the document-frequency floor: skip to the next term
          bytesRef = termsEnum.next();
          continue;
        }
        String t = bytesRef.utf8ToString();
        if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
          queue.insertWithOverflow(new TokenDFTF(t, df, tf));
        }
        bytesRef = termsEnum.next();
      }
      // ...
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
  Terms terms = context.reader().terms(fieldName);
  if (terms == null) {
    return null;
  }
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
    return null;
  }
  SimScorer scorer = function.scorer(fieldName, boost);
  PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
  return new Scorer(this) {
    @Override
    public int docID() {
      return postings.docID();
    }

    @Override
    public float score() throws IOException {
      return scorer.score(postings.docID(), postings.freq());
    }

    @Override
    public DocIdSetIterator iterator() {
      return postings;
    }
  };
}
DirectoryReader r = DirectoryReader.open(taxoDir);
try {
  final int size = r.numDocs();
  final OrdinalMap ordinalMap = map;
  ordinalMap.setSize(size);
  int base = 0;
  PostingsEnum docs = null;
  for (final LeafReaderContext ctx : r.leaves()) {
    final LeafReader ar = ctx.reader();
    final Terms terms = ar.terms(Consts.FULL);
    TermsEnum te = terms.iterator();
    while (te.next() != null) {
      FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
      final int ordinal = addCategory(cp);
      docs = te.postings(docs, PostingsEnum.NONE);
      ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
    }
    // the base advances once per leaf, after all its terms are mapped
    base += ar.maxDoc(); // no deletions, so we're ok
  }
  ordinalMap.addDone();
} finally {
  r.close();
}
try {
  PostingsEnum postingsEnum = null;
  for (LeafReaderContext ctx : reader.leaves()) {
    Terms terms = ctx.reader().terms(Consts.FULL);
    if (terms != null) { // cannot really happen, but be on the safe side
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        if (!cache.isFull()) {
          BytesRef t = termsEnum.term();
          FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(t.utf8ToString()));
          postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
          boolean res = cache.put(cp, postingsEnum.nextDoc() + ctx.docBase);
          assert !res : "entries should not have been evicted from the cache";
        }
      }
    }
  }
  // ...
/**
 * Read all terms from a field.
 *
 * @param field the field in the document to load terms from
 * @param directory any directory implementation
 * @return unique terms represented as UTF-8 strings
 * @throws IOException if the index cannot be read
 */
public static Set<String> readTerms(String field, Directory directory) throws IOException {
  try (DirectoryReader reader = DirectoryReader.open(directory)) {
    Set<String> termStrings = new TreeSet<>();
    for (LeafReaderContext atomicReaderContext : reader.leaves()) {
      LeafReader atomicReader = atomicReaderContext.reader();
      Terms terms = atomicReader.terms(field);
      if (terms == null) {
        // the field is absent from this segment; calling iterator() would NPE
        continue;
      }
      TermsEnum iterator = terms.iterator();
      BytesRef next = iterator.next();
      while (next != null) {
        termStrings.add(next.utf8ToString());
        next = iterator.next();
      }
    }
    return termStrings;
  }
}
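// Usage sketch (the directory path and "title" field are hypothetical): the
// returned TreeSet is sorted and de-duplicated across all segments.
try (Directory dir = FSDirectory.open(Paths.get("/tmp/test-index"))) {
  Set<String> titles = readTerms("title", dir);
  titles.forEach(System.out::println);
}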
for ( LeafReaderContext readerContext : indexReader.leaves() )
{
    for ( String fieldName : fieldNames )
    {
        Terms terms = readerContext.reader().terms( fieldName );
        if ( terms != null )
        {
            TermsEnum termsEnum = terms.iterator();
            while ( (termsRef = termsEnum.next()) != null )
            {
                sampler.include( termsRef.utf8ToString(), termsEnum.docFreq() );
            }
        }
        checkCancellation();
    }
}
return sampler.result( indexReader.numDocs() );
private LeafReader build(IndexWriter writer) throws IOException {
  for (InputDocument doc : documents) {
    writer.addDocument(doc.getDocument());
  }
  writer.commit();
  writer.forceMerge(1);
  // after forceMerge(1) the index has exactly one leaf
  LeafReader reader = DirectoryReader.open(directory).leaves().get(0).reader();
  assert reader != null;
  docIds = new String[reader.maxDoc()];
  for (int i = 0; i < docIds.length; i++) {
    docIds[i] = reader.document(i).get(InputDocument.ID_FIELD); // TODO can this be more efficient?
  }
  return reader;
}
@Override
DocIdSet processLeaf(Query query, CompositeValuesCollectorQueue queue,
                     LeafReaderContext context, boolean fillDocIdSet) throws IOException {
  final Terms terms = context.reader().terms(field);
  if (terms == null) {
    // the field is not indexed in this segment
    return DocIdSet.EMPTY;
  }
  final TermsEnum te = terms.iterator();
  if (lowerValue != null) {
    if (te.seekCeil(lowerValue) == TermsEnum.SeekStatus.END) {
      return DocIdSet.EMPTY;
    }
  } else {
    if (te.next() == null) {
      return DocIdSet.EMPTY;
    }
  }
  DocIdSetBuilder builder = fillDocIdSet ? new DocIdSetBuilder(context.reader().maxDoc(), terms) : null;
  PostingsEnum reuse = null;
  boolean first = true;
  final BytesRef upper = upperValue == null ? null : BytesRef.deepCopyOf(upperValue);
  do {
    if (upper != null && upper.compareTo(te.term()) < 0) {
      break;
    }
    // ...
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }
    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }
    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }
      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
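// Usage sketch (the "net" prefix is hypothetical): expand a prefix against an open
// reader, e.g. to feed the collected terms into one position of a MultiPhraseQuery.
ObjectHashSet<Term> expansions = new ObjectHashSet<>();
getPrefixTerms(expansions, new Term(field, "net"), reader);
// `expansions` now holds at most maxExpansions terms starting with "net"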
Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());
if (terms == null) {
    return;
}
TermsEnum termsEnum = terms.iterator();
BytesRef term = termsEnum.next();
while (term != null) {
    if (Thread.currentThread().isInterrupted()) {
        break; // abort the scan when the calling thread is interrupted
    }
    // NONE is enough here since only the term text feeds the result queue;
    // request POSITIONS | FREQS instead when scoring needs term statistics
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
    term = termsEnum.next();
}
if (reader.hasDeletions()) {
  final List<LeafReaderContext> leaves = reader.leaves();
  final int size = leaves.size();
  assert size > 0 : "A reader with deletions must have at least one leaf";
  if (size == 1) {
    return leaves.get(0).reader().getLiveDocs();
  }
  final Bits[] liveDocs = new Bits[size];
  final int[] starts = new int[size + 1];
  for (int i = 0; i < size; i++) {
    // record all liveDocs, even if they are null
    final LeafReaderContext ctx = leaves.get(i);
    liveDocs[i] = ctx.reader().getLiveDocs();
    starts[i] = ctx.docBase;
  }
  starts[size] = reader.maxDoc();
  return new MultiBits(liveDocs, starts, true);
} else {
  return null; // no deletions: every document is live
}
final List<LeafReaderContext> leaves = r.leaves();
final int size = leaves.size();
if (size == 0) {
  return null;
} else if (size == 1) {
  return leaves.get(0).reader().getSortedSetDocValues(field);
}
boolean anyReal = false;
final SortedSetDocValues[] values = new SortedSetDocValues[size];
final int[] starts = new int[size + 1];
long totalCost = 0;
for (int i = 0; i < size; i++) {
  LeafReaderContext context = leaves.get(i);
  SortedSetDocValues v = context.reader().getSortedSetDocValues(field);
  if (v == null) {
    v = DocValues.emptySortedSet();
  } else {
    anyReal = true;
    totalCost += v.cost();
  }
  values[i] = v;
  starts[i] = context.docBase;
}
starts[size] = r.maxDoc();
if (anyReal == false) {
  return null;
}
IndexReader.CacheHelper cacheHelper = r.getReaderCacheHelper();
IndexReader.CacheKey owner = cacheHelper == null ? null : cacheHelper.getKey();
OrdinalMap mapping = OrdinalMap.build(owner, values, PackedInts.DEFAULT);
return new MultiSortedSetDocValues(values, starts, mapping, totalCost);
/** Returns the total number of occurrences of <code>term</code> across all
 * documents (the sum of the freq() for each document that contains the term).
 * This method returns 0 if the term or field does not exist. It does not take
 * into account deleted documents that have not yet been merged away. */
@Override
public final long totalTermFreq(Term term) throws IOException {
  final Terms terms = terms(term.field());
  if (terms == null) {
    return 0;
  }
  final TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(term.bytes())) {
    return termsEnum.totalTermFreq();
  } else {
    return 0;
  }
}
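// Usage sketch (reader, field, and term are hypothetical): totalTermFreq counts
// every occurrence while docFreq counts documents, so ttf >= df for any existing term.
Term t = new Term("body", "lucene");
long ttf = reader.totalTermFreq(t);
int df = reader.docFreq(t);
System.out.println(t + ": " + ttf + " occurrences in " + df + " docs");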
final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
  IndexReaderContext topReaderContext = reader.getContext();
  for (LeafReaderContext context : topReaderContext.leaves()) {
    final Terms terms = context.reader().terms(query.field);
    if (terms == null) {
      // field does not exist
      continue;
    }
    final TermsEnum termsEnum = getTermsEnum(query, terms, collector.attributes);
    assert termsEnum != null;
    if (termsEnum == TermsEnum.EMPTY) continue;
    collector.setReaderContext(topReaderContext, context);
    collector.setNextEnum(termsEnum);
    BytesRef bytes;
    while ((bytes = termsEnum.next()) != null) {
      if (!collector.collect(bytes)) {
        return; // interrupt whole term collection, so also don't iterate other subReaders
      }
    }
  }
}