/**
 * Returns {@link PostingsEnum} for the specified term.
 * This will return null if either the field or term does not exist.
 * <p><b>NOTE:</b> The returned {@link PostingsEnum} may contain deleted docs.
 * @see TermsEnum#postings(PostingsEnum)
 */
public final PostingsEnum postings(Term term, int flags) throws IOException {
  assert term.field() != null;
  assert term.bytes() != null;
  final Terms terms = terms(term.field());
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
      return termsEnum.postings(null, flags);
    }
  }
  return null;
}
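A short usage sketch, since callers must handle the null case: the reader, field, and term below are illustrative assumptions, not part of the snippet above.

// Sketch: iterate one term's postings on a leaf reader.
// "leafReader" is assumed to be an open LeafReader; "body"/"lucene" are example values.
PostingsEnum pe = leafReader.postings(new Term("body", "lucene"), PostingsEnum.FREQS);
if (pe != null) { // null: the field or the term does not exist
  for (int doc = pe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = pe.nextDoc()) {
    System.out.println("doc=" + doc + " freq=" + pe.freq());
  }
}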
/**
 * Expert: returns additional information about this Terms instance
 * for debugging purposes.
 */
public Object getStats() throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append("impl=" + getClass().getSimpleName());
  sb.append(",size=" + size());
  sb.append(",docCount=" + getDocCount());
  sb.append(",sumTotalTermFreq=" + getSumTotalTermFreq());
  sb.append(",sumDocFreq=" + getSumDocFreq());
  return sb.toString();
}
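The returned object is meant only for toString-style diagnostics. A minimal sketch of dumping it per segment (the reader and field name are assumptions):

// Sketch: print term dictionary stats for every segment of an index.
for (LeafReaderContext ctx : reader.leaves()) {
  Terms t = ctx.reader().terms("body"); // example field
  if (t != null) {
    System.out.println("segment " + ctx.ord + ": " + t.getStats());
  }
}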
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
  TermsEnum te = getTermsEnum(context);
  if (te == null) {
    return null;
  }
  if (context.reader().terms(term.field()).hasPositions() == false) {
    return super.matches(context, doc);
  }
  return MatchesUtils.forField(term.field(), () -> {
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    if (pe.advance(doc) != doc) {
      return null;
    }
    return new TermMatchesIterator(getQuery(), pe);
  });
}
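From the caller's side, the Matches API is reached through Weight.matches. A hedged sketch against the Lucene 7.4-era signatures; the searcher, query, and leaf-relative docID are assumptions:

// Sketch: ask a query which positions matched in one document.
Query rewritten = searcher.rewrite(query);
Weight weight = searcher.createWeight(rewritten, false, 1f); // scores not needed
LeafReaderContext ctx = searcher.getIndexReader().leaves().get(0);
Matches matches = weight.matches(ctx, docID); // docID relative to this leaf
if (matches != null) {
  MatchesIterator it = matches.getMatches("body"); // example field; may be null
  while (it != null && it.next()) {
    System.out.println("match at positions " + it.startPosition() + "-" + it.endPosition());
  }
}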
/**
 * Create a {@link DocIdSetBuilder} instance that is optimized for
 * accumulating docs that match the given {@link Terms}.
 */
public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
  this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
}
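The doc count and sum of doc freqs serve as a cost hint for choosing a sparse or dense representation up front. A minimal sketch of the intended accumulation pattern (the leaf reader and field name are assumptions):

// Sketch: collect every document that has at least one term in a field.
Terms terms = leafReader.terms("category"); // example field
if (terms != null) {
  DocIdSetBuilder builder = new DocIdSetBuilder(leafReader.maxDoc(), terms);
  TermsEnum te = terms.iterator();
  PostingsEnum reuse = null;
  while (te.next() != null) {
    reuse = te.postings(reuse, PostingsEnum.NONE);
    builder.add(reuse);
  }
  DocIdSet docs = builder.build();
}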
/**
 * Sole constructor.
 *
 * @param subs The {@link Terms} instances of all sub-readers.
 * @param subSlices A parallel array (matching {@code subs}) describing the sub-reader slices.
 */
public MultiTerms(Terms[] subs, ReaderSlice[] subSlices) throws IOException {
  this.subs = subs;
  this.subSlices = subSlices;
  assert subs.length > 0 : "inefficient: don't use MultiTerms over one sub";

  boolean _hasFreqs = true;
  boolean _hasOffsets = true;
  boolean _hasPositions = true;
  boolean _hasPayloads = false;
  for (int i = 0; i < subs.length; i++) {
    _hasFreqs &= subs[i].hasFreqs();
    _hasOffsets &= subs[i].hasOffsets();
    _hasPositions &= subs[i].hasPositions();
    _hasPayloads |= subs[i].hasPayloads();
  }

  hasFreqs = _hasFreqs;
  hasOffsets = _hasOffsets;
  hasPositions = _hasPositions;
  hasPayloads = hasPositions && _hasPayloads; // if all subs have positions, and at least one has payloads
}
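Direct construction is rare outside index internals; with this (pre-8.0) API era a merged view is more commonly obtained through MultiFields, as in this sketch (the reader and field name are assumptions):

// Sketch: merged on-the-fly Terms view over all segments of a composite reader.
Terms merged = MultiFields.getTerms(reader, "body"); // example field
if (merged != null) {
  System.out.println("docCount=" + merged.getDocCount() + " size=" + merged.size());
}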
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
  // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }
      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
public void termsList(String field) throws IOException {
  // again, we'll just look at the first segment. Terms dictionaries
  // for different segments may well be different, as they depend on
  // the individual documents that have been added.
  LeafReader leafReader = reader.leaves().get(0).reader();
  Terms terms = leafReader.terms(field);
  if (terms == null) {
    return; // the field does not exist in this segment
  }

  // The Terms object gives us some stats for this field within the segment
  System.out.println("Number of docs with this field: " + terms.getDocCount());

  TermsEnum te = terms.iterator();
  BytesRef term;
  while ((term = te.next()) != null) {
    System.out.println(term.utf8ToString() + " DF: " + te.docFreq() + " CF: " + te.totalTermFreq());
  }
}
private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
  final Terms terms = context.reader().terms(query.field);
  if (terms == null) {
    // field does not exist
    return new WeightOrDocIdSet((DocIdSet) null);
  }

  final TermsEnum termsEnum = query.getTermsEnum(terms);
  assert termsEnum != null;

  PostingsEnum docs = null;
  final List<TermAndState> collectedTerms = new ArrayList<>();
  if (collectTerms(context, termsEnum, collectedTerms)) {
    // few enough terms: rewrite as a disjunction of term queries
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    for (TermAndState t : collectedTerms) {
      final TermContext termContext = new TermContext(searcher.getTopReaderContext());
      termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
      bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
    }
    Query q = new ConstantScoreQuery(bq.build());
    return new WeightOrDocIdSet(searcher.rewrite(q).createWeight(searcher, needsScores));
  }

  // too many terms: go back over the terms we already collected and start building a doc id set
  DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
  if (collectedTerms.isEmpty() == false) {
    TermsEnum termsEnum2 = terms.iterator();
    for (TermAndState t : collectedTerms) {
      termsEnum2.seekExact(t.term, t.state);
      docs = termsEnum2.postings(docs, PostingsEnum.NONE);
      builder.add(docs);
    }
  }

  // then keep filling the doc id set with the remaining terms
  do {
    docs = termsEnum.postings(docs, PostingsEnum.NONE);
    builder.add(docs);
  } while (termsEnum.next() != null);

  return new WeightOrDocIdSet(builder.build());
}
public void listTokens(int freq) throws IOException {
  IndexReader ireader = null;
  try {
    ireader = DirectoryReader.open(indexDirectory);
    if (ireader.numDocs() > 0) {
      Fields uFields = MultiFields.getFields(ireader);
      Terms terms = uFields.terms(QueryBuilder.DEFS);
      if (terms != null) {
        TermsEnum iter = terms.iterator();
        BytesRef term;
        // the enum must be positioned with next() before term() may be read
        while ((term = iter.next()) != null) {
          if (iter.docFreq() > 16 && term.utf8ToString().length() > freq) {
            LOGGER.warning(term.utf8ToString());
          }
        }
      }
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
final boolean hasFreqs = terms.hasFreqs();
final boolean hasPositions = terms.hasPositions();
final boolean hasPayloads = terms.hasPayloads();
final boolean hasOffsets = terms.hasOffsets();

BytesRef minTerm;
BytesRef maxTerm;
if (isVectors) {
  // term vectors impls can be very slow for getMin/getMax
  minTerm = null;
  maxTerm = null;
} else {
  BytesRef bb = terms.getMin();
  if (bb != null) {
    assert bb.isValid();
    minTerm = BytesRef.deepCopyOf(bb);
  } else {
    minTerm = null;
  }
  bb = terms.getMax();
  if (bb != null) {
    assert bb.isValid();
    maxTerm = BytesRef.deepCopyOf(bb);
  } else {
    maxTerm = null;
  }
}

if (!hasFreqs && terms.getSumTotalTermFreq() != -1) {
  throw new RuntimeException("field \"" + field + "\" hasFreqs is false, but Terms.getSumTotalTermFreq()="
      + terms.getSumTotalTermFreq() + " (should be -1)");
}

final TermsEnum termsEnum = terms.iterator();

final Object stats = fieldTerms.getStats();
assert stats != null;
if (status.blockTreeStats == null) {
  status.blockTreeStats = new HashMap<>();
}

final long v = fields.terms(field).getSumTotalTermFreq();
if (v != -1 && sumTotalTermFreq != v) {
  throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v
      + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
}
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
TermsEnum termsEnum = terms.iterator();
TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
while (termsEnum.next() != null) {
  BytesRef termBytesRef = termsEnum.term();
  boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
  assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";

  Term term = new Term(fieldName, termBytesRef);
  if (isNoise(term.bytes().utf8ToString(), freq)) {
    continue;
  }
  // ... compute the per-term score here ...
  queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
}
int numDocs = reader.numDocs();
if (numDocs > 0) {
  uidIter = terms.iterator();
  TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); // init uid
  if (stat == TermsEnum.SeekStatus.END) {
    uidIter = null;
  }
}

while (uidIter != null && uidIter.term() != null
    && uidIter.term().utf8ToString().startsWith(startuid)) {
  // ... process the current uid and advance uidIter ...
}

reader.close();
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  // we build an artificial TermContext that will give an overall df and ttf
  // equal to 1
  TermContext context = new TermContext(reader.getContext());
  for (LeafReaderContext leafContext : reader.leaves()) {
    Terms terms = leafContext.reader().terms(term.field());
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      if (termsEnum.seekExact(term.bytes())) {
        // register() accumulates and context.docFreq() returns the running total,
        // so adding (1 - total) at each leaf keeps the overall df and ttf at 1
        int freq = 1 - context.docFreq();
        context.register(termsEnum.termState(), leafContext.ord, freq, freq);
      }
    }
  }
  return new TermQuery(term, context);
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
final boolean hasPayloads = terms.hasPayloads();
assert !hasPayloads || hasPositions;

int numTerms = (int) terms.size();
if (numTerms == -1) {
  // Terms.size() is an optional statistic, so count the terms by iterating
  numTerms = 0;
  termsEnum = terms.iterator();
  while (termsEnum.next() != null) {
    numTerms++;
  }
}

termsEnum = terms.iterator();
int termCount = 0;
while (termsEnum.next() != null) {
  termCount++;
  final int freq = (int) termsEnum.totalTermFreq();
  startTerm(termsEnum.term(), freq);

  if (hasPositions || hasOffsets) {
    docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
    assert docsAndPositionsEnum != null;
    final int docID = docsAndPositionsEnum.nextDoc();
    assert docID != DocIdSetIterator.NO_MORE_DOCS;
    assert docsAndPositionsEnum.freq() == freq;
    // ... write this term's positions/offsets/payloads ...
  }
}
TermsEnum topLevelIterator = topLevelTerms.iterator();
boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();

long termsSize = fieldTermVector.size();
if (hasScores) {
  termsSize = Math.min(termsSize, termVectorsFilter.size(field));
}

TermsEnum iterator = fieldTermVector.iterator();
final boolean useDocsAndPos = positions || offsets || payloads;
while (iterator.next() != null) { // iterate all terms of the current field
  BytesRef termBytesRef = iterator.term();
  Term term = new Term(field, termBytesRef);

  if (dfs != null) {
    // pre-computed distributed frequencies are available for this term
    TermStatistics statistics = dfs.termStatistics().get(term);
    writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
  } else {
    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
    if (foundTerm) {
      writeTermStatistics(topLevelIterator);
    }
    // ... otherwise write empty term statistics ...
  }
  // ... write term frequency and, if requested, positions/offsets/payloads ...
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
  String desc = "weight(" + getQuery() + " in " + doc + ") [" + function + "]";

  Terms terms = context.reader().terms(fieldName);
  if (terms == null) {
    return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist.");
  }
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
    return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist.");
  }

  PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
  if (postings.advance(doc) != doc) {
    return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set.");
  }

  return function.explain(fieldName, featureName, boost, doc, postings.freq());
}
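This explain appears to belong to Lucene's FeatureField machinery, which stores per-document feature values as term frequencies. A hedged end-to-end sketch; the writer, field name, and feature name are assumptions:

// Sketch: index a static score signal, then boost queries by it.
Document doc = new Document();
doc.add(new FeatureField("features", "pagerank", 3.5f)); // value stored as a term freq
writer.addDocument(doc);

// at search time: apply a saturation function over the stored feature value
Query boost = FeatureField.newSaturationQuery("features", "pagerank");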
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
  Terms terms = context.reader().terms(field);
  if (terms != null) {
    TermsEnum segmentTermsEnum = terms.iterator();
    BytesRef spare = new BytesRef();
    PostingsEnum postingsEnum = null;
    for (int i = 0; i < TermsIncludingScoreQuery.this.terms.size(); i++) {
      if (segmentTermsEnum.seekExact(TermsIncludingScoreQuery.this.terms.get(ords[i], spare))) {
        postingsEnum = segmentTermsEnum.postings(postingsEnum, PostingsEnum.NONE);
        if (postingsEnum.advance(doc) == doc) {
          final float score = TermsIncludingScoreQuery.this.scores[ords[i]];
          return Explanation.match(score, "Score based on join value " + segmentTermsEnum.term().utf8ToString());
        }
      }
    }
  }
  return Explanation.noMatch("Not a match");
}
/**
 * Creates a {@link TermContext} from a top-level {@link IndexReaderContext} and the
 * given {@link Term}. This method will lookup the given term in all context's leaf readers
 * and register each of the readers containing the term in the returned {@link TermContext}
 * using the leaf reader's ordinal.
 * <p>
 * Note: the given context must be a top-level context.
 */
public static TermContext build(IndexReaderContext context, Term term) throws IOException {
  assert context != null && context.isTopLevel;
  final String field = term.field();
  final BytesRef bytes = term.bytes();
  final TermContext perReaderTermState = new TermContext(context);
  for (final LeafReaderContext ctx : context.leaves()) {
    final Terms terms = ctx.reader().terms(field);
    if (terms != null) {
      final TermsEnum termsEnum = terms.iterator();
      if (termsEnum.seekExact(bytes)) {
        final TermState termState = termsEnum.termState();
        perReaderTermState.register(termState, ctx.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
      }
    }
  }
  return perReaderTermState;
}
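A usage sketch: building the TermContext once at the top level lets a TermQuery reuse the cached per-leaf TermStates instead of re-seeking the term in every segment (the reader, field, and term are assumptions):

// Sketch: pre-compute term statistics and hand them to a TermQuery.
Term term = new Term("body", "lucene"); // example term
TermContext termContext = TermContext.build(reader.getContext(), term);
Query query = new TermQuery(term, termContext);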
TermsEnum termsEnum = terms.iterator();

final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();

if (postingsTerms == null) {
  throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
}
TermsEnum postingsTermsEnum = postingsTerms.iterator();

final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
  postings = termsEnum.postings(postings, PostingsEnum.ALL);
  assert postings != null;

  final int doc = postings.nextDoc();
  final int tf = postings.freq();

  // ... advance the matching postings from the inverted index and compare each
  // position/offset stored in the vector against them ...
  if (postingsTerms.hasPositions()) {
    int postingsPos = postingsDocs.nextPosition();
    if (terms.hasPositions() && pos != postingsPos) {
      throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j
          + ": pos=" + pos + " differs from postings pos=" + postingsPos);
    }
  }
  if (startOffset != -1 && endOffset != -1 && postingsTerms.hasOffsets()) {
    // ... compare the vector's offsets against the postings' offsets ...
  }
}
@Override
DocIdSet processLeaf(Query query, CompositeValuesCollectorQueue queue,
                     LeafReaderContext context, boolean fillDocIdSet) throws IOException {
  final Terms terms = context.reader().terms(field);
  if (terms == null) {
    // this segment has no value for the field
    return DocIdSet.EMPTY;
  }

  final TermsEnum te = terms.iterator();
  if (lowerValue != null) {
    if (te.seekCeil(lowerValue) == TermsEnum.SeekStatus.END) {
      return DocIdSet.EMPTY;
    }
  } else {
    if (te.next() == null) {
      return DocIdSet.EMPTY;
    }
  }

  DocIdSetBuilder builder = fillDocIdSet ? new DocIdSetBuilder(context.reader().maxDoc(), terms) : null;
  PostingsEnum reuse = null;
  boolean first = true;
  final BytesRef upper = upperValue == null ? null : BytesRef.deepCopyOf(upperValue);
  do {
    if (upper != null && upper.compareTo(te.term()) < 0) {
      break;
    }
    reuse = te.postings(reuse, PostingsEnum.NONE);
    // ... collect the current term's docs into the builder ...
    first = false;
  } while (te.next() != null);

  return fillDocIdSet ? builder.build() : null;
}