Refine search
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); iter = terms.iterator(); // init uid iterator } while (iter != null && iter.term() != null) { //if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { LOGGER.warning(iter.term().utf8ToString()); } BytesRef next = iter.next(); if (next==null) {iter=null;} } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e); } } } }
/** Returns the total number of occurrences of <code>term</code> across
 * all documents (the sum of the term's frequency in each document that
 * contains it). This method returns 0 if the term or field does not
 * exist. This method does not take into account deleted documents that
 * have not yet been merged away. */
@Override
public final long totalTermFreq(Term term) throws IOException {
    final Terms terms = terms(term.field());
    if (terms == null) {
        return 0;
    }
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
        return termsEnum.totalTermFreq();
    } else {
        return 0;
    }
}
/** Returns {@link PostingsEnum} for the specified field and
 * term, with control over whether freqs are required.
 * Some codecs may be able to optimize their
 * implementation when freqs are not required. This will
 * return null if the field or term does not exist. See {@link
 * TermsEnum#postings(PostingsEnum,int)}.*/
public static PostingsEnum getTermDocsEnum(IndexReader r, String field, BytesRef term, int flags) throws IOException {
    assert field != null;
    assert term != null;
    final Terms terms = getTerms(r, field);
    if (terms == null) {
        return null;
    }
    final TermsEnum termsEnum = terms.iterator();
    // Only materialize postings once the term is confirmed to exist.
    return termsEnum.seekExact(term) ? termsEnum.postings(null, flags) : null;
}
/** Returns an expected cost in simple operations * of processing the occurrences of a term * in a document that contains the term. * This is for use by {@link TwoPhaseIterator#matchCost} implementations. * <br>This may be inaccurate when {@link TermsEnum#totalTermFreq()} is not available. * @param termsEnum The term is the term at which this TermsEnum is positioned. */ static float termPositionsCost(TermsEnum termsEnum) throws IOException { int docFreq = termsEnum.docFreq(); assert docFreq > 0; long totalTermFreq = termsEnum.totalTermFreq(); // -1 when not available float expOccurrencesInMatchingDoc = (totalTermFreq < docFreq) ? 1 : (totalTermFreq / (float) docFreq); return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS; }
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException { // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually. List<LeafReaderContext> leaves = reader.leaves(); for (LeafReaderContext leaf : leaves) { Terms _terms = leaf.reader().terms(field); if (_terms == null) { continue; } TermsEnum termsEnum = _terms.iterator(); TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes()); if (TermsEnum.SeekStatus.END == seekStatus) { continue; } for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) { if (!StringHelper.startsWith(term, prefix.bytes())) { break; } terms.add(new Term(field, BytesRef.deepCopyOf(term))); if (terms.size() >= maxExpansions) { return; } } } }
final BytesRef term = termsEnum.next(); if (term == null) { break; final int docFreq = termsEnum.docFreq(); if (docFreq <= 0) { throw new RuntimeException("docfreq: " + docFreq + " is out of bounds"); postings = termsEnum.postings(postings, PostingsEnum.ALL); if (termsEnum.totalTermFreq() != -1) { throw new RuntimeException("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.totalTermFreq() + " (should be -1)"); long ord = -1; try { ord = termsEnum.ord(); } catch (UnsupportedOperationException uoe) { hasOrd = false; final long totalTermFreq2 = termsEnum.totalTermFreq(); final boolean hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1; for(int idx=0;idx<7;idx++) { final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8); postings = termsEnum.postings(postings, PostingsEnum.ALL); for(int idx=0;idx<7;idx++) { final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8); postings = termsEnum.postings(postings, PostingsEnum.NONE);
int numDocs = reader.numDocs(); if (numDocs > 0) { uidIter = terms.iterator(); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); //init uid if (stat == TermsEnum.SeekStatus.END) { uidIter = null; while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { BytesRef next = uidIter.next(); if (next == null) { uidIter=null; reader.close();
final Terms terms = context.reader().terms(query.field); if (terms == null) { final TermContext termContext = new TermContext(searcher.getTopReaderContext()); termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq); bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD); DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms); if (collectedTerms.isEmpty() == false) { TermsEnum termsEnum2 = terms.iterator(); for (TermAndState t : collectedTerms) { termsEnum2.seekExact(t.term, t.state); docs = termsEnum2.postings(docs, PostingsEnum.NONE); builder.add(docs); docs = termsEnum.postings(docs, PostingsEnum.NONE); builder.add(docs); } while (termsEnum.next() != null);
final boolean hasPositions = terms.hasPositions(); final boolean hasOffsets = terms.hasOffsets(); final boolean hasPayloads = terms.hasPayloads(); assert !hasPayloads || hasPositions; termsEnum = terms.iterator(); while(termsEnum.next() != null) { numTerms++; while(termsEnum.next() != null) { termCount++; final int freq = (int) termsEnum.totalTermFreq(); startTerm(termsEnum.term(), freq); docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS); assert docsAndPositionsEnum != null; final int docID = docsAndPositionsEnum.nextDoc(); assert docID != DocIdSetIterator.NO_MORE_DOCS; assert docsAndPositionsEnum.freq() == freq; final int pos = docsAndPositionsEnum.nextPosition(); final int startOffset = docsAndPositionsEnum.startOffset(); final int endOffset = docsAndPositionsEnum.endOffset();
/**
 * Loads into the bucket every term in the index that begins with the given
 * term's bytes (prefix expansion), paired with that term's document frequency.
 *
 * @param reader Index reader to use.
 * @param bucket Receives each matching term together with its docFreq.
 * @param term   The prefix term (field plus leading bytes) to expand.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    // NOTE(review): passing term.bytes() as the startTerm positions the
    // intersect enumeration relative to that exact term — confirm whether the
    // exact prefix term itself is intentionally excluded from the results.
    TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        // deepCopyOf: the enum reuses its BytesRef, so a copy is required
        // before the bytes are stored in a Term.
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
    }
}
for ( LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves() ) Fields fields = leafReaderContext.reader().fields(); for ( String field : fields ) while ( (termsRef = terms.next()) != null ) if ( terms.docFreq() > 1 ) collector.init( terms.docFreq() ); searcher.search( new TermQuery( new Term( field, termsRef ) ), collector );
/**
 * Builds an in-memory term vector for the "content" field of the given stored
 * document and prints each term's text, docFreq, and totalTermFreq within the
 * single-document {@link MemoryIndex}.
 *
 * @param docid id (within {@code reader}) of the document to analyze
 * @throws IOException if the stored document cannot be loaded
 */
public void buildTermVector(int docid) throws IOException {
    Set<String> fieldList = new HashSet<>();
    fieldList.add("content");
    Document doc = reader.document(docid, fieldList);
    MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer());
    // try-with-resources: the transient reader was previously leaked.
    try (IndexReader mr = mi.createSearcher().getIndexReader()) {
        if (mr.leaves().isEmpty()) {
            return; // nothing was indexed for this document
        }
        Terms t = mr.leaves().get(0).reader().terms("content");
        if ((t != null) && (t.size() > 0)) {
            TermsEnum te = t.iterator();
            System.out.println(t.size());
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println("BytesRef: " + term.utf8ToString());
                System.out.println("docFreq: " + te.docFreq());
                System.out.println("totalTermFreq: " + te.totalTermFreq());
            }
        }
    }
}
/**
 * Verifies index uniqueness: for every field used for uniqueness
 * verification, any term that occurs in more than one document triggers a
 * duplicate search with that term's query.
 */
@Override
public void verify( NodePropertyAccessor accessor, int[] propKeyIds ) throws IndexEntryConflictException, IOException
{
    for ( String field : allFields() )
    {
        if ( !LuceneDocumentStructure.useFieldForUniquenessVerification( field ) )
        {
            continue;
        }
        TermsEnum terms = LuceneDocumentStructure.originalTerms( termsForField( field ), field );
        for ( BytesRef termsRef = terms.next(); termsRef != null; termsRef = terms.next() )
        {
            int frequency = terms.docFreq();
            if ( frequency > 1 )
            {
                TermQuery query = new TermQuery( new Term( field, termsRef ) );
                searchForDuplicates( query, accessor, propKeyIds, frequency );
            }
        }
    }
}
/**
 * Remove a stale file (uidIter.term().text()) from the index database and
 * history cache, and queue the removal of xref.
 *
 * @param removeHistory if false, do not remove history cache for this file
 * @throws java.io.IOException if an error occurs
 */
private void removeFile(boolean removeHistory) throws IOException {
    final String path = Util.uid2url(uidIter.term().utf8ToString());

    // Announce the pending removal before touching the index.
    for (IndexChangedListener listener : listeners) {
        listener.fileRemove(path);
    }

    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
    removeXrefFile(path);
    if (removeHistory) {
        removeHistoryFile(path);
    }
    setDirty();

    // Announce completion of the removal.
    for (IndexChangedListener listener : listeners) {
        listener.fileRemoved(path);
    }
}
/**
 * Explains this feature's score for one document. Produces a no-match
 * explanation when the field is missing, the feature term is absent, or no
 * posting exists for the document; otherwise delegates to the scoring
 * function with the posting's frequency.
 */
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
    final String desc = "weight(" + getQuery() + " in " + doc + ") [" + function + "]";
    final Terms terms = context.reader().terms(fieldName);
    if (terms == null) {
        return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist.");
    }
    final TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(featureName))) {
        return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist.");
    }
    final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    if (postings.advance(doc) != doc) {
        return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set.");
    }
    return function.explain(fieldName, featureName, boost, doc, postings.freq());
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException { // we build an artificial TermContext that will give an overall df and ttf // equal to 1 TermContext context = new TermContext(reader.getContext()); for (LeafReaderContext leafContext : reader.leaves()) { Terms terms = leafContext.reader().terms(term.field()); if (terms != null) { TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(term.bytes())) { int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1 context.register(termsEnum.termState(), leafContext.ord, freq, freq); } } } return new TermQuery(term, context); }
/**
 * Creates the {@link Spans} for this term over one leaf reader, using the
 * {@link TermState} precomputed in the top-level term context. Returns
 * {@code null} when the term is absent from this leaf. Throws
 * {@link IllegalStateException} when the field lacks position data, which
 * span queries require.
 */
@Override
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
    // The cached term context must have been built against this leaf's
    // top-level reader, or the cached term states would be invalid.
    assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
    final TermState state = termContext.get(context.ord);
    if (state == null) { // term is not present in that reader
        assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
        return null;
    }
    final Terms terms = context.reader().terms(term.field());
    if (terms == null)
        return null;
    if (terms.hasPositions() == false)
        throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
    final TermsEnum termsEnum = terms.iterator();
    // Seek using the cached state — avoids a second dictionary lookup.
    termsEnum.seekExact(term.bytes(), state);
    final PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings());
    float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST;
    return new TermSpans(getSimScorer(context), postings, term, positionsCost);
}
}
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j); TermsEnum postingsTermsEnum = postingsTerms.iterator(); final boolean hasProx = terms.hasOffsets() || terms.hasPositions(); BytesRef term = null; while ((term = termsEnum.next()) != null) { postings = termsEnum.postings(postings, PostingsEnum.ALL); assert postings != null; if (!postingsTermsEnum.seekExact(term)) { throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j); postingsDocs = postingsTermsEnum.postings(postingsDocs, PostingsEnum.ALL); assert postingsDocs != null; final int advanceDoc = postingsDocs.advance(j); if (advanceDoc != j) { throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")"); final int doc = postings.nextDoc(); final int tf = postings.freq(); if (postingsHasFreq && postingsDocs.freq() != tf) { throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs.freq());