@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
    Terms terms = context.reader().terms(fieldName);
    if (terms == null) {
        return null;
    }
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
        return null;
    }
    SimScorer scorer = function.scorer(fieldName, boost);
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    return new Scorer(this) {

        @Override
        public int docID() {
            return postings.docID();
        }

        @Override
        public float score() throws IOException {
            return scorer.score(postings.docID(), postings.freq());
        }

        @Override
        public DocIdSetIterator iterator() {
            return postings;
        }
    };
}
private IndexReaderContext build(CompositeReaderContext parent, IndexReader reader, int ord, int docBase) {
    if (reader instanceof LeafReader) {
        final LeafReader ar = (LeafReader) reader;
        final LeafReaderContext atomic = new LeafReaderContext(parent, ar, ord, docBase, leaves.size(), leafDocBase);
        leaves.add(atomic);
        leafDocBase += reader.maxDoc();
        return atomic;
    } else {
        final CompositeReader cr = (CompositeReader) reader;
        final List<? extends IndexReader> sequentialSubReaders = cr.getSequentialSubReaders();
        final List<IndexReaderContext> children = Arrays.asList(new IndexReaderContext[sequentialSubReaders.size()]);
        final CompositeReaderContext newParent;
        if (parent == null) {
            newParent = new CompositeReaderContext(cr, children, leaves);
        } else {
            newParent = new CompositeReaderContext(parent, cr, ord, docBase, children);
        }
        int newDocBase = 0;
        for (int i = 0, c = sequentialSubReaders.size(); i < c; i++) {
            final IndexReader r = sequentialSubReaders.get(i);
            children.set(i, build(newParent, r, i, newDocBase));
            newDocBase += r.maxDoc();
        }
        assert newDocBase == cr.maxDoc();
        return newParent;
    }
}
DirectoryReader r = DirectoryReader.open(taxoDir);
try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
        final LeafReader ar = ctx.reader();
        final Terms terms = ar.terms(Consts.FULL);
        TermsEnum te = terms.iterator();
        while (te.next() != null) {
            FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
            final int ordinal = addCategory(cp);
            docs = te.postings(docs, PostingsEnum.NONE);
            ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
        }
        base += ar.maxDoc(); // no deletions, so we're ok
    }
} finally {
    r.close();
}
private Set<String> getTerms(IndexReader ir) {
    Set<String> t = new HashSet<>();
    for (int i = 0; i < ir.leaves().size(); i++) {
        Terms termsList;
        try {
            // Get all the terms at this level of the tree.
            termsList = ir.leaves().get(i).reader().terms(Lucene4IRConstants.FIELD_ALL);
            if (termsList != null && termsList.size() > 0) {
                TermsEnum te = termsList.iterator();
                BytesRef termBytes;
                while ((termBytes = te.next()) != null) {
                    t.add(termBytes.utf8ToString());
                }
            }
            // Get all the terms at the next level of the tree.
            if (ir.leaves().get(i).children() != null && ir.leaves().get(i).children().size() > 0) {
                for (IndexReaderContext c : ir.leaves().get(i).children()) {
                    t.addAll(getTerms(c.reader()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return t;
}
private LeafReader build(IndexWriter writer) throws IOException {
    for (InputDocument doc : documents) {
        writer.addDocument(doc.getDocument());
    }
    writer.commit();
    writer.forceMerge(1);
    LeafReader reader = DirectoryReader.open(directory).leaves().get(0).reader();
    assert reader != null;
    docIds = new String[reader.maxDoc()];
    for (int i = 0; i < docIds.length; i++) {
        docIds[i] = reader.document(i).get(InputDocument.ID_FIELD); // TODO can this be more efficient?
    }
    return reader;
}
boolean buildit = false;
LeafReader subReader = context.reader();
List<LeafReaderContext> leaves = subReader.leaves();
if (leaves != null && !leaves.isEmpty()) {
    if (leaves.size() > 1 || leaves.get(0) != context) {
        Bits live = subReader.getLiveDocs();
        for (int i = 0; i < subReader.maxDoc(); i++) {
            if (live != null && !live.get(i)) continue;
            Document doc = subReader.document(i, fieldsToLoad);
            String[] with = doc.getValues(this._withField);
            if (with != null) {
                contexts = new HashSet<>();
                for (String w : with) {
                    contexts.add(new BytesRef(w));
                    String val = doc.get(aweight.getKey());
                    try {
/**
 * Read all terms from a field
 *
 * @param field the field in the document to load terms from
 * @param directory Any directory implementation
 * @return Unique terms represented as UTF-8
 * @throws IOException
 */
public static Set<String> readTerms(String field, Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        Set<String> termStrings = new TreeSet<>();
        for (LeafReaderContext atomicReaderContext : reader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            Terms terms = atomicReader.terms(field);
            if (terms == null) {
                // the field has no terms in this segment
                continue;
            }
            TermsEnum iterator = terms.iterator();
            BytesRef next = iterator.next();
            while (next != null) {
                termStrings.add(iterator.term().utf8ToString());
                next = iterator.next();
            }
        }
        return termStrings;
    }
}
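// A minimal usage sketch for the helper above, not from the original source:
// the index path and the field name "title" are hypothetical placeholders.
try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"))) {
    Set<String> titleTerms = readTerms("title", dir);
    for (String t : titleTerms) {
        System.out.println(t);
    }
}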
continue;
int readerId = ReaderUtil.subIndex(hit.docId(), context.searcher().getIndexReader().leaves());
LeafReaderContext subReaderContext = context.searcher().getIndexReader().leaves().get(readerId);
if (lastReaderId != readerId) {
    docValuesMap.clear();
    for (String field : parentFields) {
        docValuesMap.put(field, subReaderContext.reader().getSortedDocValues(field));
    }
}
if (values != null && values.advanceExact(docId)) {
    BytesRef binaryValue = values.binaryValue();
    String value = binaryValue.length > 0 ? binaryValue.utf8ToString() : null;
    if (value == null) {
/**
 * Merges the given taxonomy and index directories and commits the changes to
 * the given writers.
 */
public static void merge(Directory srcIndexDir, Directory srcTaxoDir, OrdinalMap map,
                         IndexWriter destIndexWriter, DirectoryTaxonomyWriter destTaxoWriter,
                         FacetsConfig srcConfig) throws IOException {
    // merge the taxonomies
    destTaxoWriter.addTaxonomy(srcTaxoDir, map);
    int[] ordinalMap = map.getMap();
    DirectoryReader reader = DirectoryReader.open(srcIndexDir);
    try {
        List<LeafReaderContext> leaves = reader.leaves();
        int numReaders = leaves.size();
        CodecReader[] wrappedLeaves = new CodecReader[numReaders];
        for (int i = 0; i < numReaders; i++) {
            wrappedLeaves[i] = SlowCodecReaderWrapper.wrap(
                new OrdinalMappingLeafReader(leaves.get(i).reader(), ordinalMap, srcConfig));
        }
        destIndexWriter.addIndexes(wrappedLeaves);
        // commit changes to taxonomy and index respectively.
        destTaxoWriter.commit();
        destIndexWriter.commit();
    } finally {
        reader.close();
    }
}
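// A usage sketch, not from the original source: the source/destination
// directories and the open destination writers are assumed to exist already.
// MemoryOrdinalMap records how source category ordinals are remapped into the
// destination taxonomy during addTaxonomy().
DirectoryTaxonomyWriter.OrdinalMap ordMap = new DirectoryTaxonomyWriter.MemoryOrdinalMap();
merge(srcIndexDir, srcTaxoDir, ordMap, destIndexWriter, destTaxoWriter, new FacetsConfig());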
List<LeafReaderContext> leaves = reader.leaves();
LeafReader atomicReader = ctx.reader();
FieldInfo finfo = atomicReader.getFieldInfos().fieldInfo(field);
if (finfo == null) continue;
if (docID >= atomicReader.maxDoc()) {
    continue;
}
Document storedData = atomicReader.document(docID, new HashSet<String>(Arrays.asList(field)));
String strData = storedData.get(field);
BytesRef bytesRef = storedData.getBinaryValue(field);
if (bytesRef != null) {
    out.println(bytesRef);
    BytesRef[] dataArray = storedData.getBinaryValues(field);
/**
 * Project a field as a String from a Lucene Document matching the provided term.
 * The method asserts that one match is found, and no more.
 */
private String projectSingleField(IndexReader reader, String fieldName, Term term) throws IOException {
    String projection = null;
    for ( LeafReaderContext leaf : reader.leaves() ) {
        final LeafReader atomicReader = leaf.reader();
        final DocsEnum termDocsEnum = atomicReader.termDocsEnum( term );
        while ( termDocsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS ) {
            // the enum yields segment-local docIDs; add the leaf's docBase before
            // loading the document through the top-level reader
            final int docID = leaf.docBase + termDocsEnum.docID();
            org.apache.lucene.document.Document document = reader.document( docID );
            String value = document.get( fieldName );
            Assert.assertNull( "duplicate matches found! This method assumes a single document will match the Term.", projection );
            projection = value;
        }
    }
    Assert.assertNotNull( projection );
    return projection;
}
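// Hypothetical call site (not from the original source): read the stored
// "title" of the single document whose "isbn" term matches; the reader, field
// names, and term value are placeholders.
String title = projectSingleField( reader, "title", new Term( "isbn", "978-0000000000" ) );
System.out.println( "matched title: " + title );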
for ( LeafReaderContext readerContext : indexReader.leaves() ) {
    for ( String fieldName : fieldNames ) {
        Terms terms = readerContext.reader().terms( fieldName );
        if ( terms != null ) {
            TermsEnum termsEnum = terms.iterator();
            BytesRef termsRef;
            while ( (termsRef = termsEnum.next()) != null ) {
                sampler.include( termsRef.utf8ToString(), termsEnum.docFreq() );
            }
        }
        checkCancellation();
    }
}
return sampler.result( indexReader.numDocs() );
Tuple<List<BytesRef>, Map<String, List<byte[]>>> extractTermsAndRanges(IndexReader indexReader) throws IOException {
    List<BytesRef> extractedTerms = new ArrayList<>();
    Map<String, List<byte[]>> encodedPointValuesByField = new HashMap<>();
    LeafReader reader = indexReader.leaves().get(0).reader();
    for (FieldInfo info : reader.getFieldInfos()) {
        Terms terms = reader.terms(info.name);
        if (terms != null) {
            BytesRef fieldBr = new BytesRef(info.name);
            TermsEnum tenum = terms.iterator();
            for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
                BytesRefBuilder builder = new BytesRefBuilder();
                builder.append(fieldBr);
                builder.append(FIELD_VALUE_SEPARATOR);
                builder.append(term);
                extractedTerms.add(builder.toBytesRef());
            }
        }
        if (info.getPointIndexDimensionCount() == 1) { // not != 0 because range fields are not supported
            PointValues values = reader.getPointValues(info.name);
            List<byte[]> encodedPointValues = new ArrayList<>();
            encodedPointValues.add(values.getMinPackedValue().clone());
            encodedPointValues.add(values.getMaxPackedValue().clone());
            encodedPointValuesByField.put(info.name, encodedPointValues);
        }
    }
    return new Tuple<>(extractedTerms, encodedPointValuesByField);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
    Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context);
    // we use termscorers + disjunction as an impl detail
    List<Scorer> subScorers = new ArrayList<>();
    for (int i = 0; i < terms.length; i++) {
        TermState state = termContexts[i].get(context.ord);
        if (state != null) {
            TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
            termsEnum.seekExact(terms[i].bytes(), state);
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
            subScorers.add(new TermScorer(this, postings, simScorer));
        }
    }
    if (subScorers.isEmpty()) {
        return null;
    } else if (subScorers.size() == 1) {
        // we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs
        return subScorers.get(0);
    } else {
        return new SynonymScorer(simScorer, this, subScorers);
    }
}
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }
        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }
        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }
            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
@Override
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
    assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) :
        "The top-reader used to create Weight is not the same as the current reader's top-reader ("
            + ReaderUtil.getTopLevelContext(context);
    final TermState state = termContext.get(context.ord);
    if (state == null) { // term is not present in that reader
        assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
        return null;
    }
    final Terms terms = context.reader().terms(term.field());
    if (terms == null) {
        return null;
    }
    if (terms.hasPositions() == false) {
        throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
    }
    final TermsEnum termsEnum = terms.iterator();
    termsEnum.seekExact(term.bytes(), state);
    final PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings());
    float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST;
    return new TermSpans(getSimScorer(context), postings, term, positionsCost);
}
public void termsList(String field) throws IOException {
    // again, we'll just look at the first segment. Terms dictionaries
    // for different segments may well be different, as they depend on
    // the individual documents that have been added.
    LeafReader leafReader = reader.leaves().get(0).reader();
    Terms terms = leafReader.terms(field);

    // The Terms object gives us some stats for this field within the segment
    System.out.println("Number of docs with this field:" + terms.getDocCount());

    TermsEnum te = terms.iterator();
    BytesRef term;
    while ((term = te.next()) != null) {
        System.out.println(term.utf8ToString() + " DF: " + te.docFreq() + " CF: " + te.totalTermFreq());
    }
}
if (reader.hasDeletions()) {
    final List<LeafReaderContext> leaves = reader.leaves();
    final int size = leaves.size();
    assert size > 0 : "A reader with deletions must have at least one leaf";
    if (size == 1) {
        return leaves.get(0).reader().getLiveDocs();
    }
    final Bits[] liveDocs = new Bits[size];
    final int[] starts = new int[size + 1];
    for (int i = 0; i < size; i++) {
        final LeafReaderContext ctx = leaves.get(i);
        liveDocs[i] = ctx.reader().getLiveDocs();
        starts[i] = ctx.docBase;
    }
    starts[size] = reader.maxDoc();
    return new MultiBits(liveDocs, starts, true);
} else {
    return null;
}
/** This method may return null if the field does not exist or if it has no terms. */
public static Terms getTerms(IndexReader r, String field) throws IOException {
    final List<LeafReaderContext> leaves = r.leaves();
    if (leaves.size() == 1) {
        return leaves.get(0).reader().terms(field);
    }
    final List<Terms> termsPerLeaf = new ArrayList<>(leaves.size());
    final List<ReaderSlice> slicePerLeaf = new ArrayList<>(leaves.size());
    for (int leafIdx = 0; leafIdx < leaves.size(); leafIdx++) {
        LeafReaderContext ctx = leaves.get(leafIdx);
        Terms subTerms = ctx.reader().terms(field);
        if (subTerms != null) {
            termsPerLeaf.add(subTerms);
            // the slice index must match the leaf's position; using leafIdx - 1 would be off by one
            slicePerLeaf.add(new ReaderSlice(ctx.docBase, r.maxDoc(), leafIdx));
        }
    }
    if (termsPerLeaf.size() == 0) {
        return null;
    } else {
        return new MultiTerms(termsPerLeaf.toArray(Terms.EMPTY_ARRAY), slicePerLeaf.toArray(ReaderSlice.EMPTY_ARRAY));
    }
}
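// A short usage sketch, not part of the original: `reader` and the field name
// "body" are assumed/hypothetical. The merged Terms view spans all segments.
Terms merged = getTerms(reader, "body");
if (merged != null) {
    TermsEnum te = merged.iterator();
    BytesRef term;
    while ((term = te.next()) != null) {
        System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
    }
}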
final List<LeafReaderContext> leaves = r.leaves();
final int size = leaves.size();

if (size == 0) {
    return null;
} else if (size == 1) {
    return leaves.get(0).reader().getSortedDocValues(field);
}

boolean anyReal = false;
final SortedDocValues[] values = new SortedDocValues[size];
final int[] starts = new int[size + 1];
for (int i = 0; i < size; i++) {
    LeafReaderContext context = leaves.get(i);
    SortedDocValues v = context.reader().getSortedDocValues(field);
    if (v == null) {
        v = DocValues.emptySorted();
    } else {
        anyReal = true;
    }
    values[i] = v;
    starts[i] = context.docBase;
}
starts[size] = r.maxDoc();

if (anyReal == false) {
    return null;
}
IndexReader.CacheHelper cacheHelper = r.getReaderCacheHelper();
IndexReader.CacheKey owner = cacheHelper == null ? null : cacheHelper.getKey();
OrdinalMap mapping = OrdinalMap.build(owner, values, PackedInts.DEFAULT);