/**
 * Removes entry for given key from this index.
 *
 * @param key Key.
 * @throws IgniteCheckedException If failed.
 */
public void remove(CacheObject key) throws IgniteCheckedException {
    try {
        writer.deleteDocuments(new Term(KEY_FIELD_NAME,
            new BytesRef(key.valueBytes(objectContext()))));
    }
    catch (IOException e) {
        throw new IgniteCheckedException(e);
    }
    finally {
        updateCntr.incrementAndGet();
    }
}
public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms;

    try {
        ireader = DirectoryReader.open(indexDirectory);
        int numDocs = ireader.numDocs();

        if (numDocs > 0) {
            Fields uFields = MultiFields.getFields(ireader);
            terms = uFields.terms(QueryBuilder.DEFS);
            iter = terms.iterator(); // init uid iterator
        }

        while (iter != null && iter.term() != null) {
            if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
                LOGGER.warning(iter.term().utf8ToString());
            }
            BytesRef next = iter.next();
            if (next == null) {
                iter = null;
            }
        }
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e);
            }
        }
    }
}
/**
 * Try to collect terms from the given terms enum and return true iff all
 * terms could be collected. If {@code false} is returned, the enum is
 * left positioned on the next term.
 */
private boolean collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
    final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
    for (int i = 0; i < threshold; ++i) {
        final BytesRef term = termsEnum.next();
        if (term == null) {
            return true;
        }
        TermState state = termsEnum.termState();
        terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
    }
    return termsEnum.next() == null;
}
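// A minimal, self-contained sketch of the same "collect terms up to a clause
// budget" idea using only public Lucene API (no TermAndState); the class name,
// field handling and budget parameter are illustrative assumptions, and
// BytesRef.deepCopyOf is used because a TermsEnum reuses its returned BytesRef.
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

final class TermBudgetCollector {
    /** Returns all terms of {@code field}, or null if there are more than {@code budget} terms. */
    static List<BytesRef> collectUpTo(LeafReader reader, String field, int budget) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return new ArrayList<>(); // field absent in this segment
        }
        TermsEnum te = terms.iterator();
        List<BytesRef> collected = new ArrayList<>();
        for (int i = 0; i < budget; i++) {
            BytesRef term = te.next();
            if (term == null) {
                return collected; // enum exhausted: everything fit within the budget
            }
            collected.add(BytesRef.deepCopyOf(term)); // copy: the enum reuses this instance
        }
        // One extra term means the field exceeds the budget; the enum is left on that term.
        return te.next() == null ? collected : null;
    }
}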
@SuppressWarnings("unused") static String brToString(BytesRef b) { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } }
// Excerpt of a seekExact(BytesRef) optimization over multiple sub-enums: if a
// previous seek already positioned a sub at or beyond the requested term, avoid
// re-seeking it. The three-way branching and the status guard are a
// reconstruction of the collapsed original statements.
if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
    seekOpt = true;
}

final BytesRef curTerm = currentSubs[i].current;
if (curTerm != null) {
    final int cmp = term.compareTo(curTerm);
    if (cmp == 0) {
        // Already positioned exactly on the requested term.
        status = true;
    } else if (cmp < 0) {
        // The requested term sorts before this sub's current term, so a
        // forward seek on this sub cannot find it.
        status = false;
    } else {
        status = currentSubs[i].terms.seekExact(term);
    }
} else {
    status = false;
}

if (status) {
    current = currentSubs[i].current = currentSubs[i].terms.term();
    assert term.equals(currentSubs[i].current);
}
public static int writeFeaturesToIndex(InputStream in, IndexWriter iw) throws IOException {
    int count = 0;
    GenericDoubleLireFeature f = new GenericDoubleLireFeature();
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String line;
    while ((line = br.readLine()) != null) {
        if (line.startsWith("#")) continue; // skip comment lines
        String[] split = line.split("\\s"); // split at white space ...
        String filename = split[0];
        double[] data = new double[split.length - 1];
        for (int i = 1; i < split.length; i++) {
            data[i - 1] = Double.parseDouble(split[i]);
        }
        f.setData(data);
        Document d = new Document();
        d.add(new StoredField(f.getFieldName(), new BytesRef(f.getByteArrayRepresentation())));
        d.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, filename, Field.Store.YES));
        iw.addDocument(d);
        count++;
    }
    iw.close();
    return count;
}
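// A possible call site for writeFeaturesToIndex(), sketched under assumptions:
// a local "features.txt" file in the format parsed above (filename followed by
// whitespace-separated doubles), a WhitespaceAnalyzer, and an index directory
// named "feature-index". None of these names come from the original code.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

import java.io.FileInputStream;
import java.nio.file.Paths;

public class FeatureIndexerDemo {
    public static void main(String[] args) throws Exception {
        IndexWriter iw = new IndexWriter(
                FSDirectory.open(Paths.get("feature-index")),
                new IndexWriterConfig(new WhitespaceAnalyzer()));
        try (FileInputStream in = new FileInputStream("features.txt")) {
            // writeFeaturesToIndex() is the method shown above (assumed accessible here);
            // note that it closes the writer itself.
            int indexed = writeFeaturesToIndex(in, iw);
            System.out.println("Indexed " + indexed + " feature vectors");
        }
    }
}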
while (line != null) {
    len += line.length();
    Document document = new Document();
    document.add(new TextField(FIELD, line, Field.Store.NO));
    docs.add(document);

    if (len > maxLen) {
        writer.flush();
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    // Below the document-frequency floor: skip this term.
                    bytesRef = termsEnum.next();
                    continue;
                }
                String t = bytesRef.utf8ToString();
                if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                    queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                }
                bytesRef = termsEnum.next();
            }
        }
    }
    // ... (remainder of the loop body, including reading the next line, is not part of this excerpt)
}
// Excerpt: build the Lucene document for a cache entry and the term that
// identifies it by its serialized key bytes.
Object val = v.isPlatformType() ? v.value(coctx, false) : v;

Document doc = new Document();

doc.add(new TextField(VAL_STR_FIELD_NAME, val.toString(), Field.Store.YES));
doc.add(new TextField(idxdFields[i], fieldVal.toString(), Field.Store.YES));

BytesRef keyByteRef = new BytesRef(k.valueBytes(coctx));
final Term term = new Term(KEY_FIELD_NAME, keyByteRef);
ft.freeze();

Document doc = new Document();
Field field = new Field("body", "", ft);
doc.add(field);

// Index one document per input surface form. (Loop structure reconstructed
// around the original statements; the surface forms are assumed to come from
// the suggester's input iterator.)
while (true) {
    BytesRef surfaceForm = iterator.next();
    if (surfaceForm == null) {
        break;
    }
    field.setStringValue(surfaceForm.utf8ToString());
    writer.addDocument(doc);
    count++;
}

reader = DirectoryReader.open(writer);

TermsEnum termsEnum = terms.iterator();

// Feed every indexed term, weighted by its total frequency, into the FST builder.
while (true) {
    BytesRef term = termsEnum.next();
    if (term == null) {
        break;
    }
    totTokens += termsEnum.totalTermFreq();
    builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}
DirectoryReader r = DirectoryReader.open(taxoDir);
try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
        final LeafReader ar = ctx.reader();
        final Terms terms = ar.terms(Consts.FULL);
        TermsEnum te = terms.iterator();
        while (te.next() != null) {
            FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
            final int ordinal = addCategory(cp);
            docs = te.postings(docs, PostingsEnum.NONE);
            ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
        }
        base += ar.maxDoc(); // no deletions, so we're ok
    }
} finally {
    r.close();
}
// Excerpt from a term-listing handler: resolve the explicitly requested terms
// and the requested term range, then walk each leaf's terms for the field. The
// surrounding structure, the optional start/end handling and the seekCeil
// positioning step are reconstructions based on the original statements.
List<BytesRef> termBytesList = new ArrayList<>();
for (String term : includeTerms) {
    BytesRef termBytes = new BytesRef(term);
    termBytesList.add(termBytes);
}

// Explicitly requested terms are looked up per leaf (the exact-lookup body is
// not part of this excerpt).
for (LeafReaderContext subReaderContext : directoryReader.leaves()) {
    Terms terms = subReaderContext.reader().terms(fieldName);
    // ...
}

// Range request: the start term defaults to "" and the end term is optional.
BytesRef startTermBytes = request.getStartTerm() != null
        ? new BytesRef(request.getStartTerm())
        : new BytesRef("");
BytesRef endTermBytes = request.getEndTerm() != null
        ? new BytesRef(request.getEndTerm())
        : null;

for (LeafReaderContext subReaderContext : directoryReader.leaves()) {
    Terms terms = subReaderContext.reader().terms(fieldName);
    if (terms == null) {
        continue;
    }
    TermsEnum termsEnum = terms.iterator();
    // Position at the first term at or after the requested start term.
    if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
        continue;
    }
    BytesRef text = termsEnum.term();
    if (endTermBytes == null || text.compareTo(endTermBytes) < 0) {
        handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
    }
    while ((text = termsEnum.next()) != null) {
        if (endTermBytes == null || text.compareTo(endTermBytes) < 0) {
            handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
        }
    }
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
    Terms terms = context.reader().terms(fieldName);
    if (terms == null) {
        return null;
    }
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
        return null;
    }
    SimScorer scorer = function.scorer(fieldName, boost);
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    return new Scorer(this) {

        @Override
        public int docID() {
            return postings.docID();
        }

        @Override
        public float score() throws IOException {
            return scorer.score(postings.docID(), postings.freq());
        }

        @Override
        public DocIdSetIterator iterator() {
            return postings;
        }
    };
}
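// A standalone sketch of the same lookup pattern used by the scorer above:
// seekExact to a single term and walk its postings with FREQS. The class name
// and the reader/field/term parameters are illustrative assumptions.
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

final class SingleTermPostings {
    /** Prints docID and term frequency for every document containing {@code term} in {@code field}. */
    static void dump(LeafReader reader, String field, String term) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return; // field not indexed in this segment
        }
        TermsEnum te = terms.iterator();
        if (!te.seekExact(new BytesRef(term))) {
            return; // term not present
        }
        PostingsEnum postings = te.postings(null, PostingsEnum.FREQS);
        for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
            System.out.println("doc=" + doc + " freq=" + postings.freq());
        }
    }
}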
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }
        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }
        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }
            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
reader = DirectoryReader.open(indexDirectory); // open existing index
settings = readAnalysisSettings();
if (settings == null) {
    // ... (handling of missing analysis settings is not part of this excerpt)
}

uidIter = terms.iterator();
TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); // init uid
if (stat == TermsEnum.SeekStatus.END) {
    uidIter = null;
}

while (uidIter != null && uidIter.term() != null
        && uidIter.term().utf8ToString().startsWith(startuid)) {
    // ... (per-file processing is not part of this excerpt)
}
try {
    PostingsEnum postingsEnum = null;
    for (LeafReaderContext ctx : reader.leaves()) {
        Terms terms = ctx.reader().terms(Consts.FULL);
        if (terms != null) { // cannot really happen, but be on the safe side
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                if (!cache.isFull()) {
                    BytesRef t = termsEnum.term();
                    FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(t.utf8ToString()));
                    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                    boolean res = cache.put(cp, postingsEnum.nextDoc() + ctx.docBase);
                    assert !res : "entries should not have been evicted from the cache";
                } else {
                    break; // cache is full; stop filling it
                }
            }
        }
    }
} finally {
    // ... (reader cleanup is not part of this excerpt)
}
// Excerpt: populate per-file metadata fields (uid, full path, modification date).
String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
path = Util.fixPathIfWindows(path);
doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date), string_ft_stored_nanalyzed_norms));
doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(), string_ft_nstored_nanalyzed_norms));
doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH, new BytesRef(file.getAbsolutePath())));
doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms));
doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date)));
/**
 * Read all terms from a field.
 *
 * @param field the field in the document to load terms from
 * @param directory any directory implementation
 * @return unique terms, decoded from their UTF-8 representation
 * @throws IOException if the index cannot be read
 */
public static Set<String> readTerms(String field, Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        Set<String> termStrings = new TreeSet<>();
        for (LeafReaderContext atomicReaderContext : reader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            Terms terms = atomicReader.terms(field);
            if (terms == null) {
                continue; // field not present in this segment
            }
            TermsEnum iterator = terms.iterator();
            BytesRef next = iterator.next();
            while (next != null) {
                termStrings.add(iterator.term().utf8ToString());
                next = iterator.next();
            }
        }
        return termStrings;
    }
}
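// A small usage sketch for readTerms(), assuming Lucene 5/6-era classes
// (RAMDirectory, StandardAnalyzer) and a hypothetical "contents" field; the
// class name and sample text are illustrative, not from the original code.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.util.Set;

public class ReadTermsDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("contents", "quick brown fox", Field.Store.NO));
            writer.addDocument(doc);
        }
        Set<String> terms = readTerms("contents", dir); // method shown above (assumed accessible here)
        System.out.println(terms); // expected: [brown, fox, quick]
    }
}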
// Excerpt from building an auto-suggest dictionary out of indexed documents;
// the surrounding method structure is reconstructed around the original statements.
boolean buildit = false;

LeafReader subReader = context.reader();
List<LeafReaderContext> leaves = subReader.leaves();
if (leaves != null && !leaves.isEmpty()) {
    if (leaves.size() > 1 || leaves.get(0) != context) {
        // ... (composite-reader handling is not part of this excerpt)
    }
}

// Collect suggestion contexts from the configured "with" field, if any.
Set<BytesRef> contexts = null;
String[] with = doc.getValues(this._withField);
if (with != null) {
    contexts = new HashSet<>();
    for (String w : with) {
        contexts.add(new BytesRef(w));
    }
}

// Resolve the weight value for this document (parsing `val` into the numeric
// weight is not part of this excerpt).
String val = doc.get(aweight.getKey());

BytesRef payload = serialized == null ? null : new BytesRef(serialized);

// Feed every value of every configured search field into the suggester.
for (String field : this._searchFields) {
    String[] texts = doc.getValues(field);
    if (texts != null) {
        for (String text : texts) {
            try {
                this.suggester.add(new BytesRef(text), contexts, weight, payload);
            } catch (Exception ex) {
                LOGGER.error("Failed to add text for field {} to autosuggest {}", field, this._name);
            }
        }
    }
}
/**
 * Remove a stale file (uidIter.term().text()) from the index database and
 * history cache, and queue the removal of xref.
 *
 * @param removeHistory if false, do not remove history cache for this file
 * @throws java.io.IOException if an error occurs
 */
private void removeFile(boolean removeHistory) throws IOException {
    String path = Util.uid2url(uidIter.term().utf8ToString());

    for (IndexChangedListener listener : listeners) {
        listener.fileRemove(path);
    }

    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));

    removeXrefFile(path);
    if (removeHistory) {
        removeHistoryFile(path);
    }

    setDirty();

    for (IndexChangedListener listener : listeners) {
        listener.fileRemoved(path);
    }
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
    final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
    if (termData.size() <= threshold) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        TermIterator iterator = termData.iterator();
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
            bq.add(new TermQuery(new Term(iterator.field(), BytesRef.deepCopyOf(term))), Occur.SHOULD);
        }
        return new ConstantScoreQuery(bq.build());
    }
    return super.rewrite(reader);
}