/**
 * Removes entry for given key from this index.
 *
 * @param key Key.
 * @throws IgniteCheckedException If failed.
 */
public void remove(CacheObject key) throws IgniteCheckedException {
    try {
        writer.deleteDocuments(new Term(KEY_FIELD_NAME,
            new BytesRef(key.valueBytes(objectContext()))));
    }
    catch (IOException e) {
        throw new IgniteCheckedException(e);
    }
    finally {
        updateCntr.incrementAndGet();
    }
}
public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms;

    try {
        ireader = DirectoryReader.open(indexDirectory);
        int numDocs = ireader.numDocs();

        if (numDocs > 0) {
            Fields uFields = MultiFields.getFields(ireader);
            terms = uFields.terms(QueryBuilder.DEFS);
            iter = terms.iterator(); // init uid iterator
        }

        while (iter != null && iter.term() != null) {
            if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
                LOGGER.warning(iter.term().utf8ToString());
            }
            BytesRef next = iter.next();
            if (next == null) {
                iter = null;
            }
        }
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e);
            }
        }
    }
}
/**
 * Try to collect terms from the given terms enum and return true iff all
 * terms could be collected. If {@code false} is returned, the enum is
 * left positioned on the next term.
 */
private boolean collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
    final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
    for (int i = 0; i < threshold; ++i) {
        final BytesRef term = termsEnum.next();
        if (term == null) {
            return true;
        }
        TermState state = termsEnum.termState();
        terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
    }
    return termsEnum.next() == null;
}
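// A minimal, self-contained sketch of the same "collect terms up to a clause
// budget" idea using only public Lucene API (no TermAndState); the class name,
// field handling and budget parameter are illustrative assumptions, and
// BytesRef.deepCopyOf is used because a TermsEnum reuses its returned BytesRef.
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

final class TermBudgetCollector {
    /** Returns all terms of {@code field}, or null if there are more than {@code budget} terms. */
    static List<BytesRef> collectUpTo(LeafReader reader, String field, int budget) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return new ArrayList<>(); // field absent in this segment
        }
        TermsEnum te = terms.iterator();
        List<BytesRef> collected = new ArrayList<>();
        for (int i = 0; i < budget; i++) {
            BytesRef term = te.next();
            if (term == null) {
                return collected; // enum exhausted: everything fit within the budget
            }
            collected.add(BytesRef.deepCopyOf(term)); // copy: the enum reuses this instance
        }
        // One extra term means the field exceeds the budget; the enum is left on that term.
        return te.next() == null ? collected : null;
    }
}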
@SuppressWarnings("unused") static String brToString(BytesRef b) { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } }
// Excerpt of a seekExact(BytesRef) optimization over multiple sub-enums: if a
// previous seek already positioned a sub at or beyond the requested term, avoid
// re-seeking it. The three-way branching and the status guard are a
// reconstruction of the collapsed original statements.
if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
    seekOpt = true;
}

final BytesRef curTerm = currentSubs[i].current;
if (curTerm != null) {
    final int cmp = term.compareTo(curTerm);
    if (cmp == 0) {
        // Already positioned exactly on the requested term.
        status = true;
    } else if (cmp < 0) {
        // The requested term sorts before this sub's current term, so a
        // forward seek on this sub cannot find it.
        status = false;
    } else {
        status = currentSubs[i].terms.seekExact(term);
    }
} else {
    status = false;
}

if (status) {
    current = currentSubs[i].current = currentSubs[i].terms.term();
    assert term.equals(currentSubs[i].current);
}
public static int writeFeaturesToIndex(InputStream in, IndexWriter iw) throws IOException {
    int count = 0;
    GenericDoubleLireFeature f = new GenericDoubleLireFeature();
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String line;
    while ((line = br.readLine()) != null) {
        if (line.startsWith("#")) continue; // skip comment lines
        String[] split = line.split("\\s"); // split at white space ...
        String filename = split[0];
        double[] data = new double[split.length - 1];
        for (int i = 1; i < split.length; i++) {
            data[i - 1] = Double.parseDouble(split[i]);
        }
        f.setData(data);
        Document d = new Document();
        d.add(new StoredField(f.getFieldName(), new BytesRef(f.getByteArrayRepresentation())));
        d.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, filename, Field.Store.YES));
        iw.addDocument(d);
        count++;
    }
    iw.close();
    return count;
}
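// A possible call site for writeFeaturesToIndex(), sketched under assumptions:
// a local "features.txt" file in the format parsed above (filename followed by
// whitespace-separated doubles), a WhitespaceAnalyzer, and an index directory
// named "feature-index". None of these names come from the original code.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

import java.io.FileInputStream;
import java.nio.file.Paths;

public class FeatureIndexerDemo {
    public static void main(String[] args) throws Exception {
        IndexWriter iw = new IndexWriter(
                FSDirectory.open(Paths.get("feature-index")),
                new IndexWriterConfig(new WhitespaceAnalyzer()));
        try (FileInputStream in = new FileInputStream("features.txt")) {
            // writeFeaturesToIndex() is the method shown above (assumed accessible here);
            // note that it closes the writer itself.
            int indexed = writeFeaturesToIndex(in, iw);
            System.out.println("Indexed " + indexed + " feature vectors");
        }
    }
}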
while (line != null) {
    len += line.length();
    Document document = new Document();
    document.add(new TextField(FIELD, line, Field.Store.NO));
    docs.add(document);

    if (len > maxLen) {
        writer.flush();
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    // Below the document-frequency floor: skip this term.
                    bytesRef = termsEnum.next();
                    continue;
                }
                String t = bytesRef.utf8ToString();
                if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                    queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                }
                bytesRef = termsEnum.next();
            }
        }
    }
    // ... (remainder of the loop body, including reading the next line, is not part of this excerpt)
}
// Excerpt: build the Lucene document for a cache entry and the term that
// identifies it by its serialized key bytes.
Object val = v.isPlatformType() ? v.value(coctx, false) : v;

Document doc = new Document();

doc.add(new TextField(VAL_STR_FIELD_NAME, val.toString(), Field.Store.YES));
doc.add(new TextField(idxdFields[i], fieldVal.toString(), Field.Store.YES));

BytesRef keyByteRef = new BytesRef(k.valueBytes(coctx));
final Term term = new Term(KEY_FIELD_NAME, keyByteRef);
ft.freeze();

Document doc = new Document();
Field field = new Field("body", "", ft);
doc.add(field);

// Index one document per input surface form. (Loop structure reconstructed
// around the original statements; the surface forms are assumed to come from
// the suggester's input iterator.)
while (true) {
    BytesRef surfaceForm = iterator.next();
    if (surfaceForm == null) {
        break;
    }
    field.setStringValue(surfaceForm.utf8ToString());
    writer.addDocument(doc);
    count++;
}

reader = DirectoryReader.open(writer);

TermsEnum termsEnum = terms.iterator();

// Feed every indexed term, weighted by its total frequency, into the FST builder.
while (true) {
    BytesRef term = termsEnum.next();
    if (term == null) {
        break;
    }
    totTokens += termsEnum.totalTermFreq();
    builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}
DirectoryReader r = DirectoryReader.open(taxoDir);
try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
        final LeafReader ar = ctx.reader();
        final Terms terms = ar.terms(Consts.FULL);
        TermsEnum te = terms.iterator();
        while (te.next() != null) {
            FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
            final int ordinal = addCategory(cp);
            docs = te.postings(docs, PostingsEnum.NONE);
            ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
        }
        base += ar.maxDoc(); // no deletions, so we're ok
    }
} finally {
    r.close();
}
// Excerpt from a term-listing handler: resolve the explicitly requested terms
// and the requested term range, then walk each leaf's terms for the field. The
// surrounding structure, the optional start/end handling and the seekCeil
// positioning step are reconstructions based on the original statements.
List<BytesRef> termBytesList = new ArrayList<>();
for (String term : includeTerms) {
    BytesRef termBytes = new BytesRef(term);
    termBytesList.add(termBytes);
}

// Explicitly requested terms are looked up per leaf (the exact-lookup body is
// not part of this excerpt).
for (LeafReaderContext subReaderContext : directoryReader.leaves()) {
    Terms terms = subReaderContext.reader().terms(fieldName);
    // ...
}

// Range request: the start term defaults to "" and the end term is optional.
BytesRef startTermBytes = request.getStartTerm() != null
        ? new BytesRef(request.getStartTerm())
        : new BytesRef("");
BytesRef endTermBytes = request.getEndTerm() != null
        ? new BytesRef(request.getEndTerm())
        : null;

for (LeafReaderContext subReaderContext : directoryReader.leaves()) {
    Terms terms = subReaderContext.reader().terms(fieldName);
    if (terms == null) {
        continue;
    }
    TermsEnum termsEnum = terms.iterator();
    // Position at the first term at or after the requested start term.
    if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
        continue;
    }
    BytesRef text = termsEnum.term();
    if (endTermBytes == null || text.compareTo(endTermBytes) < 0) {
        handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
    }
    while ((text = termsEnum.next()) != null) {
        if (endTermBytes == null || text.compareTo(endTermBytes) < 0) {
            handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
        }
    }
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
    Terms terms = context.reader().terms(fieldName);
    if (terms == null) {
        return null;
    }
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
        return null;
    }
    SimScorer scorer = function.scorer(fieldName, boost);
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    return new Scorer(this) {

        @Override
        public int docID() {
            return postings.docID();
        }

        @Override
        public float score() throws IOException {
            return scorer.score(postings.docID(), postings.freq());
        }

        @Override
        public DocIdSetIterator iterator() {
            return postings;
        }
    };
}
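// A standalone sketch of the same lookup pattern used by the scorer above:
// seekExact to a single term and walk its postings with FREQS. The class name
// and the reader/field/term parameters are illustrative assumptions.
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

final class SingleTermPostings {
    /** Prints docID and term frequency for every document containing {@code term} in {@code field}. */
    static void dump(LeafReader reader, String field, String term) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return; // field not indexed in this segment
        }
        TermsEnum te = terms.iterator();
        if (!te.seekExact(new BytesRef(term))) {
            return; // term not present
        }
        PostingsEnum postings = te.postings(null, PostingsEnum.FREQS);
        for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
            System.out.println("doc=" + doc + " freq=" + postings.freq());
        }
    }
}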
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }
        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }
        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }
            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
reader = DirectoryReader.open(indexDirectory); // open existing index
settings = readAnalysisSettings();
if (settings == null) {
    // ... (handling of missing analysis settings is not part of this excerpt)
}

uidIter = terms.iterator();
TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); // init uid
if (stat == TermsEnum.SeekStatus.END) {
    uidIter = null;
}

while (uidIter != null && uidIter.term() != null
        && uidIter.term().utf8ToString().startsWith(startuid)) {
    // ... (per-file processing is not part of this excerpt)
}
try {
    PostingsEnum postingsEnum = null;
    for (LeafReaderContext ctx : reader.leaves()) {
        Terms terms = ctx.reader().terms(Consts.FULL);
        if (terms != null) { // cannot really happen, but be on the safe side
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                if (!cache.isFull()) {
                    BytesRef t = termsEnum.term();
                    FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(t.utf8ToString()));
                    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                    boolean res = cache.put(cp, postingsEnum.nextDoc() + ctx.docBase);
                    assert !res : "entries should not have been evicted from the cache";
                } else {
                    break; // cache is full; stop filling it
                }
            }
        }
    }
} finally {
    // ... (reader cleanup is not part of this excerpt)
}
// Excerpt: populate per-file metadata fields (uid, full path, modification date).
String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
path = Util.fixPathIfWindows(path);
doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date), string_ft_stored_nanalyzed_norms));
doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(), string_ft_nstored_nanalyzed_norms));
doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH, new BytesRef(file.getAbsolutePath())));
doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms));
doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date)));
/**
 * Read all terms from a field.
 *
 * @param field the field in the document to load terms from
 * @param directory any directory implementation
 * @return unique terms, decoded from their UTF-8 representation
 * @throws IOException if the index cannot be read
 */
public static Set<String> readTerms(String field, Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        Set<String> termStrings = new TreeSet<>();
        for (LeafReaderContext atomicReaderContext : reader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            Terms terms = atomicReader.terms(field);
            if (terms == null) {
                continue; // field not present in this segment
            }
            TermsEnum iterator = terms.iterator();
            BytesRef next = iterator.next();
            while (next != null) {
                termStrings.add(iterator.term().utf8ToString());
                next = iterator.next();
            }
        }
        return termStrings;
    }
}
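// A small usage sketch for readTerms(), assuming Lucene 5/6-era classes
// (RAMDirectory, StandardAnalyzer) and a hypothetical "contents" field; the
// class name and sample text are illustrative, not from the original code.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.util.Set;

public class ReadTermsDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("contents", "quick brown fox", Field.Store.NO));
            writer.addDocument(doc);
        }
        Set<String> terms = readTerms("contents", dir); // method shown above (assumed accessible here)
        System.out.println(terms); // expected: [brown, fox, quick]
    }
}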
// Excerpt from building an auto-suggest dictionary out of indexed documents;
// the surrounding method structure is reconstructed around the original statements.
boolean buildit = false;

LeafReader subReader = context.reader();
List<LeafReaderContext> leaves = subReader.leaves();
if (leaves != null && !leaves.isEmpty()) {
    if (leaves.size() > 1 || leaves.get(0) != context) {
        // ... (composite-reader handling is not part of this excerpt)
    }
}

// Collect suggestion contexts from the configured "with" field, if any.
Set<BytesRef> contexts = null;
String[] with = doc.getValues(this._withField);
if (with != null) {
    contexts = new HashSet<>();
    for (String w : with) {
        contexts.add(new BytesRef(w));
    }
}

// Resolve the weight value for this document (parsing `val` into the numeric
// weight is not part of this excerpt).
String val = doc.get(aweight.getKey());

BytesRef payload = serialized == null ? null : new BytesRef(serialized);

// Feed every value of every configured search field into the suggester.
for (String field : this._searchFields) {
    String[] texts = doc.getValues(field);
    if (texts != null) {
        for (String text : texts) {
            try {
                this.suggester.add(new BytesRef(text), contexts, weight, payload);
            } catch (Exception ex) {
                LOGGER.error("Failed to add text for field {} to autosuggest {}", field, this._name);
            }
        }
    }
}
/**
 * Remove a stale file (uidIter.term().text()) from the index database and
 * history cache, and queue the removal of xref.
 *
 * @param removeHistory if false, do not remove history cache for this file
 * @throws java.io.IOException if an error occurs
 */
private void removeFile(boolean removeHistory) throws IOException {
    String path = Util.uid2url(uidIter.term().utf8ToString());

    for (IndexChangedListener listener : listeners) {
        listener.fileRemove(path);
    }

    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));

    removeXrefFile(path);
    if (removeHistory) {
        removeHistoryFile(path);
    }

    setDirty();

    for (IndexChangedListener listener : listeners) {
        listener.fileRemoved(path);
    }
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
    final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
    if (termData.size() <= threshold) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        TermIterator iterator = termData.iterator();
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
            bq.add(new TermQuery(new Term(iterator.field(), BytesRef.deepCopyOf(term))), Occur.SHOULD);
        }
        return new ConstantScoreQuery(bq.build());
    }
    return super.rewrite(reader);
}