public boolean isExact() { // There's an ambiguity when minhash.size() == maxHashes, since this could either // be an exact set with maxHashes elements, or an inexact one. Which is why strict // inequality is used here. return minhash.size() < maxHashes; }
/**
 * Estimates the in-memory footprint of this digest.
 *
 * <p>The estimate is the sum of the HyperLogLog's own estimate, one {@code SIZE_OF_ENTRY} per
 * min-hash entry, and the fixed {@code SIZE_OF_SETDIGEST} and {@code SIZE_OF_RBTREEMAP} overheads.
 *
 * @return the estimated size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedInMemorySize() {
    int hllSize = hll.estimatedInMemorySize();
    int entriesSize = minhash.size() * SIZE_OF_ENTRY;
    int fixedOverhead = SIZE_OF_SETDIGEST + SIZE_OF_RBTREEMAP;
    return hllSize + entriesSize + fixedOverhead;
}
/**
 * Estimates the size of the serialized form of this digest.
 *
 * <p>The terms are: one {@code SIZE_OF_BYTE}, one {@code SIZE_OF_INT}, the HyperLogLog's own
 * serialized-size estimate, two more {@code SIZE_OF_INT}, and one {@code SIZE_OF_ENTRY} per
 * min-hash entry.
 *
 * @return the estimated serialized size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedSerializedSize() {
    int header = SIZE_OF_BYTE + SIZE_OF_INT;
    int hllPayload = hll.estimatedSerializedSize();
    int minhashHeader = 2 * SIZE_OF_INT;
    int minhashPayload = minhash.size() * SIZE_OF_ENTRY;
    return header + hllPayload + minhashHeader + minhashPayload;
}
/**
 * Returns the number of distinct elements represented by this digest.
 *
 * @return {@code minhash.size()} when {@link #isExact()} is true; otherwise the value reported
 *         by {@code hll.cardinality()}
 */
public long cardinality() {
    // The int size is widened to long by the conditional expression.
    return isExact() ? minhash.size() : hll.cardinality();
}
/**
 * Estimates the Jaccard index of two digests from their min-hash maps.
 *
 * <p>The keys of both maps are combined into one sorted set, which is walked in order until as
 * many keys have been examined as the smaller of the two maps holds. The result is the number
 * of examined keys present in both maps divided by the size of the smaller map.
 *
 * <p>If the smaller map is empty the division is {@code 0 / 0.0}, so the result is {@code NaN}.
 *
 * @param a the first digest
 * @param b the second digest
 * @return the fraction of examined keys that occur in both digests
 */
public static double jaccardIndex(SetDigest a, SetDigest b) {
    int limit = Math.min(a.minhash.size(), b.minhash.size());

    LongSortedSet combinedKeys = new LongRBTreeSet(a.minhash.keySet());
    combinedKeys.addAll(b.minhash.keySet());

    int shared = 0;
    int examined = 0;
    for (long hash : combinedKeys) {
        boolean inBoth = a.minhash.containsKey(hash) && b.minhash.containsKey(hash);
        if (inBoth) {
            shared++;
        }
        // The limit is checked after the key is processed, so one key is always examined
        // when the combined set is non-empty.
        if (++examined >= limit) {
            break;
        }
    }
    return shared / (double) limit;
}
/**
 * Records one occurrence of {@code hash} in the min-hash map.
 *
 * <p>The counter stored for {@code hash} is incremented unless it has already reached
 * {@link Short#MAX_VALUE}. Afterwards the last key of the map is removed repeatedly until the
 * map holds at most {@code maxHashes} entries.
 *
 * @param hash the hash value to record
 */
private void addHash(long hash) {
    // For an absent key this yields the map's default return value (presumably 0 — confirm).
    short current = minhash.get(hash);
    boolean saturated = current >= Short.MAX_VALUE;
    if (!saturated) {
        short incremented = (short) (current + 1);
        minhash.put(hash, incremented);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}
/**
 * Merges {@code other} into this digest.
 *
 * <p>The HyperLogLog of {@code other} is merged into this one. Then, for every key in
 * {@code other}'s min-hash map, the counts from both maps are added and stored here after a
 * saturating cast to {@code short}. Finally the last key of the map is removed repeatedly until
 * the map holds at most {@code maxHashes} entries.
 *
 * @param other the digest whose contents are folded into this one
 */
public void mergeWith(SetDigest other) {
    hll.mergeWith(other.hll);

    for (LongBidirectionalIterator keys = other.minhash.keySet().iterator(); keys.hasNext(); ) {
        long hash = keys.nextLong();
        // Summed as int so the addition itself cannot overflow before the saturating cast.
        int combined = minhash.get(hash) + other.minhash.get(hash);
        short capped = Shorts.saturatedCast(combined);
        minhash.put(hash, capped);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}
/**
 * Serializes this digest into a {@code Slice}.
 *
 * <p>Written in order: the {@code UNCOMPRESSED_FORMAT} tag byte, the length of the serialized
 * HyperLogLog as an int, the serialized HyperLogLog bytes, {@code maxHashes} as an int, the
 * number of min-hash entries as an int, every key as a long, and then every value as a short.
 *
 * @return the serialized digest
 * @throws UncheckedIOException if the output raises an {@link IOException}
 */
public Slice serialize() {
    try (SliceOutput out = new DynamicSliceOutput(estimatedSerializedSize())) {
        out.appendByte(UNCOMPRESSED_FORMAT);

        // HyperLogLog section: length prefix followed by the payload.
        Slice hllBytes = hll.serialize();
        out.appendInt(hllBytes.length());
        out.appendBytes(hllBytes);

        // Min-hash section: capacity, entry count, all keys, then all values.
        out.appendInt(maxHashes);
        out.appendInt(minhash.size());
        for (long hash : minhash.keySet()) {
            out.appendLong(hash);
        }
        for (short count : minhash.values()) {
            out.appendShort(count);
        }

        return out.slice();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
public boolean isExact() { // There's an ambiguity when minhash.size() == maxHashes, since this could either // be an exact set with maxHashes elements, or an inexact one. Which is why strict // inequality is used here. return minhash.size() < maxHashes; }
public boolean isExact() { // There's an ambiguity when minhash.size() == maxHashes, since this could either // be an exact set with maxHashes elements, or an inexact one. Which is why strict // inequality is used here. return minhash.size() < maxHashes; }
/**
 * Estimates the size of the serialized form of this digest.
 *
 * <p>The terms are: one {@code SIZE_OF_BYTE}, one {@code SIZE_OF_INT}, the HyperLogLog's own
 * serialized-size estimate, two more {@code SIZE_OF_INT}, and one {@code SIZE_OF_ENTRY} per
 * min-hash entry.
 *
 * @return the estimated serialized size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedSerializedSize() {
    int header = SIZE_OF_BYTE + SIZE_OF_INT;
    int hllPayload = hll.estimatedSerializedSize();
    int minhashHeader = 2 * SIZE_OF_INT;
    int minhashPayload = minhash.size() * SIZE_OF_ENTRY;
    return header + hllPayload + minhashHeader + minhashPayload;
}
/**
 * Estimates the in-memory footprint of this digest.
 *
 * <p>The estimate is the sum of the HyperLogLog's own estimate, one {@code SIZE_OF_ENTRY} per
 * min-hash entry, and the fixed {@code SIZE_OF_SETDIGEST} and {@code SIZE_OF_RBTREEMAP} overheads.
 *
 * @return the estimated size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedInMemorySize() {
    int hllSize = hll.estimatedInMemorySize();
    int entriesSize = minhash.size() * SIZE_OF_ENTRY;
    int fixedOverhead = SIZE_OF_SETDIGEST + SIZE_OF_RBTREEMAP;
    return hllSize + entriesSize + fixedOverhead;
}
/**
 * Estimates the in-memory footprint of this digest.
 *
 * <p>The estimate is the sum of the HyperLogLog's own estimate, one {@code SIZE_OF_ENTRY} per
 * min-hash entry, and the fixed {@code SIZE_OF_SETDIGEST} and {@code SIZE_OF_RBTREEMAP} overheads.
 *
 * @return the estimated size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedInMemorySize() {
    int hllSize = hll.estimatedInMemorySize();
    int entriesSize = minhash.size() * SIZE_OF_ENTRY;
    int fixedOverhead = SIZE_OF_SETDIGEST + SIZE_OF_RBTREEMAP;
    return hllSize + entriesSize + fixedOverhead;
}
/**
 * Estimates the size of the serialized form of this digest.
 *
 * <p>The terms are: one {@code SIZE_OF_BYTE}, one {@code SIZE_OF_INT}, the HyperLogLog's own
 * serialized-size estimate, two more {@code SIZE_OF_INT}, and one {@code SIZE_OF_ENTRY} per
 * min-hash entry.
 *
 * @return the estimated serialized size (presumably in bytes; confirm against the SIZE_OF_* constants)
 */
public int estimatedSerializedSize() {
    int header = SIZE_OF_BYTE + SIZE_OF_INT;
    int hllPayload = hll.estimatedSerializedSize();
    int minhashHeader = 2 * SIZE_OF_INT;
    int minhashPayload = minhash.size() * SIZE_OF_ENTRY;
    return header + hllPayload + minhashHeader + minhashPayload;
}
/**
 * Returns the number of distinct elements represented by this digest.
 *
 * @return {@code minhash.size()} when {@link #isExact()} is true; otherwise the value reported
 *         by {@code hll.cardinality()}
 */
public long cardinality() {
    // The int size is widened to long by the conditional expression.
    return isExact() ? minhash.size() : hll.cardinality();
}
/**
 * Returns the number of distinct elements represented by this digest.
 *
 * @return {@code minhash.size()} when {@link #isExact()} is true; otherwise the value reported
 *         by {@code hll.cardinality()}
 */
public long cardinality() {
    // The int size is widened to long by the conditional expression.
    return isExact() ? minhash.size() : hll.cardinality();
}
/**
 * Records one occurrence of {@code hash} in the min-hash map.
 *
 * <p>The counter stored for {@code hash} is incremented unless it has already reached
 * {@link Short#MAX_VALUE}. Afterwards the last key of the map is removed repeatedly until the
 * map holds at most {@code maxHashes} entries.
 *
 * @param hash the hash value to record
 */
private void addHash(long hash) {
    // For an absent key this yields the map's default return value (presumably 0 — confirm).
    short current = minhash.get(hash);
    boolean saturated = current >= Short.MAX_VALUE;
    if (!saturated) {
        short incremented = (short) (current + 1);
        minhash.put(hash, incremented);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}
/**
 * Records one occurrence of {@code hash} in the min-hash map.
 *
 * <p>The counter stored for {@code hash} is incremented unless it has already reached
 * {@link Short#MAX_VALUE}. Afterwards the last key of the map is removed repeatedly until the
 * map holds at most {@code maxHashes} entries.
 *
 * @param hash the hash value to record
 */
private void addHash(long hash) {
    // For an absent key this yields the map's default return value (presumably 0 — confirm).
    short current = minhash.get(hash);
    boolean saturated = current >= Short.MAX_VALUE;
    if (!saturated) {
        short incremented = (short) (current + 1);
        minhash.put(hash, incremented);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}
/**
 * Merges {@code other} into this digest.
 *
 * <p>The HyperLogLog of {@code other} is merged into this one. Then, for every key in
 * {@code other}'s min-hash map, the counts from both maps are added and stored here after a
 * saturating cast to {@code short}. Finally the last key of the map is removed repeatedly until
 * the map holds at most {@code maxHashes} entries.
 *
 * @param other the digest whose contents are folded into this one
 */
public void mergeWith(SetDigest other) {
    hll.mergeWith(other.hll);

    for (LongBidirectionalIterator keys = other.minhash.keySet().iterator(); keys.hasNext(); ) {
        long hash = keys.nextLong();
        // Summed as int so the addition itself cannot overflow before the saturating cast.
        int combined = minhash.get(hash) + other.minhash.get(hash);
        short capped = Shorts.saturatedCast(combined);
        minhash.put(hash, capped);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}
/**
 * Merges {@code other} into this digest.
 *
 * <p>The HyperLogLog of {@code other} is merged into this one. Then, for every key in
 * {@code other}'s min-hash map, the counts from both maps are added and stored here after a
 * saturating cast to {@code short}. Finally the last key of the map is removed repeatedly until
 * the map holds at most {@code maxHashes} entries.
 *
 * @param other the digest whose contents are folded into this one
 */
public void mergeWith(SetDigest other) {
    hll.mergeWith(other.hll);

    for (LongBidirectionalIterator keys = other.minhash.keySet().iterator(); keys.hasNext(); ) {
        long hash = keys.nextLong();
        // Summed as int so the addition itself cannot overflow before the saturating cast.
        int combined = minhash.get(hash) + other.minhash.get(hash);
        short capped = Shorts.saturatedCast(combined);
        minhash.put(hash, capped);
    }

    while (maxHashes < minhash.size()) {
        long lastKey = minhash.lastLongKey();
        minhash.remove(lastKey);
    }
}