/**
 * Merges another digest into this one: combines the HLL sketches, sums the
 * per-hash occurrence counts (saturating at Short.MAX_VALUE), and then trims
 * the minhash map back down to at most maxHashes entries by evicting the
 * largest keys.
 */
public void mergeWith(SetDigest other)
{
    hll.mergeWith(other.hll);
    for (LongBidirectionalIterator it = other.minhash.keySet().iterator(); it.hasNext(); ) {
        long hash = it.nextLong();
        // Absent keys read as the map default, so this also inserts new hashes.
        int combined = minhash.get(hash) + other.minhash.get(hash);
        minhash.put(hash, Shorts.saturatedCast(combined));
    }
    // Restore the minhash invariant: keep only the maxHashes smallest hashes.
    while (minhash.size() > maxHashes) {
        minhash.remove(minhash.lastLongKey());
    }
}
/**
 * Estimates the Jaccard index of the sets underlying the two digests using
 * their minhash signatures: walks the smallest k hashes of the union of both
 * signatures (k = size of the smaller signature) and returns the fraction
 * found in both.
 */
public static double jaccardIndex(SetDigest a, SetDigest b)
{
    int limit = Math.min(a.minhash.size(), b.minhash.size());
    // Sorted union of both signatures; iteration yields hashes in ascending order.
    LongSortedSet unionOfHashes = new LongRBTreeSet(a.minhash.keySet());
    unionOfHashes.addAll(b.minhash.keySet());
    int matches = 0;
    int examined = 0;
    for (long hash : unionOfHashes) {
        if (a.minhash.containsKey(hash) && b.minhash.containsKey(hash)) {
            matches++;
        }
        examined++;
        // Only the smallest `limit` hashes of the union participate in the estimate.
        if (examined >= limit) {
            break;
        }
    }
    return matches / (double) limit;
}
/**
 * Serializes this digest into a Slice using the uncompressed wire layout:
 * format byte, HLL length prefix, HLL bytes, maxHashes, minhash entry count,
 * then all minhash keys followed by all minhash values. Keys and values are
 * written by separate passes over the same map, so they line up positionally
 * on deserialization (iteration order is the map's own order; sorted for an
 * RB-tree map — confirm against the minhash field's declared type).
 */
public Slice serialize()
{
    try (SliceOutput output = new DynamicSliceOutput(estimatedSerializedSize())) {
        output.appendByte(UNCOMPRESSED_FORMAT);
        Slice serializedHll = hll.serialize();
        // Length-prefix the HLL so the deserializer knows where it ends.
        output.appendInt(serializedHll.length());
        output.appendBytes(serializedHll);
        output.appendInt(maxHashes);
        output.appendInt(minhash.size());
        for (long key : minhash.keySet()) {
            output.appendLong(key);
        }
        for (short value : minhash.values()) {
            output.appendShort(value);
        }
        return output.slice();
    }
    catch (IOException e) {
        // SliceOutput.close() declares IOException; not expected for an in-memory buffer.
        throw new UncheckedIOException(e);
    }
}
/**
 * Records one occurrence of the given 64-bit hash in the minhash signature,
 * saturating the per-hash count at Short.MAX_VALUE, then evicts the largest
 * hashes until at most maxHashes entries remain.
 */
private void addHash(long hash)
{
    short current = minhash.get(hash);
    // Skip the increment once the counter is saturated; avoids short overflow.
    if (current != Short.MAX_VALUE) {
        minhash.put(hash, (short) (current + 1));
    }
    // Minhash invariant: retain only the maxHashes smallest hashes seen so far.
    while (minhash.size() > maxHashes) {
        minhash.remove(minhash.lastLongKey());
    }
}
/**
 * Computes the exact cardinality of the intersection of two digests.
 * Both digests must still be exact, i.e. their minhash maps must hold
 * every distinct hash they have seen.
 *
 * @throws IllegalArgumentException if either digest is approximate
 */
public static long exactIntersectionCardinality(SetDigest a, SetDigest b)
{
    // Both inputs are method arguments, so validate both with checkArgument.
    // The original used checkState for 'a', which would throw
    // IllegalStateException for what is really an illegal-argument condition
    // and was inconsistent with the check on 'b'.
    checkArgument(a.isExact(), "exact intersection cannot operate on approximate sets");
    checkArgument(b.isExact(), "exact intersection cannot operate on approximate sets");
    return Sets.intersection(a.minhash.keySet(), b.minhash.keySet()).size();
}
public boolean isExact() { // There's an ambiguity when minhash.size() == maxHashes, since this could either // be an exact set with maxHashes elements, or an inexact one. Which is why strict // inequality is used here. return minhash.size() < maxHashes; }
// Primitive-returning variant of lastKey(); delegates to the wrapped sorted map.
@Override public long lastLongKey() { return sortedMap.lastLongKey(); } /**
// Delegates comparator() to the backing map (per the SortedMap contract this
// may be null when natural ordering is used -- confirm for this wrapper).
@Override public LongComparator comparator() { return sortedMap.comparator(); } @Override
// Lazily creates and caches an unmodifiable view of the backing map's key set;
// subsequent calls return the cached view. NOTE(review): the lazy init is not
// synchronized here -- presumably this is the unsynchronized wrapper variant;
// confirm against the enclosing class.
@Override public LongSortedSet keySet() { if (keys == null) keys = LongSortedSets.unmodifiable(sortedMap.keySet()); return (LongSortedSet) keys; } @Override
/**
 * Rough in-memory footprint in bytes: the HLL's own estimate, plus fixed
 * overhead for this object and its RB-tree map, plus a per-entry cost for
 * each minhash entry.
 */
public int estimatedInMemorySize()
{
    int fixedOverhead = SIZE_OF_SETDIGEST + SIZE_OF_RBTREEMAP;
    int minhashEntries = minhash.size() * SIZE_OF_ENTRY;
    return hll.estimatedInMemorySize() + minhashEntries + fixedOverhead;
}
// Synchronized delegation: reads the backing map's last key under the shared lock.
@Override public long lastLongKey() { synchronized (sync) { return sortedMap.lastLongKey(); } } /**
// Synchronized delegation of comparator() to the backing map.
@Override public LongComparator comparator() { synchronized (sync) { return sortedMap.comparator(); } } @Override
/**
 * Estimates the Jaccard index of the sets underlying the two digests via
 * their minhash signatures: scans the k smallest hashes of the union of both
 * signatures (k = size of the smaller signature) and counts how many occur
 * in both.
 *
 * NOTE(review): when both minhash maps are empty this returns 0/0.0 = NaN --
 * confirm callers tolerate that.
 */
public static double jaccardIndex(SetDigest a, SetDigest b)
{
    int sizeOfSmallerSet = Math.min(a.minhash.size(), b.minhash.size());
    // Sorted union of both signatures; iteration yields hashes in ascending order.
    LongSortedSet minUnion = new LongRBTreeSet(a.minhash.keySet());
    minUnion.addAll(b.minhash.keySet());
    int intersection = 0;
    int i = 0;
    for (long key : minUnion) {
        if (a.minhash.containsKey(key) && b.minhash.containsKey(key)) {
            intersection++;
        }
        i++;
        // Only the smallest sizeOfSmallerSet hashes of the union are examined.
        if (i >= sizeOfSmallerSet) {
            break;
        }
    }
    return intersection / (double) sizeOfSmallerSet;
}
/**
 * Serializes this digest to a Slice in the uncompressed format: a format
 * byte, the length-prefixed HLL bytes, maxHashes, the entry count, then all
 * minhash keys followed by all minhash values, appended in the map's
 * iteration order so the two runs line up positionally.
 */
public Slice serialize()
{
    try (SliceOutput out = new DynamicSliceOutput(estimatedSerializedSize())) {
        out.appendByte(UNCOMPRESSED_FORMAT);
        Slice hllBytes = hll.serialize();
        out.appendInt(hllBytes.length());
        out.appendBytes(hllBytes);
        out.appendInt(maxHashes);
        out.appendInt(minhash.size());
        for (long hash : minhash.keySet()) {
            out.appendLong(hash);
        }
        for (short count : minhash.values()) {
            out.appendShort(count);
        }
        return out.slice();
    }
    catch (IOException e) {
        // close() declares IOException; wrap since no checked exception is expected here.
        throw new UncheckedIOException(e);
    }
}
// Key-set view of the tail map: keys greater than or equal to `from`,
// following the SortedMap tailMap contract.
@Override public LongSortedSet tailSet(final long from) { return tailMap(from).keySet(); } @Override
/**
 * Estimated byte size of serialize() output: format byte + HLL length prefix
 * + HLL payload, then the maxHashes and entry-count ints plus a fixed cost
 * per minhash entry.
 */
public int estimatedSerializedSize()
{
    int header = SIZE_OF_BYTE + SIZE_OF_INT + hll.estimatedSerializedSize();
    int minhashSection = 2 * SIZE_OF_INT + minhash.size() * SIZE_OF_ENTRY;
    return header + minhashSection;
}
/**
 * {@inheritDoc}
 *
 * @deprecated Please use the corresponding type-specific method instead.
 */
@Deprecated
@Override
default Long lastKey() {
    // Boxing bridge retained for the generic SortedMap interface.
    return Long.valueOf(lastLongKey());
}
/**
/**
 * Creates a new tree map copying a given sorted map (and its
 * {@link Comparator}).
 *
 * @param m a type-specific sorted map to be copied into the new tree map.
 */
public Long2ShortRBTreeMap(final Long2ShortSortedMap m) {
    // Adopt the source map's comparator first so entries are inserted under
    // the same ordering they had in the source map.
    this(m.comparator());
    putAll(m);
}
/**
/**
 * Records one occurrence of the given hash in the minhash signature, then
 * evicts the largest hashes until at most maxHashes entries remain.
 */
private void addHash(long hash)
{
    // Absent keys read as the map's default return value -- presumably 0
    // (fastutil's default), so new hashes start at count 1; confirm no custom
    // defaultReturnValue is configured on minhash.
    short value = minhash.get(hash);
    // Saturate at Short.MAX_VALUE instead of overflowing the short counter.
    if (value < Short.MAX_VALUE) {
        minhash.put(hash, (short) (value + 1));
    }
    // Minhash invariant: keep only the maxHashes smallest hashes.
    while (minhash.size() > maxHashes) {
        minhash.remove(minhash.lastLongKey());
    }
}
/**
 * Minhash estimate of the Jaccard index of the two digests' underlying sets:
 * inspects the smallest hashes of the combined signatures, bounded by the
 * size of the smaller signature, and returns the fraction present in both.
 */
public static double jaccardIndex(SetDigest a, SetDigest b)
{
    int budget = Math.min(a.minhash.size(), b.minhash.size());
    LongSortedSet allHashes = new LongRBTreeSet(a.minhash.keySet());
    allHashes.addAll(b.minhash.keySet());
    int common = 0;
    int remaining = budget;
    for (long hash : allHashes) {
        boolean inBoth = a.minhash.containsKey(hash) && b.minhash.containsKey(hash);
        if (inBoth) {
            common++;
        }
        remaining--;
        // Stop once the smallest `budget` hashes have been examined.
        if (remaining <= 0) {
            break;
        }
    }
    return common / (double) budget;
}