/**
 * Records one element's hashes in the bloom filters, first evicting a random bit from each
 * filter with a probability equal to that filter's current load.
 *
 * <p>For every position {@code i} of {@code hashBuffer}, the filter {@code bloomFilters[i]} is
 * updated: with probability {@code bitCount / bitSize} one uniformly chosen bit is cleared,
 * and then the bit at {@code hashBuffer[i] % bitSize} is set.
 *
 * @param bloomFilters the filters to update; indexed in parallel with {@code hashBuffer}
 * @param hashBuffer   one combined hash per filter
 * @param random       source of randomness for the eviction decision and the evicted position
 */
private void setHashBuffer(BitArray[] bloomFilters, int[] hashBuffer, SplittableRandom random) {
    for (int i = 0; i < hashBuffer.length; i++) {
        final BitArray filter = bloomFilters[i];

        // Fraction of bits currently set; the fuller the filter, the likelier an eviction.
        final double load = ((double) filter.bitCount()) / ((double) filter.bitSize());
        final boolean evict = random.nextDouble() < load;
        if (evict) {
            filter.clear(random.nextLong(filter.bitSize()));
        }

        // NOTE(review): '%' keeps the sign of the dividend, so this assumes the hashes in
        // hashBuffer are non-negative - confirm against the code that fills the buffer.
        filter.set(hashBuffer[i] % filter.bitSize());
    }
}
/**
 * Checks whether every hash in {@code hashBuffer} is present in its corresponding filter.
 *
 * @param bloomFilters the filters to probe; indexed in parallel with {@code hashBuffer}
 * @param hashBuffer   one combined hash per filter
 * @return {@code true} only if, for every position {@code i}, the bit
 *         {@code hashBuffer[i] % bitSize} of {@code bloomFilters[i]} is set; {@code false} as
 *         soon as one unset bit is found
 */
private boolean containsHashBuffer(BitArray[] bloomFilters, int[] hashBuffer) {
    for (int i = 0; i < hashBuffer.length; i++) {
        final BitArray filter = bloomFilters[i];
        final boolean present = filter.get(hashBuffer[i] % filter.bitSize());
        if (!present) {
            // A single missing bit is enough to rule the element out.
            return false;
        }
    }
    return true;
}
/**
 * Computes the mean load (fraction of set bits) across the given bloom filters.
 *
 * @param bloomFilters the filters to inspect
 * @return the arithmetic mean of {@code bitCount / bitSize} over all filters
 */
private double calculateAverageLoad(BitArray[] bloomFilters) {
    double totalLoad = 0D;
    for (BitArray bloomFilter : bloomFilters) {
        // Divide as doubles: bitCount() and bitSize() are integral, so a plain '/' would
        // truncate every partially filled filter's load to 0.
        totalLoad += ((double) bloomFilter.bitCount()) / ((double) bloomFilter.bitSize());
    }
    return totalLoad / bloomFilters.length;
}
/**
 * Exercises get/set/clear on a single bit, including the boolean each mutator returns:
 * {@code true} when the call changed the bit and {@code false} when it was a no-op.
 */
@Test
public void testGetSetClear() {
    final long index = 0L;
    final BitArray bits = new BitArray(64L);

    // A fresh array starts with the bit unset.
    assertFalse(bits.get(index));

    // First set flips the bit; a repeated set changes nothing.
    assertTrue(bits.set(index));
    assertTrue(bits.get(index));
    assertFalse(bits.set(index));
    assertTrue(bits.get(index));

    // First clear flips it back; a repeated clear changes nothing.
    assertTrue(bits.clear(index));
    assertFalse(bits.get(index));
    assertFalse(bits.clear(index));
    assertFalse(bits.get(index));
}
/**
 * Verifies that {@code bitCount()} tracks effective set/clear operations and is left
 * untouched by redundant ones.
 */
@Test
public void testBitCount() {
    final long index = 0L;
    final BitArray bits = new BitArray(64L);
    assertEquals(0L, bits.bitCount());

    // Setting an unset bit raises the count to one.
    assertTrue(bits.set(index));
    assertEquals(1L, bits.bitCount());

    // Setting it again is a no-op for the count.
    assertFalse(bits.set(index));
    assertEquals(1L, bits.bitCount());

    // Clearing the bit brings the count back to zero.
    assertTrue(bits.clear(index));
    assertEquals(0L, bits.bitCount());

    // Clearing it again is a no-op for the count.
    assertFalse(bits.clear(index));
    assertEquals(0L, bits.bitCount());
}
/**
 * Round-trips a bit array through {@code writeTo} / {@code readFrom} using in-memory streams
 * and expects the deserialized instance to equal the original.
 */
@Test
public void testWriteToReadFrom() throws IOException {
    final BitArray original = new BitArray(64L);
    assertTrue(original.set(0L));

    // Serialize into an in-memory buffer.
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.writeTo(new DataOutputStream(sink));
    sink.close();

    // Deserialize from the bytes that were just written.
    final ByteArrayInputStream source = new ByteArrayInputStream(sink.toByteArray());
    final BitArray restored = BitArray.readFrom(new DataInputStream(source));
    source.close();

    assertEquals(original, restored);
}
}
System.out.println(String.format("ProbabilisticDeDuplicator Number of Hash Functions: %d.", NUM_HASH_FUNCTIONS)); final ProbabilisticDeDuplicator deDuplicator = new RLBSBFDeDuplicator(NUM_BITS, NUM_HASH_FUNCTIONS); final BitArray universeBitArray = new BitArray(STREAM_SIZE); final byte[] elementBytes = new byte[LONG_BYTES]; long numFp = 0L; final long startTime = System.currentTimeMillis(); for (long counter = 1; counter <= STREAM_SIZE; counter++) { final long randomNumber = ThreadLocalRandom.current().nextLong(universeBitArray.bitSize()); fillElementBytes(randomNumber, elementBytes); if (deDuplicator.classifyDistinct(elementBytes)) { if (universeBitArray.get(randomNumber)) { numFn++; if (!universeBitArray.get(randomNumber)) { numFp++; universeBitArray.set(randomNumber);
/**
 * Records one element's hashes in the bloom filters, unconditionally evicting one random bit
 * from each filter before its new bit is set.
 *
 * <p>For every position {@code i} of {@code hashBuffer}, one uniformly chosen bit of
 * {@code bloomFilters[i]} is cleared and then the bit at {@code hashBuffer[i] % bitSize} is
 * set.
 *
 * @param bloomFilters the filters to update; indexed in parallel with {@code hashBuffer}
 * @param hashBuffer   one combined hash per filter
 * @param random       source of randomness for the evicted position
 */
private void setHashBuffer(BitArray[] bloomFilters, int[] hashBuffer, SplittableRandom random) {
    for (int i = 0; i < hashBuffer.length; i++) {
        final BitArray filter = bloomFilters[i];

        // Evict first, then insert, so the freshly set bit is never the one that is cleared.
        final long evictedPosition = random.nextLong(filter.bitSize());
        filter.clear(evictedPosition);

        // NOTE(review): '%' keeps the sign of the dividend, so this assumes the hashes in
        // hashBuffer are non-negative - confirm against the code that fills the buffer.
        filter.set(hashBuffer[i] % filter.bitSize());
    }
}
/**
 * Reads a bit array from the stream: an {@code int} word count followed by that many
 * {@code long} words, which are handed to the {@code long[]} constructor.
 *
 * @param in the stream to read from
 * @return a bit array backed by the words that were read
 * @throws IOException if reading fails or the stream declares a negative word count
 */
public static BitArray readFrom(DataInputStream in) throws IOException {
    final int numWords = in.readInt();
    if (numWords < 0) {
        // Corrupt input must surface as an I/O problem, not as an unchecked
        // NegativeArraySizeException from the array allocation below.
        final String error = String.format("Invalid BitArray word count (%d), expected a non-negative value", numWords);
        throw new IOException(error);
    }
    final long[] data = new long[numWords];
    for (int i = 0; i < numWords; i++) {
        data[i] = in.readLong();
    }
    return new BitArray(data);
}
// @formatter:on
/**
 * Deserializes a {@code BSBFDeDuplicator} from the stream.
 *
 * <p>Fields are read in this order: version ({@code int}), number of bits ({@code long}),
 * number of hash functions ({@code int}), one bit array per hash function, and finally the
 * reported duplicate probability ({@code double}).
 *
 * @param in the stream to read from
 * @return the reconstructed de-duplicator
 * @throws IOException if reading fails or the serialized version differs from {@code version()}
 */
@Override
public BSBFDeDuplicator readFrom(InputStream in) throws IOException {
    final DataInputStream input = new DataInputStream(in);

    // Refuse payloads whose version does not match this serializer's version.
    final int actualVersion = input.readInt();
    if (actualVersion != version()) {
        throw new IOException(String.format(
            "Unexpected ProbabilisticDeDuplicator version number (%d), expected %d",
            actualVersion,
            version()
        ));
    }

    final long numBits = input.readLong();
    final int numHashFunctions = input.readInt();

    // One serialized bit array per hash function, in order.
    final BitArray[] filters = new BitArray[numHashFunctions];
    int next = 0;
    while (next < numHashFunctions) {
        filters[next] = BitArray.readFrom(input);
        next++;
    }

    final double duplicateProbability = input.readDouble();
    return new BSBFDeDuplicator(numBits, numHashFunctions, filters, duplicateProbability);
}
};
/**
 * Serializes a {@code BSBFDeDuplicator} to the stream.
 *
 * <p>Fields are written in this order: version ({@code int}), number of bits ({@code long}),
 * number of hash functions ({@code int}), every bloom filter, and finally the reported
 * duplicate probability ({@code double}).
 *
 * @param probabilisticDeDuplicator the de-duplicator to serialize
 * @param out                       the stream to write to
 * @throws IOException if writing fails
 */
@Override
public void writeTo(BSBFDeDuplicator probabilisticDeDuplicator, OutputStream out) throws IOException {
    final DataOutputStream output = new DataOutputStream(out);

    // Header: serializer version followed by the sizing parameters.
    output.writeInt(version());
    output.writeLong(probabilisticDeDuplicator.numBits);
    output.writeInt(probabilisticDeDuplicator.numHashFunctions);

    // Payload: each bloom filter in array order.
    final BitArray[] filters = probabilisticDeDuplicator.bloomFilters;
    for (int i = 0; i < filters.length; i++) {
        filters[i].writeTo(output);
    }

    // Trailer: the running duplicate-probability estimate.
    output.writeDouble(probabilisticDeDuplicator.reportedDuplicateProbability);
}
/**
 * Advances {@code reportedDuplicateProbability} by one step of the recurrence
 *
 * <pre>
 * X_{m+1} = \left[ \left(X_m\right)^{\frac{1}{k}}
 *           \left\{ X_m + \left( 1 - X_m \right) \left( 1 - \frac{1}{ks} \right) \right\}
 *           + \left( 1 - X_m \right) \frac{1}{s} \right]^k
 * </pre>
 *
 * where {@code k} is the number of bloom filters, {@code s} is the bit size of the first
 * filter, and {@code X_m} is the current value of {@code reportedDuplicateProbability}.
 */
private void updateReportedDuplicateProbability() {
    final double filterCount = bloomFilters.length;
    final double filterBits = bloomFilters[0].bitSize();
    final double current = reportedDuplicateProbability;
    final double complement = 1D - current;

    // (X_m)^(1/k)
    final double rootTerm = Math.pow(current, 1D / filterCount);
    // X_m + (1 - X_m) * (1 - 1/(k*s))
    final double retainedTerm = current + complement * (1D - (1D / (filterCount * filterBits)));
    // (1 - X_m) * (1/s)
    final double collisionTerm = complement * (1D / filterBits);

    final double base = rootTerm * retainedTerm + collisionTerm;
    reportedDuplicateProbability = Math.pow(base, filterCount);
}
/**
 * Verifies that {@code reset()} empties every bloom filter and zeroes the reported duplicate
 * probability after an element has been classified.
 */
@Test
public void testReset() {
    final BSBFDeDuplicator subject = new BSBFDeDuplicator(64L, 2);

    // Feed one random 128-byte element so the filters hold some state.
    final byte[] payload = new byte[128];
    new Random().nextBytes(payload);
    assertTrue(subject.classifyDistinct(payload));

    subject.reset();

    // After the reset no filter may have any bit set.
    final BitArray[] filters = subject.bloomFilters;
    for (int i = 0; i < filters.length; i++) {
        assertEquals(0L, filters[i].bitCount());
    }
    assertEquals(0D, subject.reportedDuplicateProbability, 0);
}
/**
 * Creates a bit array by allocating a zero-filled {@code long[]} of {@code numWords(numBits)}
 * words and delegating to the {@code long[]} constructor.
 *
 * @param numBits the requested size, passed to {@code numWords} to determine the backing
 *                array length; presumably the number of addressable bits - confirm against
 *                {@code numWords}
 */
public BitArray(long numBits) { this(new long[numWords(numBits)]); }
/**
 * Records one element's hashes in the bloom filters after evicting a single random bit from
 * a single randomly chosen filter.
 *
 * <p>One filter index is drawn from {@code [0, hashBuffer.length)} and one uniformly chosen
 * bit of that filter is cleared. Afterwards, for every position {@code i}, the bit at
 * {@code hashBuffer[i] % bitSize} of {@code bloomFilters[i]} is set.
 *
 * @param bloomFilters the filters to update; indexed in parallel with {@code hashBuffer}
 * @param hashBuffer   one combined hash per filter
 * @param random       source of randomness for the evicted filter and position
 */
private void setHashBuffer(BitArray[] bloomFilters, int[] hashBuffer, SplittableRandom random) {
    final int count = hashBuffer.length;

    // Exactly one eviction per insertion: pick the victim filter, then the victim bit.
    final BitArray victim = bloomFilters[random.nextInt(count)];
    final long victimPosition = random.nextLong(victim.bitSize());
    victim.clear(victimPosition);

    for (int i = 0; i < count; i++) {
        final BitArray filter = bloomFilters[i];
        // NOTE(review): '%' keeps the sign of the dividend, so this assumes the hashes in
        // hashBuffer are non-negative - confirm against the code that fills the buffer.
        filter.set(hashBuffer[i] % filter.bitSize());
    }
}
/**
 * Allocates {@code numHashFunctions} bloom filters, each sized to an equal share
 * ({@code numBits / numHashFunctions}, integer division) of the total bit budget.
 *
 * @param numBits          total number of bits to distribute across the filters
 * @param numHashFunctions number of filters to create
 * @return a new array holding {@code numHashFunctions} freshly created bit arrays
 * @throws IllegalArgumentException if either argument is not positive, or if {@code numBits}
 *                                  is smaller than {@code numHashFunctions} so that a filter
 *                                  would receive zero bits
 */
private static BitArray[] bloomFilters(long numBits, int numHashFunctions) {
    if (numBits <= 0L) {
        final String error = String.format("numBits must be positive, but got %d", numBits);
        throw new IllegalArgumentException(error);
    }
    if (numHashFunctions <= 0) {
        final String error = String.format("numHashFunctions must be positive, but got %d", numHashFunctions);
        throw new IllegalArgumentException(error);
    }

    final long bloomFilterBits = numBits / numHashFunctions;
    if (bloomFilterBits <= 0L) {
        // Integer division rounded the per-filter share down to zero. Fail fast here instead
        // of creating filters whose bit size of 0 breaks later modulo / random-bound calls.
        final String error = String.format(
            "numBits (%d) must be at least numHashFunctions (%d) so that every bloom filter has at least one bit",
            numBits,
            numHashFunctions
        );
        throw new IllegalArgumentException(error);
    }

    final BitArray[] bloomFilters = new BitArray[numHashFunctions];
    for (int index = 0; index < numHashFunctions; index++) {
        bloomFilters[index] = new BitArray(bloomFilterBits);
    }
    return bloomFilters;
}
/**
 * Deserializes a {@code BSBFSDDeDuplicator} from the stream.
 *
 * <p>The layout is: version ({@code int}), number of bits ({@code long}), number of hash
 * functions ({@code int}), one bit array per hash function, then the reported duplicate
 * probability ({@code double}).
 *
 * @param in the stream to read from
 * @return the reconstructed de-duplicator
 * @throws IOException if reading fails or the serialized version differs from {@code version()}
 */
@Override
public BSBFSDDeDuplicator readFrom(InputStream in) throws IOException {
    final DataInputStream reader = new DataInputStream(in);

    // Guard clause: a version mismatch aborts before any further bytes are consumed.
    final int foundVersion = reader.readInt();
    if (foundVersion != version()) {
        final String message = String.format(
            "Unexpected ProbabilisticDeDuplicator version number (%d), expected %d",
            foundVersion,
            version()
        );
        throw new IOException(message);
    }

    final long numBits = reader.readLong();
    final int numHashFunctions = reader.readInt();

    // Filters were written back to back, one per hash function.
    final BitArray[] restoredFilters = new BitArray[numHashFunctions];
    for (int slot = 0; slot < restoredFilters.length; slot++) {
        restoredFilters[slot] = BitArray.readFrom(reader);
    }

    final double probability = reader.readDouble();
    return new BSBFSDDeDuplicator(numBits, numHashFunctions, restoredFilters, probability);
}
};
/**
 * Serializes an {@code RLBSBFDeDuplicator} to the stream.
 *
 * <p>The layout is: version ({@code int}), number of bits ({@code long}), number of hash
 * functions ({@code int}), every bloom filter in array order, then the reported duplicate
 * probability ({@code double}).
 *
 * @param probabilisticDeDuplicator the de-duplicator to serialize
 * @param out                       the stream to write to
 * @throws IOException if writing fails
 */
@Override
public void writeTo(RLBSBFDeDuplicator probabilisticDeDuplicator, OutputStream out) throws IOException {
    final DataOutputStream writer = new DataOutputStream(out);

    writer.writeInt(version());
    writer.writeLong(probabilisticDeDuplicator.numBits);
    writer.writeInt(probabilisticDeDuplicator.numHashFunctions);

    // Each filter serializes itself onto the shared data stream.
    final BitArray[] filters = probabilisticDeDuplicator.bloomFilters;
    int position = 0;
    while (position < filters.length) {
        filters[position].writeTo(writer);
        position++;
    }

    writer.writeDouble(probabilisticDeDuplicator.reportedDuplicateProbability);
}
/**
 * Advances {@code reportedDuplicateProbability} by one step of the recurrence
 *
 * <pre>
 * X_{m+1} = \left[ \left(X_m\right)^{\frac{1}{k}}
 *           \left\{ X_m + \left( 1 - X_m \right) \left( 1 - \frac{1}{s} \right) \right\}
 *           + \left( 1 - X_m \right) \frac{1}{s} \right]^k
 * </pre>
 *
 * where {@code k} is the number of bloom filters, {@code s} is the bit size of the first
 * filter, and {@code X_m} is the current value of {@code reportedDuplicateProbability}.
 */
private void updateReportedDuplicateProbability() {
    final double filterCount = bloomFilters.length;
    final double filterBits = bloomFilters[0].bitSize();
    final double current = reportedDuplicateProbability;
    final double complement = 1D - current;

    // (X_m)^(1/k)
    final double rootTerm = Math.pow(current, 1D / filterCount);
    // X_m + (1 - X_m) * (1 - 1/s)
    final double retainedTerm = current + complement * (1D - (1D / filterBits));
    // (1 - X_m) * (1/s)
    final double collisionTerm = complement * (1D / filterBits);

    final double base = rootTerm * retainedTerm + collisionTerm;
    reportedDuplicateProbability = Math.pow(base, filterCount);
}