private void initBloomFilter(int numBuckets) {
    int avgNumRecordsPerBucket = getEstimatedMaxBucketEntries(
            this.availableMemory.size(), this.segmentSize, numBuckets, this.avgRecordLen);
    // Use all of the bucket's bytes for the bloom filter, except for the bucket header.
    int byteSize = HASH_BUCKET_SIZE - BUCKET_HEADER_LENGTH;
    this.bloomFilter = new BloomFilter(avgNumRecordsPerBucket, byteSize);
    if (LOG.isDebugEnabled()) {
        // byteSize << 3 converts bytes to bits for the probability estimate.
        double fpp = BloomFilter.estimateFalsePositiveProbability(avgNumRecordsPerBucket, byteSize << 3);
        LOG.debug(String.format(
                "Created BloomFilter with average input entries per bucket [%d], byte size [%d], false positive probability [%f].",
                avgNumRecordsPerBucket, byteSize, fpp));
    }
}
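For intuition about the sizing, here is a minimal standalone sketch that plugs in illustrative constants. HASH_BUCKET_SIZE = 128 and BUCKET_HEADER_LENGTH = 16 are assumptions for the example (the real values come from the hash table's constants), and the optimal-k formula is assumed to match optimalNumOfHashFunctions in the source.

// Hedged sizing sketch, not the hash table's code. Assumes a 128-byte bucket
// with a 16-byte header, leaving 112 bytes (896 bits) for the filter.
public class BloomFilterSizingSketch {
    public static void main(String[] args) {
        int hashBucketSize = 128;      // assumed value of HASH_BUCKET_SIZE
        int bucketHeaderLength = 16;   // assumed value of BUCKET_HEADER_LENGTH
        int byteSize = hashBucketSize - bucketHeaderLength; // 112 bytes
        int bitSize = byteSize << 3;                        // 896 bits
        long entries = 20;                                  // illustrative records per bucket

        // Standard optimal-k formula k = round(m/n * ln 2), assumed to match
        // optimalNumOfHashFunctions.
        int k = Math.max(1, (int) Math.round((double) bitSize / entries * Math.log(2)));
        double p = Math.exp(-(double) k * entries / bitSize);
        double fpp = Math.pow(1 - p, k);
        System.out.printf("k=%d, fpp=%.2e%n", k, fpp); // k=31, fpp ~ 5e-10
    }
}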
/**
 * @return false if the filter has been asked to hold more elements than it was sized for,
 *         i.e. its expected accuracy can no longer be relied on.
 */
boolean addHash(int hash) {
    setLocation(hash);
    filter.addHash(hash);
    size++;
    return size <= maxSize;
}
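A typical caller reacts to a false return by abandoning the filter rather than trusting a degraded one. A hedged sketch of that pattern (the caller and releaseBloomFilter() are hypothetical, not from the source):

// Sketch of how the return value might be used during the build phase.
if (bloomFilter != null && !bloomFilter.addHash(hashCode)) {
    // Too many elements for the configured size: the false positive rate is no
    // longer bounded, so stop maintaining the filter rather than trust it.
    releaseBloomFilter(); // hypothetical helper that frees the filter's buffers
    bloomFilter = null;
}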
private void setLocation(int hash) {
    if (numBuffers > 1) {
        // numBuffers is a power of two, so (hash & numBuffersMask) selects one of
        // the buffers uniformly; with a single buffer the location never changes
        // from the one set in the constructor.
        filter.setBitsLocation(buffers[hash & numBuffersMask], 0);
    }
}
/**
 * Uses all of the bucket's memory except the bucket header as the bit set of the bloom
 * filter, and populates it with the hash codes of the build-side records.
 */
final void buildBloomFilterForBucket(int bucketInSegmentPos, MemorySegment bucket, HashPartition<BT, PT> p) {
    final int count = bucket.getShort(bucketInSegmentPos + HEADER_COUNT_OFFSET);
    if (count <= 0) {
        return;
    }

    int[] hashCodes = new int[count];
    // The hash codes and the bloom filter occupy the same bytes, so read all hash
    // codes out first, then overwrite that region with the bloom filter bits.
    for (int i = 0; i < count; i++) {
        hashCodes[i] = bucket.getInt(bucketInSegmentPos + BUCKET_HEADER_LENGTH + i * HASH_CODE_LEN);
    }
    this.bloomFilter.setBitsLocation(bucket, bucketInSegmentPos + BUCKET_HEADER_LENGTH);
    for (int hashCode : hashCodes) {
        this.bloomFilter.addHash(hashCode);
    }
    buildBloomFilterForExtraOverflowSegments(bucketInSegmentPos, bucket, p);
}
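The key trick is in-place reuse: once a partition is spilled, the per-record hash codes stored in the bucket are no longer needed for in-memory lookups, so the very same bytes can be recycled as the filter's bit set. A standalone toy illustration of that read-then-overwrite pattern (plain int[] instead of MemorySegment, and a single crude hash function, purely for demonstration):

// Toy illustration of read-then-overwrite; sizes and the hash are made up.
int[] region = {0x1234ABCD, 0x00C0FFEE, 0x0BADF00D}; // bytes holding hash codes
int[] hashCodes = region.clone();                     // 1) read the hash codes out first
java.util.Arrays.fill(region, 0);                     // 2) reinterpret the bytes as an empty bit set
for (int h : hashCodes) {
    int bit = (h >>> 1) % (region.length * 32);       // toy single-hash bit index
    region[bit / 32] |= 1 << (bit % 32);              // 3) write filter bits over the old bytes
}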
byte status = bucket.get(bucketInSegmentOffset + HEADER_STATUS_OFFSET);
if (status == BUCKET_STATUS_IN_FILTER) {
    // The bucket's partition was spilled and its payload area now holds bloom
    // filter bits; test the probe hash against the filter before buffering.
    this.bloomFilter.setBitsLocation(bucket, bucketInSegmentOffset + BUCKET_HEADER_LENGTH);
    if (this.bloomFilter.testHash(hash)) {
        p.insertIntoProbeBuffer(next);
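The fragment ends inside the positive branch, but the payoff is the branch it omits. A hedged reconstruction of the control flow around the test (inferred from the snippet, not the verbatim source):

if (this.bloomFilter.testHash(hash)) {
    // The hash may be in the spilled partition: keep the probe record so the
    // join can be finished from disk later.
    p.insertIntoProbeBuffer(next);
} else {
    // The filter proves no build-side record has this hash: the probe record
    // is dropped here, saving a disk-based lookup. (Branch inferred, not verbatim.)
}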
/**
 * Computes the expected false positive probability for the given number of input entries
 * and bit set size.
 *
 * <p>Note: this is only the mathematically expected value; the false positive rate actually
 * observed is not guaranteed to stay below the returned estimate.
 *
 * @param inputEntries number of expected distinct entries
 * @param bitSize size of the bloom filter's bit set, in bits
 * @return the estimated false positive probability
 */
public static double estimateFalsePositiveProbability(long inputEntries, int bitSize) {
    int numFunction = optimalNumOfHashFunctions(inputEntries, bitSize);
    double p = Math.pow(Math.E, -(double) numFunction * inputEntries / bitSize);
    return Math.pow(1 - p, numFunction);
}
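This is the standard bloom filter estimate: with n = inputEntries, m = bitSize, and k = numFunction hash functions, the probability that a given bit is still unset after all insertions is approximately e^(-kn/m), so fpp ~ (1 - e^(-kn/m))^k, which is exactly what the two lines above compute. As a worked example: with n = 100 entries in m = 896 bits and k = 6 (assuming optimalNumOfHashFunctions uses the standard k ~ (m/n) * ln 2), p ~ e^(-0.67) ~ 0.51 and fpp ~ 0.49^6 ~ 0.013, i.e. about a 1.3% false positive rate.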
HashTableBloomFilter(MemorySegment[] buffers, long numRecords) {
    checkArgument(buffers != null && buffers.length > 0);
    this.buffers = buffers;
    this.numBuffers = buffers.length;
    checkArgument(MathUtils.isPowerOf2(numBuffers));
    this.numBuffersMask = numBuffers - 1;
    int bufferSize = buffers[0].size();
    this.filter = new BloomFilter((int) (numRecords / numBuffers), bufferSize);
    filter.setBitsLocation(buffers[0], 0);

    // We assume a bloom filter can hold up to 2.44 elements per byte before its
    // false positive probability degrades to roughly 0.2.
    this.maxSize = (int) ((numBuffers * bufferSize) * 2.44);
}
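Where the 2.44 elements per byte comes from (a back-of-envelope check, not stated in the source): for an optimally configured filter, fpp = 2^(-(m/n) * ln 2), which inverts to n/m = (ln 2)^2 / (-ln fpp) elements per bit. At the target fpp of 0.2 that is about 0.30 elements per bit, roughly 2.4 elements per byte, matching the constant up to rounding.

// Back-of-envelope check of the 2.44 elements-per-byte rule (fpp target 0.2):
double targetFpp = 0.2;
double elementsPerBit = Math.pow(Math.log(2), 2) / -Math.log(targetFpp); // ~0.299
System.out.printf("%.2f elements per byte%n", 8 * elementsPerBit);       // ~2.39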
boolean testHash(int hash) {
    setLocation(hash);
    return filter.testHash(hash);
}
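End to end, the filter is populated with build-side hashes and consulted with probe-side hashes. A hedged usage sketch (segments, expectedBuildRecords, and the two hash streams are placeholders; only the constructor, addHash, and testHash correspond to the API shown above):

// Build then probe, as a sketch.
HashTableBloomFilter bf = new HashTableBloomFilter(segments, expectedBuildRecords);
for (int buildHash : buildSideHashes) {
    if (!bf.addHash(buildHash)) {
        bf = null; // overfull: accuracy no longer bounded, stop using the filter
        break;
    }
}
for (int probeHash : probeSideHashes) {
    if (bf != null && !bf.testHash(probeHash)) {
        continue; // definitely not on the build side: skip this probe record
    }
    // ... perform the actual (possibly spilled) lookup ...
}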