private void initBloomFilter(int numBuckets) {
    int avgNumRecordsPerBucket = getEstimatedMaxBucketEntries(
            this.availableMemory.size(), this.segmentSize, numBuckets, this.avgRecordLen);
    // Use all of the bucket's bytes for the bloom filter, except for the bucket header.
    int byteSize = HASH_BUCKET_SIZE - BUCKET_HEADER_LENGTH;
    this.bloomFilter = new BloomFilter(avgNumRecordsPerBucket, byteSize);
    if (LOG.isDebugEnabled()) {
        // byteSize << 3 converts bytes to bits for the probability estimate.
        double fpp = BloomFilter.estimateFalsePositiveProbability(avgNumRecordsPerBucket, byteSize << 3);
        LOG.debug(String.format(
                "Created BloomFilter with average input entries per bucket [%d], byte size [%d], false positive probability [%f].",
                avgNumRecordsPerBucket, byteSize, fpp));
    }
}
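For intuition about the sizing, here is a minimal standalone sketch that plugs in illustrative constants. HASH_BUCKET_SIZE = 128 and BUCKET_HEADER_LENGTH = 16 are assumptions for the example (the real values come from the hash table's constants), and the optimal-k formula is assumed to match optimalNumOfHashFunctions in the source.

// Hedged sizing sketch, not the hash table's code. Assumes a 128-byte bucket
// with a 16-byte header, leaving 112 bytes (896 bits) for the filter.
public class BloomFilterSizingSketch {
    public static void main(String[] args) {
        int hashBucketSize = 128;      // assumed value of HASH_BUCKET_SIZE
        int bucketHeaderLength = 16;   // assumed value of BUCKET_HEADER_LENGTH
        int byteSize = hashBucketSize - bucketHeaderLength; // 112 bytes
        int bitSize = byteSize << 3;                        // 896 bits
        long entries = 20;                                  // illustrative records per bucket

        // Standard optimal-k formula k = round(m/n * ln 2), assumed to match
        // optimalNumOfHashFunctions.
        int k = Math.max(1, (int) Math.round((double) bitSize / entries * Math.log(2)));
        double p = Math.exp(-(double) k * entries / bitSize);
        double fpp = Math.pow(1 - p, k);
        System.out.printf("k=%d, fpp=%.2e%n", k, fpp); // k=31, fpp ~ 5e-10
    }
}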
/**
 * @return false if the filter has been asked to hold more elements than it was sized for,
 *         i.e. its expected accuracy can no longer be relied on.
 */
boolean addHash(int hash) {
    setLocation(hash);
    filter.addHash(hash);
    size++;
    return size <= maxSize;
}
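A typical caller reacts to a false return by abandoning the filter rather than trusting a degraded one. A hedged sketch of that pattern (the caller and releaseBloomFilter() are hypothetical, not from the source):

// Sketch of how the return value might be used during the build phase.
if (bloomFilter != null && !bloomFilter.addHash(hashCode)) {
    // Too many elements for the configured size: the false positive rate is no
    // longer bounded, so stop maintaining the filter rather than trust it.
    releaseBloomFilter(); // hypothetical helper that frees the filter's buffers
    bloomFilter = null;
}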
private void setLocation(int hash) {
    if (numBuffers > 1) {
        // numBuffers is a power of two, so (hash & numBuffersMask) selects one of
        // the buffers uniformly; with a single buffer the location never changes
        // from the one set in the constructor.
        filter.setBitsLocation(buffers[hash & numBuffersMask], 0);
    }
}
/**
 * Uses all of the bucket's memory except the bucket header as the bit set of the bloom
 * filter, and populates it with the hash codes of the build-side records.
 */
final void buildBloomFilterForBucket(int bucketInSegmentPos, MemorySegment bucket, HashPartition<BT, PT> p) {
    final int count = bucket.getShort(bucketInSegmentPos + HEADER_COUNT_OFFSET);
    if (count <= 0) {
        return;
    }

    int[] hashCodes = new int[count];
    // The hash codes and the bloom filter occupy the same bytes, so read all hash
    // codes out first, then overwrite that region with the bloom filter bits.
    for (int i = 0; i < count; i++) {
        hashCodes[i] = bucket.getInt(bucketInSegmentPos + BUCKET_HEADER_LENGTH + i * HASH_CODE_LEN);
    }
    this.bloomFilter.setBitsLocation(bucket, bucketInSegmentPos + BUCKET_HEADER_LENGTH);
    for (int hashCode : hashCodes) {
        this.bloomFilter.addHash(hashCode);
    }
    buildBloomFilterForExtraOverflowSegments(bucketInSegmentPos, bucket, p);
}
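The key trick is in-place reuse: once a partition is spilled, the per-record hash codes stored in the bucket are no longer needed for in-memory lookups, so the very same bytes can be recycled as the filter's bit set. A standalone toy illustration of that read-then-overwrite pattern (plain int[] instead of MemorySegment, and a single crude hash function, purely for demonstration):

// Toy illustration of read-then-overwrite; sizes and the hash are made up.
int[] region = {0x1234ABCD, 0x00C0FFEE, 0x0BADF00D}; // bytes holding hash codes
int[] hashCodes = region.clone();                     // 1) read the hash codes out first
java.util.Arrays.fill(region, 0);                     // 2) reinterpret the bytes as an empty bit set
for (int h : hashCodes) {
    int bit = (h >>> 1) % (region.length * 32);       // toy single-hash bit index
    region[bit / 32] |= 1 << (bit % 32);              // 3) write filter bits over the old bytes
}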
byte status = bucket.get(bucketInSegmentOffset + HEADER_STATUS_OFFSET);
if (status == BUCKET_STATUS_IN_FILTER) {
    // The bucket's partition was spilled and its payload area now holds bloom
    // filter bits; test the probe hash against the filter before buffering.
    this.bloomFilter.setBitsLocation(bucket, bucketInSegmentOffset + BUCKET_HEADER_LENGTH);
    if (this.bloomFilter.testHash(hash)) {
        p.insertIntoProbeBuffer(next);
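The fragment ends inside the positive branch, but the payoff is the branch it omits. A hedged reconstruction of the control flow around the test (inferred from the snippet, not the verbatim source):

if (this.bloomFilter.testHash(hash)) {
    // The hash may be in the spilled partition: keep the probe record so the
    // join can be finished from disk later.
    p.insertIntoProbeBuffer(next);
} else {
    // The filter proves no build-side record has this hash: the probe record
    // is dropped here, saving a disk-based lookup. (Branch inferred, not verbatim.)
}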
/**
 * Computes the expected false positive probability for the given number of input entries
 * and bit set size.
 *
 * <p>Note: this is only the mathematically expected value; the false positive rate actually
 * observed is not guaranteed to stay below the returned estimate.
 *
 * @param inputEntries number of expected distinct entries
 * @param bitSize size of the bloom filter's bit set, in bits
 * @return the estimated false positive probability
 */
public static double estimateFalsePositiveProbability(long inputEntries, int bitSize) {
    int numFunction = optimalNumOfHashFunctions(inputEntries, bitSize);
    double p = Math.pow(Math.E, -(double) numFunction * inputEntries / bitSize);
    return Math.pow(1 - p, numFunction);
}
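This is the standard bloom filter estimate: with n = inputEntries, m = bitSize, and k = numFunction hash functions, the probability that a given bit is still unset after all insertions is approximately e^(-kn/m), so fpp ~ (1 - e^(-kn/m))^k, which is exactly what the two lines above compute. As a worked example: with n = 100 entries in m = 896 bits and k = 6 (assuming optimalNumOfHashFunctions uses the standard k ~ (m/n) * ln 2), p ~ e^(-0.67) ~ 0.51 and fpp ~ 0.49^6 ~ 0.013, i.e. about a 1.3% false positive rate.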
HashTableBloomFilter(MemorySegment[] buffers, long numRecords) {
    checkArgument(buffers != null && buffers.length > 0);
    this.buffers = buffers;
    this.numBuffers = buffers.length;
    checkArgument(MathUtils.isPowerOf2(numBuffers));
    this.numBuffersMask = numBuffers - 1;
    int bufferSize = buffers[0].size();
    this.filter = new BloomFilter((int) (numRecords / numBuffers), bufferSize);
    filter.setBitsLocation(buffers[0], 0);

    // We assume a bloom filter can hold up to 2.44 elements per byte before its
    // false positive probability degrades to roughly 0.2.
    this.maxSize = (int) ((numBuffers * bufferSize) * 2.44);
}
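Where the 2.44 elements per byte comes from (a back-of-envelope check, not stated in the source): for an optimally configured filter, fpp = 2^(-(m/n) * ln 2), which inverts to n/m = (ln 2)^2 / (-ln fpp) elements per bit. At the target fpp of 0.2 that is about 0.30 elements per bit, roughly 2.4 elements per byte, matching the constant up to rounding.

// Back-of-envelope check of the 2.44 elements-per-byte rule (fpp target 0.2):
double targetFpp = 0.2;
double elementsPerBit = Math.pow(Math.log(2), 2) / -Math.log(targetFpp); // ~0.299
System.out.printf("%.2f elements per byte%n", 8 * elementsPerBit);       // ~2.39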
boolean testHash(int hash) {
    setLocation(hash);
    return filter.testHash(hash);
}
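End to end, the filter is populated with build-side hashes and consulted with probe-side hashes. A hedged usage sketch (segments, expectedBuildRecords, and the two hash streams are placeholders; only the constructor, addHash, and testHash correspond to the API shown above):

// Build then probe, as a sketch.
HashTableBloomFilter bf = new HashTableBloomFilter(segments, expectedBuildRecords);
for (int buildHash : buildSideHashes) {
    if (!bf.addHash(buildHash)) {
        bf = null; // overfull: accuracy no longer bounded, stop using the filter
        break;
    }
}
for (int probeHash : probeSideHashes) {
    if (bf != null && !bf.testHash(probeHash)) {
        continue; // definitely not on the build side: skip this probe record
    }
    // ... perform the actual (possibly spilled) lookup ...
}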