/**
 * Creates the buffer's bloom filter, capping the expected entry count at
 * the configured maximum so the filter never over-allocates.
 *
 * @param expectedEntries estimated number of distinct values to be added
 * @param maxEntries upper bound on the sizing parameter
 */
public BloomFilterBuf(long expectedEntries, long maxEntries) {
  long sizeHint = Math.min(expectedEntries, maxEntries);
  bloomFilter = new BloomKFilter(sizeHint);
}
/**
 * Adds the byte slice at row {@code i} of a bytes column to the
 * aggregation's bloom filter.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  // Caller guarantees a bytes column here; narrow to the concrete type.
  final BytesColumnVector bytesCol = (BytesColumnVector) columnVector;
  myagg.bf.addBytes(bytesCol.vector[i], bytesCol.start[i], bytesCol.length[i]);
}
}
/**
 * Adds the {@code time[]} component of the timestamp at row {@code i} to
 * the aggregation's bloom filter as a long.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  final TimestampColumnVector tsCol = (TimestampColumnVector) columnVector;
  myagg.bf.addLong(tsCol.time[i]);
}
}
/**
 * Creates a bloom filter sized for the given expected number of entries at
 * the default false-positive probability, with the backing long array padded
 * up to a whole number of probe blocks.
 *
 * @param maxNumEntries expected number of distinct entries; must be &gt; 0
 */
public BloomKFilter(long maxNumEntries) {
  checkArgument(maxNumEntries > 0, "expectedEntries should be > 0");
  // Classic sizing: derive the bit count from (n, fpp), then k from (n, bits).
  long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP);
  this.k = optimalNumOfHashFunctions(maxNumEntries, numBits);
  long nLongs = (long) Math.ceil((double) numBits / (double) Long.SIZE);
  // additional bits to pad long array to block size
  // NOTE(review): when nLongs is already block-aligned this adds one extra
  // full block of padding — presumably harmless waste, but confirm intended
  // (changing it would alter the serialized bitset length).
  long padLongs = DEFAULT_BLOCK_SIZE - nLongs % DEFAULT_BLOCK_SIZE;
  this.m = (nLongs + padLongs) * Long.SIZE;
  this.bitSet = new BitSet(m);
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}
/**
 * Creates an aggregation buffer holding the serialized bytes of an empty
 * BloomKFilter sized for {@code expectedEntries}.
 *
 * @param expectedEntries expected number of distinct values
 * @throws IllegalArgumentException if the filter cannot be built or serialized
 */
public Aggregation(long expectedEntries) {
  ByteArrayOutputStream byteStream = null;
  try {
    BloomKFilter filter = new BloomKFilter(expectedEntries);
    byteStream = new ByteArrayOutputStream();
    BloomKFilter.serialize(byteStream, filter);
    bfBytes = byteStream.toByteArray();
  } catch (Exception err) {
    throw new IllegalArgumentException("Error creating aggregation buffer", err);
  } finally {
    // Closing a ByteArrayOutputStream is a no-op, but keep the cleanup path
    // uniform with the rest of the file's stream handling.
    IOUtils.closeStream(byteStream);
  }
}
// NOTE(review): this span is a torn fragment — it mixes the tail of a bloom
// filter deserialization block with the middle of a value-testing switch.
// The `case BOOLEAN:` label, the switch header, and at least one closing
// brace after the throw appear to be missing from this view. Code untouched.
System.arraycopy(bw.getBytes(), 0, bytes, 0, bw.getLength());
in = new NonSyncByteArrayInputStream(bytes);
bloomFilter = BloomKFilter.deserialize(in);
} catch ( IOException e) {
throw new HiveException(e);
// Each case unwraps the primitive via its ObjectInspector and probes the
// bloom filter; all integral types funnel through testLong, float through
// testDouble (widened).
boolean vBoolean = ((BooleanObjectInspector)valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vBoolean ? 1 : 0);
case BYTE:
byte vByte = ((ByteObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vByte);
case SHORT:
short vShort = ((ShortObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vShort);
case INT:
int vInt = ((IntObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vInt);
case LONG:
long vLong = ((LongObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vLong);
case FLOAT:
float vFloat = ((FloatObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testDouble(vFloat);
/** Probes the bloom filter with the byte slice at row {@code idx}. */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final BytesColumnVector bytesCol = (BytesColumnVector) columnVector;
  byte[] buf = bytesCol.vector[idx];
  return bloomFilter.testBytes(buf, bytesCol.start[idx], bytesCol.length[idx]);
}
}
// NOTE(review): torn fragment — the enclosing switch header precedes this
// view and the span ends on a dangling `case DECIMAL:` whose body is missing.
// Code untouched. Each case unwraps the primitive via its ObjectInspector
// and feeds the bloom filter; integral types go through addLong, float and
// double through addDouble.
case BOOLEAN:
boolean vBoolean = ((BooleanObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vBoolean ? 1 : 0);
break;
case BYTE:
byte vByte = ((ByteObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vByte);
break;
case SHORT:
short vShort = ((ShortObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vShort);
break;
case INT:
int vInt = ((IntObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vInt);
break;
case LONG:
long vLong = ((LongObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vLong);
break;
case FLOAT:
float vFloat = ((FloatObjectInspector)inputOI).get(parameters[0]);
bf.addDouble(vFloat);
break;
case DOUBLE:
double vDouble = ((DoubleObjectInspector)inputOI).get(parameters[0]);
bf.addDouble(vDouble);
break;
case DECIMAL:
/** Probes the bloom filter with the long value at row {@code idx}. */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final LongColumnVector longCol = (LongColumnVector) columnVector;
  long value = longCol.vector[idx];
  return bloomFilter.testLong(value);
}
}
/**
 * Serializes a BloomKFilter to the given stream.
 *
 * Wire format:
 *   1 byte           — number of hash functions (k)
 *   1 big-endian int — number of longs in the bitset
 *   big-endian longs — the bitset words themselves
 * (DataOutputStream writes multi-byte values big-endian, which fixes the
 * byte order above.)
 *
 * @param out output stream to write to
 * @param bloomFilter BloomKFilter that needs to be serialized
 * @throws IOException if the underlying stream fails
 */
public static void serialize(OutputStream out, BloomKFilter bloomFilter) throws IOException {
  DataOutputStream dos = new DataOutputStream(out);
  dos.writeByte(bloomFilter.k);
  dos.writeInt(bloomFilter.getBitSet().length);
  for (long word : bloomFilter.getBitSet()) {
    dos.writeLong(word);
  }
}
public void addInt(int val) { // puts int in little endian order addBytes(intToByteArrayLE(val)); }
/**
 * Tests whether an int may have been added, using the same little-endian
 * byte encoding as {@code addInt}.
 */
public boolean testInt(int val) {
  byte[] littleEndian = intToByteArrayLE(val);
  return testBytes(littleEndian);
}
public void addBytes(byte[] val, int offset, int length) { // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively // implement a Bloom filter without any loss in the asymptotic false positive probability' // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned // in the above paper long hash64 = val == null ? Murmur3.NULL_HASHCODE : Murmur3.hash64(val, offset, length); addHash(hash64); }
/**
 * A constructor to support rebuilding the BloomFilter from a serialized representation.
 * @param bits BloomK sketch data in form of array of longs.
 * @param numFuncs Number of functions called as K.
 */
public BloomKFilter(long[] bits, int numFuncs) {
  super();
  bitSet = new BitSet(bits);
  // NOTE(review): bits.length * Long.SIZE is an int multiply — presumably
  // fine for realistic filter sizes, but confirm m cannot overflow.
  this.m = bits.length * Long.SIZE;
  this.k = numFuncs;
  // Block alignment is required so the blocked probe layout stays valid.
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}

// NOTE(review): the following declaration continues past this view.
static int optimalNumOfHashFunctions(long n, long m) {
/** Adds a float by hashing the IEEE-754 bit pattern of the value. */
public void addFloat(float val) {
  int bitPattern = Float.floatToIntBits(val);
  addInt(bitPattern);
}
/**
 * Adds the double value at row {@code i} to the aggregation's bloom filter.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  final DoubleColumnVector doubleCol = (DoubleColumnVector) columnVector;
  myagg.bf.addDouble(doubleCol.vector[i]);
}
}
/**
 * Tests whether the whole byte array may have been added to the filter.
 * Delegates to the slice overload over the full array range.
 */
public boolean testBytes(byte[] val) {
  int fullLength = val.length;
  return testBytes(val, 0, fullLength);
}
/**
 * Creates a bloom filter sized for the given expected number of entries at
 * the default false-positive probability, with the backing long array padded
 * up to a whole number of probe blocks.
 *
 * @param maxNumEntries expected number of distinct entries; must be &gt; 0
 */
public BloomKFilter(long maxNumEntries) {
  checkArgument(maxNumEntries > 0, "expectedEntries should be > 0");
  // Classic sizing: derive the bit count from (n, fpp), then k from (n, bits).
  long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP);
  this.k = optimalNumOfHashFunctions(maxNumEntries, numBits);
  int nLongs = (int) Math.ceil((double) numBits / (double) Long.SIZE);
  // additional bits to pad long array to block size
  // NOTE(review): when nLongs is already block-aligned this adds one extra
  // full block of padding — presumably harmless waste, but confirm intended
  // (changing it would alter the serialized bitset length).
  int padLongs = DEFAULT_BLOCK_SIZE - nLongs % DEFAULT_BLOCK_SIZE;
  this.m = (nLongs + padLongs) * Long.SIZE;
  this.bitSet = new BitSet(m);
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}
/**
 * Probes the bloom filter with the {@code time[]} component of the
 * timestamp at row {@code idx}.
 */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final TimestampColumnVector tsCol = (TimestampColumnVector) columnVector;
  long timeValue = tsCol.time[idx];
  return bloomFilter.testLong(timeValue);
}
}
@Override public long getAggregationBufferFixedSize() { if (bitSetSize < 0) { // Not pretty, but we need a way to get the size try { Aggregation agg = (Aggregation) getNewAggregationBuffer(); bitSetSize = agg.bf.getBitSet().length; } catch (Exception e) { throw new RuntimeException("Unexpected error while creating AggregationBuffer", e); } } // BloomFilter: object(BitSet: object(data: long[]), numBits: int, numHashFunctions: int) JavaDataModel model = JavaDataModel.get(); long bloomFilterSize = JavaDataModel.alignUp(model.object() + model.lengthForLongArrayOfSize(bitSetSize), model.memoryAlign()); return JavaDataModel.alignUp( model.object() + bloomFilterSize + model.primitive1() + model.primitive1(), model.memoryAlign()); }