/**
 * Creates the buffer's bloom filter, capping the expected entry count at
 * the configured maximum so the filter never over-allocates.
 *
 * @param expectedEntries estimated number of distinct values to be added
 * @param maxEntries upper bound on the sizing parameter
 */
public BloomFilterBuf(long expectedEntries, long maxEntries) {
  long sizeHint = Math.min(expectedEntries, maxEntries);
  bloomFilter = new BloomKFilter(sizeHint);
}
/**
 * Adds the byte slice at row {@code i} of a bytes column to the
 * aggregation's bloom filter.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  // Caller guarantees a bytes column here; narrow to the concrete type.
  final BytesColumnVector bytesCol = (BytesColumnVector) columnVector;
  myagg.bf.addBytes(bytesCol.vector[i], bytesCol.start[i], bytesCol.length[i]);
}
}
/**
 * Adds the {@code time[]} component of the timestamp at row {@code i} to
 * the aggregation's bloom filter as a long.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  final TimestampColumnVector tsCol = (TimestampColumnVector) columnVector;
  myagg.bf.addLong(tsCol.time[i]);
}
}
/**
 * Creates a bloom filter sized for the given expected number of entries at
 * the default false-positive probability, with the backing long array padded
 * up to a whole number of probe blocks.
 *
 * @param maxNumEntries expected number of distinct entries; must be &gt; 0
 */
public BloomKFilter(long maxNumEntries) {
  checkArgument(maxNumEntries > 0, "expectedEntries should be > 0");
  // Classic sizing: derive the bit count from (n, fpp), then k from (n, bits).
  long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP);
  this.k = optimalNumOfHashFunctions(maxNumEntries, numBits);
  long nLongs = (long) Math.ceil((double) numBits / (double) Long.SIZE);
  // additional bits to pad long array to block size
  // NOTE(review): when nLongs is already block-aligned this adds one extra
  // full block of padding — presumably harmless waste, but confirm intended
  // (changing it would alter the serialized bitset length).
  long padLongs = DEFAULT_BLOCK_SIZE - nLongs % DEFAULT_BLOCK_SIZE;
  this.m = (nLongs + padLongs) * Long.SIZE;
  this.bitSet = new BitSet(m);
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}
/**
 * Creates an aggregation buffer holding the serialized bytes of an empty
 * BloomKFilter sized for {@code expectedEntries}.
 *
 * @param expectedEntries expected number of distinct values
 * @throws IllegalArgumentException if the filter cannot be built or serialized
 */
public Aggregation(long expectedEntries) {
  ByteArrayOutputStream byteStream = null;
  try {
    BloomKFilter filter = new BloomKFilter(expectedEntries);
    byteStream = new ByteArrayOutputStream();
    BloomKFilter.serialize(byteStream, filter);
    bfBytes = byteStream.toByteArray();
  } catch (Exception err) {
    throw new IllegalArgumentException("Error creating aggregation buffer", err);
  } finally {
    // Closing a ByteArrayOutputStream is a no-op, but keep the cleanup path
    // uniform with the rest of the file's stream handling.
    IOUtils.closeStream(byteStream);
  }
}
// NOTE(review): this span is a torn fragment — it mixes the tail of a bloom
// filter deserialization block with the middle of a value-testing switch.
// The `case BOOLEAN:` label, the switch header, and at least one closing
// brace after the throw appear to be missing from this view. Code untouched.
System.arraycopy(bw.getBytes(), 0, bytes, 0, bw.getLength());
in = new NonSyncByteArrayInputStream(bytes);
bloomFilter = BloomKFilter.deserialize(in);
} catch ( IOException e) {
throw new HiveException(e);
// Each case unwraps the primitive via its ObjectInspector and probes the
// bloom filter; all integral types funnel through testLong, float through
// testDouble (widened).
boolean vBoolean = ((BooleanObjectInspector)valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vBoolean ? 1 : 0);
case BYTE:
byte vByte = ((ByteObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vByte);
case SHORT:
short vShort = ((ShortObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vShort);
case INT:
int vInt = ((IntObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vInt);
case LONG:
long vLong = ((LongObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testLong(vLong);
case FLOAT:
float vFloat = ((FloatObjectInspector) valObjectInspector). get(arguments[0].get());
return bloomFilter.testDouble(vFloat);
/** Probes the bloom filter with the byte slice at row {@code idx}. */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final BytesColumnVector bytesCol = (BytesColumnVector) columnVector;
  byte[] buf = bytesCol.vector[idx];
  return bloomFilter.testBytes(buf, bytesCol.start[idx], bytesCol.length[idx]);
}
}
// NOTE(review): torn fragment — the enclosing switch header precedes this
// view and the span ends on a dangling `case DECIMAL:` whose body is missing.
// Code untouched. Each case unwraps the primitive via its ObjectInspector
// and feeds the bloom filter; integral types go through addLong, float and
// double through addDouble.
case BOOLEAN:
boolean vBoolean = ((BooleanObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vBoolean ? 1 : 0);
break;
case BYTE:
byte vByte = ((ByteObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vByte);
break;
case SHORT:
short vShort = ((ShortObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vShort);
break;
case INT:
int vInt = ((IntObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vInt);
break;
case LONG:
long vLong = ((LongObjectInspector)inputOI).get(parameters[0]);
bf.addLong(vLong);
break;
case FLOAT:
float vFloat = ((FloatObjectInspector)inputOI).get(parameters[0]);
bf.addDouble(vFloat);
break;
case DOUBLE:
double vDouble = ((DoubleObjectInspector)inputOI).get(parameters[0]);
bf.addDouble(vDouble);
break;
case DECIMAL:
/** Probes the bloom filter with the long value at row {@code idx}. */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final LongColumnVector longCol = (LongColumnVector) columnVector;
  long value = longCol.vector[idx];
  return bloomFilter.testLong(value);
}
}
/**
 * Serializes a BloomKFilter to the given stream.
 *
 * Wire format:
 *   1 byte           — number of hash functions (k)
 *   1 big-endian int — number of longs in the bitset
 *   big-endian longs — the bitset words themselves
 * (DataOutputStream writes multi-byte values big-endian, which fixes the
 * byte order above.)
 *
 * @param out output stream to write to
 * @param bloomFilter BloomKFilter that needs to be serialized
 * @throws IOException if the underlying stream fails
 */
public static void serialize(OutputStream out, BloomKFilter bloomFilter) throws IOException {
  DataOutputStream dos = new DataOutputStream(out);
  dos.writeByte(bloomFilter.k);
  dos.writeInt(bloomFilter.getBitSet().length);
  for (long word : bloomFilter.getBitSet()) {
    dos.writeLong(word);
  }
}
public void addInt(int val) { // puts int in little endian order addBytes(intToByteArrayLE(val)); }
/**
 * Tests whether an int may have been added, using the same little-endian
 * byte encoding as {@code addInt}.
 */
public boolean testInt(int val) {
  byte[] littleEndian = intToByteArrayLE(val);
  return testBytes(littleEndian);
}
public void addBytes(byte[] val, int offset, int length) { // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively // implement a Bloom filter without any loss in the asymptotic false positive probability' // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned // in the above paper long hash64 = val == null ? Murmur3.NULL_HASHCODE : Murmur3.hash64(val, offset, length); addHash(hash64); }
/**
 * A constructor to support rebuilding the BloomFilter from a serialized representation.
 * @param bits BloomK sketch data in form of array of longs.
 * @param numFuncs Number of functions called as K.
 */
public BloomKFilter(long[] bits, int numFuncs) {
  super();
  bitSet = new BitSet(bits);
  // NOTE(review): bits.length * Long.SIZE is an int multiply — presumably
  // fine for realistic filter sizes, but confirm m cannot overflow.
  this.m = bits.length * Long.SIZE;
  this.k = numFuncs;
  // Block alignment is required so the blocked probe layout stays valid.
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}

// NOTE(review): the following declaration continues past this view.
static int optimalNumOfHashFunctions(long n, long m) {
/** Adds a float by hashing the IEEE-754 bit pattern of the value. */
public void addFloat(float val) {
  int bitPattern = Float.floatToIntBits(val);
  addInt(bitPattern);
}
/**
 * Adds the double value at row {@code i} to the aggregation's bloom filter.
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
  final DoubleColumnVector doubleCol = (DoubleColumnVector) columnVector;
  myagg.bf.addDouble(doubleCol.vector[i]);
}
}
/**
 * Tests whether the whole byte array may have been added to the filter.
 * Delegates to the slice overload over the full array range.
 */
public boolean testBytes(byte[] val) {
  int fullLength = val.length;
  return testBytes(val, 0, fullLength);
}
/**
 * Creates a bloom filter sized for the given expected number of entries at
 * the default false-positive probability, with the backing long array padded
 * up to a whole number of probe blocks.
 *
 * @param maxNumEntries expected number of distinct entries; must be &gt; 0
 */
public BloomKFilter(long maxNumEntries) {
  checkArgument(maxNumEntries > 0, "expectedEntries should be > 0");
  // Classic sizing: derive the bit count from (n, fpp), then k from (n, bits).
  long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP);
  this.k = optimalNumOfHashFunctions(maxNumEntries, numBits);
  int nLongs = (int) Math.ceil((double) numBits / (double) Long.SIZE);
  // additional bits to pad long array to block size
  // NOTE(review): when nLongs is already block-aligned this adds one extra
  // full block of padding — presumably harmless waste, but confirm intended
  // (changing it would alter the serialized bitset length).
  int padLongs = DEFAULT_BLOCK_SIZE - nLongs % DEFAULT_BLOCK_SIZE;
  this.m = (nLongs + padLongs) * Long.SIZE;
  this.bitSet = new BitSet(m);
  checkArgument((bitSet.data.length % DEFAULT_BLOCK_SIZE) == 0, "bitSet has to be block aligned");
  this.totalBlockCount = bitSet.data.length / DEFAULT_BLOCK_SIZE;
}
/**
 * Probes the bloom filter with the {@code time[]} component of the
 * timestamp at row {@code idx}.
 */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
  final TimestampColumnVector tsCol = (TimestampColumnVector) columnVector;
  long timeValue = tsCol.time[idx];
  return bloomFilter.testLong(timeValue);
}
}
@Override public long getAggregationBufferFixedSize() { if (bitSetSize < 0) { // Not pretty, but we need a way to get the size try { Aggregation agg = (Aggregation) getNewAggregationBuffer(); bitSetSize = agg.bf.getBitSet().length; } catch (Exception e) { throw new RuntimeException("Unexpected error while creating AggregationBuffer", e); } } // BloomFilter: object(BitSet: object(data: long[]), numBits: int, numHashFunctions: int) JavaDataModel model = JavaDataModel.get(); long bloomFilterSize = JavaDataModel.alignUp(model.object() + model.lengthForLongArrayOfSize(bitSetSize), model.memoryAlign()); return JavaDataModel.alignUp( model.object() + bloomFilterSize + model.primitive1() + model.primitive1(), model.memoryAlign()); }