@Test public void testBloomFilterPredicateValuesExisting() BloomFilter bloomFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01); bloomFilter.addLong((Long) o); bloomFilter.addLong((Integer) o); bloomFilter.addString((String) o); bloomFilter.addString(o.toString()); bloomFilter.addString(((Slice) o).toStringUtf8()); bloomFilter.addLong(((Timestamp) o).getTime()); bloomFilter.addDouble((Double) o);
/**
 * Builds a HiveBloomFilter from an existing {@link BloomFilter}.
 *
 * <p>The hash-function count and bit size are taken from the source filter,
 * and the source's bit set is cloned before being wrapped, so this instance
 * does not share the source's backing array.
 *
 * @param bloomFilter the filter whose state is copied into this instance
 */
public HiveBloomFilter(BloomFilter bloomFilter) {
    this.numHashFunctions = bloomFilter.getNumHashFunctions();
    this.numBits = bloomFilter.getBitSize();
    // Clone so that later changes to the source's bit array are not visible here.
    this.bitSet = new BitSet(bloomFilter.getBitSet().clone());
}
@VisibleForTesting public static boolean checkInBloomFilter(BloomFilter bloomFilter, Object predicateValue, Type sqlType) { if (sqlType == TINYINT || sqlType == SMALLINT || sqlType == INTEGER || sqlType == BIGINT) { return bloomFilter.testLong(((Number) predicateValue).longValue()); } if (sqlType == DOUBLE) { return bloomFilter.testDouble((Double) predicateValue); } if (sqlType instanceof VarcharType || sqlType instanceof VarbinaryType) { return bloomFilter.test(((Slice) predicateValue).getBytes()); } // todo support DECIMAL, FLOAT, DATE, TIMESTAMP, and CHAR return true; }
/**
 * Creates a bloom filter populated with the given partition names.
 *
 * @param maxPartsPerCacheNode first constructor argument of the new filter
 * @param fpp second constructor argument of the new filter
 *            (presumably the target false-positive probability; confirm against BloomFilter)
 * @param partNames partition names to insert, each added as its byte encoding
 * @return the populated filter
 */
private BloomFilter createPartsBloomFilter(int maxPartsPerCacheNode, double fpp, List<String> partNames) {
    final BloomFilter partsFilter = new BloomFilter(maxPartsPerCacheNode, fpp);
    for (String name : partNames) {
        // NOTE(review): getBytes() uses the platform default charset; whichever code
        // probes this filter must encode names the same way. Confirm against callers.
        final byte[] encodedName = name.getBytes();
        partsFilter.add(encodedName);
    }
    return partsFilter;
}
throws Exception BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05); bloomFilterWrite.addString(TEST_STRING); assertTrue(bloomFilterWrite.testString(TEST_STRING)); bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet())); bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions()); assertFalse(bloomFilters.get(0).testString(TEST_STRING_NOT_WRITTEN)); assertEquals(bloomFilterWrite.getBitSize(), bloomFilters.get(0).getBitSize()); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions()); assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet())); assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet())); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions()); assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
/**
 * Creates the aggregation buffer and its backing bloom filter.
 *
 * <p>When {@code expectedEntries} does not exceed {@code maxEntries} the filter is
 * created for {@code expectedEntries}; otherwise a filter created with the
 * argument {@code 1} is used instead.
 *
 * @param expectedEntries number of entries the filter is expected to hold
 * @param maxEntries upper bound on entries for which a full-size filter is built
 */
public BloomFilterBuf(long expectedEntries, long maxEntries) {
    if (expectedEntries <= maxEntries) {
        bloomFilter = new BloomFilter(expectedEntries);
    } else {
        // NOTE(review): presumably a minimal placeholder so that an oversized
        // filter is never allocated; confirm the intent with the callers.
        bloomFilter = new BloomFilter(1);
    }
}
/**
 * Converts a {@link BloomFilter} into its {@code OrcProto.BloomFilter} message.
 *
 * <p>The message carries the filter's bit set (as a list of longs) and its
 * number of hash functions.
 *
 * @param bloomFilter the filter to convert
 * @return the built protobuf message
 */
private static OrcProto.BloomFilter toOrcBloomFilter(BloomFilter bloomFilter) {
    final OrcProto.BloomFilter.Builder protoBuilder = OrcProto.BloomFilter.newBuilder();
    protoBuilder
            .addAllBitset(Longs.asList(bloomFilter.getBitSet()))
            .setNumHashFunctions(bloomFilter.getNumHashFunctions());
    return protoBuilder.build();
}
/**
 * Probes the bloom filter with the timestamp value stored at the given row.
 *
 * @param columnVector the column batch; must be a {@code TimestampColumnVector}
 * @param idx index of the row within the vector
 * @return the result of {@code bloomFilter.testLong} for {@code time[idx]}
 */
@Override
public boolean checkValue(ColumnVector columnVector, int idx) {
    return bloomFilter.testLong(((TimestampColumnVector) columnVector).time[idx]);
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Adds the long value stored at row {@code i} to the aggregation's bloom filter.
 *
 * @param myagg the aggregation buffer whose {@code bf} filter receives the value
 * @param columnVector the column batch; must be a {@code LongColumnVector}
 * @param i index of the row within the vector
 */
@Override
protected void processValue(Aggregation myagg, ColumnVector columnVector, int i) {
    // The cast is done first so a wrong vector type fails before the buffer is touched.
    final LongColumnVector longColumn = (LongColumnVector) columnVector;
    myagg.bf.addLong(longColumn.vector[i]);
}
} // closes the enclosing type, whose header lies outside this excerpt
this.bloom1 = new BloomFilter(newKeyCount); } else { this.bloom1 = new BloomFilter(newKeyCount, fpp); newKeyCount, bloom1.sizeInBytes())); memoryUsed = bloom1.sizeInBytes();
byte[] bytes = new byte[bw.getLength()]; System.arraycopy(bw.getBytes(), 0, bytes, 0, bw.getLength()); bloomFilter = BloomFilter.deserialize(new ByteArrayInputStream(bytes)); } catch ( IOException e) { throw new HiveException(e); boolean vBoolean = ((BooleanObjectInspector)valObjectInspector). get(arguments[0].get()); return bloomFilter.testLong(vBoolean ? 1 : 0); case BYTE: byte vByte = ((ByteObjectInspector) valObjectInspector). get(arguments[0].get()); return bloomFilter.testLong(vByte); case SHORT: short vShort = ((ShortObjectInspector) valObjectInspector). get(arguments[0].get()); return bloomFilter.testLong(vShort); case INT: int vInt = ((IntObjectInspector) valObjectInspector). get(arguments[0].get()); return bloomFilter.testLong(vInt); case LONG: long vLong = ((LongObjectInspector) valObjectInspector). get(arguments[0].get()); return bloomFilter.testLong(vLong); case FLOAT: float vFloat = ((FloatObjectInspector) valObjectInspector). get(arguments[0].get()); return bloomFilter.testDouble(vFloat);
/**
 * Creates an aggregation buffer holding the serialized form of a new, empty
 * bloom filter created for {@code expectedEntries}.
 *
 * @param expectedEntries number of entries the filter is created for
 * @throws IllegalArgumentException if creating or serializing the filter fails;
 *         the original exception is attached as the cause
 */
public Aggregation(long expectedEntries) {
    try {
        final BloomFilter emptyFilter = new BloomFilter(expectedEntries);
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        BloomFilter.serialize(buffer, emptyFilter);
        bfBytes = buffer.toByteArray();
    } catch (Exception cause) {
        throw new IllegalArgumentException("Error creating aggregation buffer", cause);
    }
}
case BOOLEAN: boolean vBoolean = ((BooleanObjectInspector)inputOI).get(parameters[0]); bf.addLong(vBoolean ? 1 : 0); break; case BYTE: byte vByte = ((ByteObjectInspector)inputOI).get(parameters[0]); bf.addLong(vByte); break; case SHORT: short vShort = ((ShortObjectInspector)inputOI).get(parameters[0]); bf.addLong(vShort); break; case INT: int vInt = ((IntObjectInspector)inputOI).get(parameters[0]); bf.addLong(vInt); break; case LONG: long vLong = ((LongObjectInspector)inputOI).get(parameters[0]); bf.addLong(vLong); break; case FLOAT: float vFloat = ((FloatObjectInspector)inputOI).get(parameters[0]); bf.addDouble(vFloat); break; case DOUBLE: double vDouble = ((DoubleObjectInspector)inputOI).get(parameters[0]); bf.addDouble(vDouble); break; case DECIMAL:
/**
 * Returns the size of this filter in bytes.
 *
 * @return {@code getBitSize()} divided by the number of bits in a byte
 *         (integer division)
 */
public long sizeInBytes() {
    return getBitSize() / Byte.SIZE;
}
/**
 * Writes a bloom filter to an output stream.
 *
 * <p>Serialized layout, in order:
 * <ol>
 *   <li>1 byte: the number of hash functions;</li>
 *   <li>1 big-endian int: the number of longs in the bit set;</li>
 *   <li>the bit set itself, one big-endian long per element.</li>
 * </ol>
 * The stream is neither flushed nor closed by this method.
 *
 * @param out output stream to write to
 * @param bloomFilter BloomFilter that needs to be serialized
 * @throws IOException if writing to {@code out} fails
 */
public static void serialize(OutputStream out, BloomFilter bloomFilter) throws IOException {
    // DataOutputStream writes multi-byte values high byte first (big endian).
    final DataOutputStream dataOut = new DataOutputStream(out);
    dataOut.writeByte(bloomFilter.numHashFunctions);

    final long[] words = bloomFilter.getBitSet();
    dataOut.writeInt(words.length);
    for (int i = 0; i < words.length; i++) {
        dataOut.writeLong(words[i]);
    }
}
throws Exception BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05); bloomFilterWrite.addString(TEST_STRING); assertTrue(bloomFilterWrite.testString(TEST_STRING)); bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet())); bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions()); assertFalse(bloomFilters.get(0).testString(TEST_STRING_NOT_WRITTEN)); assertEquals(bloomFilterWrite.getBitSize(), bloomFilters.get(0).getBitSize()); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions()); assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet())); assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet())); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions()); assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
public Aggregation(long expectedEntries) { bf = new BloomFilter(expectedEntries); }