public static HiveKey copyHiveKey(HiveKey key) {
  HiveKey copy = new HiveKey();
  copy.setDistKeyLength(key.getDistKeyLength());
  copy.setHashCode(key.hashCode());
  copy.set(key);
  return copy;
}
private void writeHiveKey(Output output, HiveKey hiveKey) {
  int size = hiveKey.getLength();
  output.writeInt(size);
  output.writeBytes(hiveKey.getBytes(), 0, size);
  output.writeInt(hiveKey.hashCode());
  output.writeInt(hiveKey.getDistKeyLength());
}
protected void initializeEmptyKey(int tag) {
  // Use the same logic as ReduceSinkOperator.toHiveKey.
  if (tag == -1 || reduceSkipTag) {
    keyWritable.setSize(0);
  } else {
    keyWritable.setSize(1);
    keyWritable.get()[0] = reduceTagByte;
  }
  keyWritable.setDistKeyLength(0);
  keyWritable.setHashCode(0);
}
/**
 * After a vectorized batch is processed, returns the key that caused a particular row
 * to be forwarded. Because a row can only be marked for forwarding when its key matches
 * the key of some row already in the heap (for GBY), we can reuse that key from the
 * heap to emit the forwarded row.
 * @param batchIndex index of the key in the batch.
 * @return The key corresponding to the index.
 */
public HiveKey getVectorizedKeyToForward(int batchIndex) {
  int index = MAY_FORWARD - batchIndexToResult[batchIndex];
  HiveKey hk = new HiveKey();
  hk.set(keys[index], 0, keys[index].length);
  hk.setHashCode(hashes[index]);
  hk.setDistKeyLength(distKeyLengths[index]);
  return hk;
}
public static int getHiveBucket(List<Entry<ObjectInspector, Object>> columnBindings, int bucketCount)
    throws HiveException {
  GenericUDFHash udf = new GenericUDFHash();
  ObjectInspector[] objectInspectors = new ObjectInspector[columnBindings.size()];
  GenericUDF.DeferredObject[] deferredObjects = new GenericUDF.DeferredObject[columnBindings.size()];

  int i = 0;
  for (Entry<ObjectInspector, Object> entry : columnBindings) {
    objectInspectors[i] = entry.getKey();
    if (entry.getValue() != null && entry.getKey() instanceof JavaHiveVarcharObjectInspector) {
      JavaHiveVarcharObjectInspector varcharObjectInspector =
          (JavaHiveVarcharObjectInspector) entry.getKey();
      deferredObjects[i] = new GenericUDF.DeferredJavaObject(
          new HiveVarchar((String) entry.getValue(), varcharObjectInspector.getMaxLength()));
    } else {
      deferredObjects[i] = new GenericUDF.DeferredJavaObject(entry.getValue());
    }
    i++;
  }

  ObjectInspector udfInspector = udf.initialize(objectInspectors);
  IntObjectInspector inspector = (IntObjectInspector) udfInspector;
  Object result = udf.evaluate(deferredObjects);

  HiveKey hiveKey = new HiveKey();
  hiveKey.setHashCode(inspector.get(result));
  return new DefaultHivePartitioner<>().getBucket(hiveKey, null, bucketCount);
}
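/*
 * A minimal usage sketch for getHiveBucket above. The binding value, the bucket count, and the
 * helper's placement alongside getHiveBucket in the same class are illustrative assumptions, not
 * taken from the original source. Assumes java.util.AbstractMap, java.util.ArrayList,
 * java.util.List, java.util.Map.Entry and
 * org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.
 */
private static int exampleBucketLookup() throws HiveException {
  List<Entry<ObjectInspector, Object>> bindings = new ArrayList<>();
  // Bind one string column to the value "order-42"; a plain string skips the varchar branch above.
  bindings.add(new AbstractMap.SimpleEntry<ObjectInspector, Object>(
      PrimitiveObjectInspectorFactory.javaStringObjectInspector, "order-42"));
  // Ask which of 32 buckets (an arbitrary example count) the value hashes into.
  return getHiveBucket(bindings, 32);
}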
@Override
public void collect(byte[] key, byte[] value, int hash) throws IOException {
  HiveKey keyWritable = new HiveKey(key, hash);
  BytesWritable valueWritable = new BytesWritable(value);
  collect(keyWritable, valueWritable);
}
private HiveKey readHiveKey(Input input) {
  // Read the fields in the order written by writeHiveKey: length, key bytes, hash code,
  // then the distribution-key length. Java evaluates constructor arguments left to right,
  // so the length-prefixed bytes are consumed before the hash code.
  HiveKey hiveKey = new HiveKey(input.readBytes(input.readInt()), input.readInt());
  hiveKey.setDistKeyLength(input.readInt());
  return hiveKey;
}
int distKeyLength = firstKey.getDistKeyLength();
if (numDistinctExprs > 0) {
  // Populate the distinct columns for the first key, compute its hash, and store it in the heap.
  populateCachedDistinctKeys(row, 0);
  hashCode = computeHashCode(row, bucketNumber);
  firstKey.setHashCode(hashCode);
  reducerHash.storeValue(firstIndex, firstKey.hashCode(), value, false);
  // The remaining distinct keys share the same hash code and are collected directly.
  for (int i = 1; i < numDistinctExprs; i++) {
    populateCachedDistinctKeys(row, i);
    HiveKey hiveKey = toHiveKey(cachedKeys[i], tag, distKeyLength);
    hiveKey.setHashCode(hashCode);
    collect(hiveKey, value);
  }
}
@Override
public void write(Writable r) throws IOException {
  if (keyIsText) {
    Text text = (Text) r;
    keyWritable.set(text.getBytes(), 0, text.getLength());
  } else {
    BytesWritable bw = (BytesWritable) r;
    // Once we drop support for old Hadoop versions, change these
    // to getBytes() and getLength() to fix the deprecation warnings.
    // Not worth a shim.
    keyWritable.set(bw.get(), 0, bw.getSize());
  }
  keyWritable.setHashCode(r.hashCode());
  outStream.append(keyWritable, NULL_WRITABLE);
}
private int findWriterOffset(Object row) throws HiveException {
  if (!multiFileSpray) {
    return 0;
  } else {
    Object[] bucketFieldValues = new Object[partitionEval.length];
    for (int i = 0; i < partitionEval.length; i++) {
      bucketFieldValues[i] = partitionEval[i].evaluate(row);
    }
    int keyHashCode = ObjectInspectorUtils.getBucketHashCode(bucketFieldValues, partitionObjectInspectors);
    key.setHashCode(keyHashCode);
    int bucketNum = prtner.getBucket(key, null, totalFiles);
    return bucketMap.get(bucketNum);
  }
}
/** Helper method which inserts numRecords records, retrieves them from the cache, and verifies them. */
private void testSpillingHelper(HiveKVResultCache cache, int numRecords) {
  for (int i = 0; i < numRecords; i++) {
    String key = "key_" + i;
    String value = "value_" + i;
    cache.add(new HiveKey(key.getBytes(), key.hashCode()), new BytesWritable(value.getBytes()));
  }

  int recordsSeen = 0;
  while (cache.hasNext()) {
    String key = "key_" + recordsSeen;
    String value = "value_" + recordsSeen;
    Tuple2<HiveKey, BytesWritable> row = cache.next();
    assertTrue("Unexpected key at position: " + recordsSeen,
        new String(row._1().getBytes()).equals(key));
    assertTrue("Unexpected value at position: " + recordsSeen,
        new String(row._2().getBytes()).equals(value));
    recordsSeen++;
  }

  assertTrue("Retrieved record count doesn't match inserted record count",
      numRecords == recordsSeen);
  cache.clear();
}
private void writeHiveKey(Output output, HiveKey hiveKey) {
  int size = hiveKey.getLength();
  output.writeInt(size);
  output.writeBytes(hiveKey.getBytes(), 0, size);
  output.writeInt(0); // Since hashCode is not used, just put an arbitrary number
  output.writeInt(hiveKey.getDistKeyLength());
}
@Test
public void testSimple() throws Exception {
  // Create KV result cache object, add one (k,v) pair and retrieve them.
  HiveKVResultCache cache = new HiveKVResultCache();

  HiveKey key = new HiveKey("key".getBytes(), "key".hashCode());
  BytesWritable value = new BytesWritable("value".getBytes());
  cache.add(key, value);

  assertTrue("KV result cache should have at least one element", cache.hasNext());

  Tuple2<HiveKey, BytesWritable> row = cache.next();
  assertTrue("Incorrect key", row._1().equals(key));
  assertTrue("Incorrect value", row._2().equals(value));

  assertTrue("Cache shouldn't have more records", !cache.hasNext());
}
protected final int computeMurmurHash(HiveKey firstKey) {
  // Only the distribution-key prefix of the serialized key feeds the hash, so rows that
  // differ only in the remaining (e.g. distinct) columns hash to the same value.
  return Murmur3.hash32(firstKey.getBytes(), firstKey.getDistKeyLength(), 0);
}
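/*
 * Hedged illustration (not from the original source) of why computeMurmurHash hashes only the
 * first getDistKeyLength() bytes: two serialized keys that agree on the distribution-key prefix
 * but differ afterwards produce the same hash, so they are routed to the same partition/reducer.
 * Assumes the same Murmur3 utility used above and java.nio.charset.StandardCharsets; the example
 * byte strings and prefix length are illustrative.
 */
byte[] a = "userA|2021".getBytes(StandardCharsets.UTF_8);
byte[] b = "userA|2022".getBytes(StandardCharsets.UTF_8);
int distKeyLength = 5; // hypothetical prefix covering only "userA"
assert Murmur3.hash32(a, distKeyLength, 0) == Murmur3.hash32(b, distKeyLength, 0);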
public void tryStoreVectorizedKey(HiveKey key, boolean partColsIsNull, int batchIndex)
    throws HiveException, IOException {
  _tryStoreKey(key, partColsIsNull, batchIndex);
  if (partColsIsNull) {
    indexesWithNullPartKey.add(batchIndex);
  }
  batchIndexToResult[batchIndex] = key.hashCode();
}
/**
 * Try to store the non-vectorized key.
 * @param key Serialized key.
 * @param partColsIsNull Whether the row's partition columns are null.
 * @return TopNHash.FORWARD if the row should be forwarded;
 *         TopNHash.EXCLUDE if the row should be discarded;
 *         any other number if the row is to be stored; that index should be passed to storeValue.
 */
public int tryStoreKey(HiveKey key, boolean partColsIsNull) throws HiveException, IOException {
  if (!isEnabled) {
    return FORWARD; // short-circuit quickly - forward all rows
  }
  if (topN == 0) {
    return EXCLUDE; // short-circuit quickly - eat all rows
  }
  int index = insertKeyIntoHeap(key);
  if (index >= 0) {
    usage += key.getLength();
    return index;
  }
  // IndexStore is trying to tell us something.
  switch (index) {
    case FORWARD:
      return FORWARD;
    case EXCLUDE:
      return EXCLUDE; // skip the row.
    default: {
      assert false;
      throw new HiveException("Invalid result trying to store the key: " + index);
    }
  }
}
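/*
 * A hedged caller-side sketch of the contract documented above (variable names are illustrative;
 * the storeValue call mirrors the ReduceSinkOperator fragment earlier in this section): forward
 * on FORWARD, drop on EXCLUDE, otherwise store the value under the returned heap index.
 */
int index = reducerHash.tryStoreKey(firstKey, false);
if (index == TopNHash.FORWARD) {
  collect(firstKey, value); // not constrained by the top-N heap, emit immediately
} else if (index != TopNHash.EXCLUDE) {
  reducerHash.storeValue(index, firstKey.hashCode(), value, false); // kept in the heap for later
}
// index == TopNHash.EXCLUDE: the row falls outside the top N and is dropped.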
public static Optional<HiveBucket> getHiveBucket(List<Entry<ObjectInspector, Object>> columnBindings, int bucketCount) {
  try {
    @SuppressWarnings("resource")
    GenericUDFHash udf = new GenericUDFHash();
    ObjectInspector[] objectInspectors = new ObjectInspector[columnBindings.size()];
    DeferredObject[] deferredObjects = new DeferredObject[columnBindings.size()];

    int i = 0;
    for (Entry<ObjectInspector, Object> entry : columnBindings) {
      objectInspectors[i] = getJavaObjectInspector(entry.getKey());
      deferredObjects[i] = getJavaDeferredObject(entry.getValue(), entry.getKey());
      i++;
    }

    ObjectInspector udfInspector = udf.initialize(objectInspectors);
    IntObjectInspector inspector = checkType(udfInspector, IntObjectInspector.class, "udfInspector");
    Object result = udf.evaluate(deferredObjects);

    HiveKey hiveKey = new HiveKey();
    hiveKey.setHashCode(inspector.get(result));
    int bucketNumber = new DefaultHivePartitioner<>().getBucket(hiveKey, null, bucketCount);

    return Optional.of(new HiveBucket(bucketNumber, bucketCount));
  }
  catch (HiveException e) {
    log.debug(e, "Error evaluating bucket number");
    return Optional.empty();
  }
}