public static HiveKey copyHiveKey(HiveKey key) {
  HiveKey copy = new HiveKey();
  copy.setDistKeyLength(key.getDistKeyLength());
  copy.setHashCode(key.hashCode());
  copy.set(key);
  return copy;
}
private void writeHiveKey(Output output, HiveKey hiveKey) {
  int size = hiveKey.getLength();
  output.writeInt(size);
  output.writeBytes(hiveKey.getBytes(), 0, size);
  output.writeInt(hiveKey.hashCode());
  output.writeInt(hiveKey.getDistKeyLength());
}
protected void initializeEmptyKey(int tag) {
  // Use the same logic as ReduceSinkOperator.toHiveKey.
  if (tag == -1 || reduceSkipTag) {
    keyWritable.setSize(0);
  } else {
    keyWritable.setSize(1);
    keyWritable.get()[0] = reduceTagByte;
  }
  keyWritable.setDistKeyLength(0);
  keyWritable.setHashCode(0);
}
/**
 * After a vectorized batch is processed, returns the key that caused a particular row
 * to be forwarded. Because a row can only be marked for forwarding when its key matches
 * the key of some row already in the heap (for GBY), we can reuse that key from the
 * heap to emit the forwarded row.
 * @param batchIndex index of the key in the batch.
 * @return The key corresponding to the index.
 */
public HiveKey getVectorizedKeyToForward(int batchIndex) {
  int index = MAY_FORWARD - batchIndexToResult[batchIndex];
  HiveKey hk = new HiveKey();
  hk.set(keys[index], 0, keys[index].length);
  hk.setHashCode(hashes[index]);
  hk.setDistKeyLength(distKeyLengths[index]);
  return hk;
}
public static int getHiveBucket(List<Entry<ObjectInspector, Object>> columnBindings, int bucketCount)
    throws HiveException {
  GenericUDFHash udf = new GenericUDFHash();
  ObjectInspector[] objectInspectors = new ObjectInspector[columnBindings.size()];
  GenericUDF.DeferredObject[] deferredObjects = new GenericUDF.DeferredObject[columnBindings.size()];

  int i = 0;
  for (Entry<ObjectInspector, Object> entry : columnBindings) {
    objectInspectors[i] = entry.getKey();
    if (entry.getValue() != null && entry.getKey() instanceof JavaHiveVarcharObjectInspector) {
      JavaHiveVarcharObjectInspector varcharObjectInspector =
          (JavaHiveVarcharObjectInspector) entry.getKey();
      deferredObjects[i] = new GenericUDF.DeferredJavaObject(
          new HiveVarchar((String) entry.getValue(), varcharObjectInspector.getMaxLength()));
    } else {
      deferredObjects[i] = new GenericUDF.DeferredJavaObject(entry.getValue());
    }
    i++;
  }

  ObjectInspector udfInspector = udf.initialize(objectInspectors);
  IntObjectInspector inspector = (IntObjectInspector) udfInspector;
  Object result = udf.evaluate(deferredObjects);

  HiveKey hiveKey = new HiveKey();
  hiveKey.setHashCode(inspector.get(result));
  return new DefaultHivePartitioner<>().getBucket(hiveKey, null, bucketCount);
}
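/*
 * A minimal usage sketch for getHiveBucket above. The binding value, the bucket count, and the
 * helper's placement alongside getHiveBucket in the same class are illustrative assumptions, not
 * taken from the original source. Assumes java.util.AbstractMap, java.util.ArrayList,
 * java.util.List, java.util.Map.Entry and
 * org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.
 */
private static int exampleBucketLookup() throws HiveException {
  List<Entry<ObjectInspector, Object>> bindings = new ArrayList<>();
  // Bind one string column to the value "order-42"; a plain string skips the varchar branch above.
  bindings.add(new AbstractMap.SimpleEntry<ObjectInspector, Object>(
      PrimitiveObjectInspectorFactory.javaStringObjectInspector, "order-42"));
  // Ask which of 32 buckets (an arbitrary example count) the value hashes into.
  return getHiveBucket(bindings, 32);
}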
@Override
public void collect(byte[] key, byte[] value, int hash) throws IOException {
  HiveKey keyWritable = new HiveKey(key, hash);
  BytesWritable valueWritable = new BytesWritable(value);
  collect(keyWritable, valueWritable);
}
private HiveKey readHiveKey(Input input) {
  // Read the fields in the order written by writeHiveKey: length, key bytes, hash code,
  // then the distribution-key length. Java evaluates constructor arguments left to right,
  // so the length-prefixed bytes are consumed before the hash code.
  HiveKey hiveKey = new HiveKey(input.readBytes(input.readInt()), input.readInt());
  hiveKey.setDistKeyLength(input.readInt());
  return hiveKey;
}
int distKeyLength = firstKey.getDistKeyLength();
if (numDistinctExprs > 0) {
  // Populate the distinct columns for the first key, compute its hash, and store it in the heap.
  populateCachedDistinctKeys(row, 0);
  hashCode = computeHashCode(row, bucketNumber);
  firstKey.setHashCode(hashCode);
  reducerHash.storeValue(firstIndex, firstKey.hashCode(), value, false);
  // The remaining distinct keys share the same hash code and are collected directly.
  for (int i = 1; i < numDistinctExprs; i++) {
    populateCachedDistinctKeys(row, i);
    HiveKey hiveKey = toHiveKey(cachedKeys[i], tag, distKeyLength);
    hiveKey.setHashCode(hashCode);
    collect(hiveKey, value);
  }
}
@Override
public void write(Writable r) throws IOException {
  if (keyIsText) {
    Text text = (Text) r;
    keyWritable.set(text.getBytes(), 0, text.getLength());
  } else {
    BytesWritable bw = (BytesWritable) r;
    // Once we drop support for old Hadoop versions, change these
    // to getBytes() and getLength() to fix the deprecation warnings.
    // Not worth a shim.
    keyWritable.set(bw.get(), 0, bw.getSize());
  }
  keyWritable.setHashCode(r.hashCode());
  outStream.append(keyWritable, NULL_WRITABLE);
}
private int findWriterOffset(Object row) throws HiveException {
  if (!multiFileSpray) {
    return 0;
  } else {
    Object[] bucketFieldValues = new Object[partitionEval.length];
    for (int i = 0; i < partitionEval.length; i++) {
      bucketFieldValues[i] = partitionEval[i].evaluate(row);
    }
    int keyHashCode = ObjectInspectorUtils.getBucketHashCode(bucketFieldValues, partitionObjectInspectors);
    key.setHashCode(keyHashCode);
    int bucketNum = prtner.getBucket(key, null, totalFiles);
    return bucketMap.get(bucketNum);
  }
}
/** Helper method which inserts numRecords records, retrieves them from the cache, and verifies them. */
private void testSpillingHelper(HiveKVResultCache cache, int numRecords) {
  for (int i = 0; i < numRecords; i++) {
    String key = "key_" + i;
    String value = "value_" + i;
    cache.add(new HiveKey(key.getBytes(), key.hashCode()), new BytesWritable(value.getBytes()));
  }

  int recordsSeen = 0;
  while (cache.hasNext()) {
    String key = "key_" + recordsSeen;
    String value = "value_" + recordsSeen;
    Tuple2<HiveKey, BytesWritable> row = cache.next();
    assertTrue("Unexpected key at position: " + recordsSeen,
        new String(row._1().getBytes()).equals(key));
    assertTrue("Unexpected value at position: " + recordsSeen,
        new String(row._2().getBytes()).equals(value));
    recordsSeen++;
  }

  assertTrue("Retrieved record count doesn't match inserted record count",
      numRecords == recordsSeen);
  cache.clear();
}
private void writeHiveKey(Output output, HiveKey hiveKey) {
  int size = hiveKey.getLength();
  output.writeInt(size);
  output.writeBytes(hiveKey.getBytes(), 0, size);
  output.writeInt(0); // Since hashCode is not used, just put an arbitrary number
  output.writeInt(hiveKey.getDistKeyLength());
}
@Test
public void testSimple() throws Exception {
  // Create KV result cache object, add one (k,v) pair and retrieve them.
  HiveKVResultCache cache = new HiveKVResultCache();

  HiveKey key = new HiveKey("key".getBytes(), "key".hashCode());
  BytesWritable value = new BytesWritable("value".getBytes());
  cache.add(key, value);

  assertTrue("KV result cache should have at least one element", cache.hasNext());

  Tuple2<HiveKey, BytesWritable> row = cache.next();
  assertTrue("Incorrect key", row._1().equals(key));
  assertTrue("Incorrect value", row._2().equals(value));

  assertTrue("Cache shouldn't have more records", !cache.hasNext());
}
protected final int computeMurmurHash(HiveKey firstKey) {
  // Only the distribution-key prefix of the serialized key feeds the hash, so rows that
  // differ only in the remaining (e.g. distinct) columns hash to the same value.
  return Murmur3.hash32(firstKey.getBytes(), firstKey.getDistKeyLength(), 0);
}
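/*
 * Hedged illustration (not from the original source) of why computeMurmurHash hashes only the
 * first getDistKeyLength() bytes: two serialized keys that agree on the distribution-key prefix
 * but differ afterwards produce the same hash, so they are routed to the same partition/reducer.
 * Assumes the same Murmur3 utility used above and java.nio.charset.StandardCharsets; the example
 * byte strings and prefix length are illustrative.
 */
byte[] a = "userA|2021".getBytes(StandardCharsets.UTF_8);
byte[] b = "userA|2022".getBytes(StandardCharsets.UTF_8);
int distKeyLength = 5; // hypothetical prefix covering only "userA"
assert Murmur3.hash32(a, distKeyLength, 0) == Murmur3.hash32(b, distKeyLength, 0);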
public void tryStoreVectorizedKey(HiveKey key, boolean partColsIsNull, int batchIndex)
    throws HiveException, IOException {
  _tryStoreKey(key, partColsIsNull, batchIndex);
  if (partColsIsNull) {
    indexesWithNullPartKey.add(batchIndex);
  }
  batchIndexToResult[batchIndex] = key.hashCode();
}
/**
 * Try to store the non-vectorized key.
 * @param key Serialized key.
 * @param partColsIsNull Whether the row's partition columns are null.
 * @return TopNHash.FORWARD if the row should be forwarded;
 *         TopNHash.EXCLUDE if the row should be discarded;
 *         any other number if the row is to be stored; that index should be passed to storeValue.
 */
public int tryStoreKey(HiveKey key, boolean partColsIsNull) throws HiveException, IOException {
  if (!isEnabled) {
    return FORWARD; // short-circuit quickly - forward all rows
  }
  if (topN == 0) {
    return EXCLUDE; // short-circuit quickly - eat all rows
  }
  int index = insertKeyIntoHeap(key);
  if (index >= 0) {
    usage += key.getLength();
    return index;
  }
  // IndexStore is trying to tell us something.
  switch (index) {
    case FORWARD:
      return FORWARD;
    case EXCLUDE:
      return EXCLUDE; // skip the row.
    default: {
      assert false;
      throw new HiveException("Invalid result trying to store the key: " + index);
    }
  }
}
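/*
 * A hedged caller-side sketch of the contract documented above (variable names are illustrative;
 * the storeValue call mirrors the ReduceSinkOperator fragment earlier in this section): forward
 * on FORWARD, drop on EXCLUDE, otherwise store the value under the returned heap index.
 */
int index = reducerHash.tryStoreKey(firstKey, false);
if (index == TopNHash.FORWARD) {
  collect(firstKey, value); // not constrained by the top-N heap, emit immediately
} else if (index != TopNHash.EXCLUDE) {
  reducerHash.storeValue(index, firstKey.hashCode(), value, false); // kept in the heap for later
}
// index == TopNHash.EXCLUDE: the row falls outside the top N and is dropped.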
public static Optional<HiveBucket> getHiveBucket(List<Entry<ObjectInspector, Object>> columnBindings, int bucketCount) {
  try {
    @SuppressWarnings("resource")
    GenericUDFHash udf = new GenericUDFHash();
    ObjectInspector[] objectInspectors = new ObjectInspector[columnBindings.size()];
    DeferredObject[] deferredObjects = new DeferredObject[columnBindings.size()];

    int i = 0;
    for (Entry<ObjectInspector, Object> entry : columnBindings) {
      objectInspectors[i] = getJavaObjectInspector(entry.getKey());
      deferredObjects[i] = getJavaDeferredObject(entry.getValue(), entry.getKey());
      i++;
    }

    ObjectInspector udfInspector = udf.initialize(objectInspectors);
    IntObjectInspector inspector = checkType(udfInspector, IntObjectInspector.class, "udfInspector");
    Object result = udf.evaluate(deferredObjects);

    HiveKey hiveKey = new HiveKey();
    hiveKey.setHashCode(inspector.get(result));
    int bucketNumber = new DefaultHivePartitioner<>().getBucket(hiveKey, null, bucketCount);

    return Optional.of(new HiveBucket(bucketNumber, bucketCount));
  }
  catch (HiveException e) {
    log.debug(e, "Error evaluating bucket number");
    return Optional.empty();
  }
}