com.uber.hoodie.common.model.HoodieRecord.getRecordKey java code examples

/**
 * Extracts the row-key portion of a cell record's key.
 *
 * <p>The row key is the part of the record key that precedes the first occurrence of
 * {@code StringTypes.HASHTAG}.
 *
 * @param cellRecord record whose key is split; must not be null
 * @return the record key up to, but excluding, the first {@code StringTypes.HASHTAG}
 * @throws StringIndexOutOfBoundsException if the record key does not contain
 *         {@code StringTypes.HASHTAG}, since {@code indexOf} then returns -1
 */
public static String getRowKeyFromCellRecord(@NonNull final HoodieRecord cellRecord) {
  final String fullKey = cellRecord.getRecordKey();
  final int delimiterAt = fullKey.indexOf(StringTypes.HASHTAG);
  return fullKey.substring(0, delimiterAt);
}

/**
 * Buffers a single record.
 *
 * <p>The record is first stamped with its new location (current commit time and file id).
 * If {@code getIndexedRecord} yields a value it is appended to {@code recordList};
 * otherwise the record key is added to {@code keysToDelete}. The running record counter
 * is incremented in both cases.
 *
 * @param record the record to buffer
 */
private void writeToBuffer(HoodieRecord<T> record) {
 // Remember where this record is going so it can be located later.
 record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
 Optional<IndexedRecord> maybeAvro = getIndexedRecord(record);
 if (!maybeAvro.isPresent()) {
  keysToDelete.add(record.getRecordKey());
 } else {
  recordList.add(maybeAvro.get());
 }
 numberOfRecords++;
}

 /**
  * Builds an exception whose message names the record key, the partition path and the
  * current location of the record that could not be found.
  *
  * @param record the record that was expected in the partition
  */
 public HoodieRecordMissingException(HoodieRecord record) {
  super(new StringBuilder("Record ")
    .append(record.getRecordKey())
    .append(" with partition path ")
    .append(record.getPartitionPath())
    .append(" in current location ")
    .append(record.getCurrentLocation())
    .append(" is not found in the partition")
    .toString());
 }
}

/**
 * Collects the distinct record keys of the given records.
 *
 * @param hoodieRecords records to read keys from
 * @return a new mutable {@link HashSet} holding each record key once
 */
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
 Set<String> distinctKeys = new HashSet<>();
 hoodieRecords.forEach(rec -> distinctKeys.add(rec.getRecordKey()));
 return distinctKeys;
}

/**
 * Writes one Avro record after stamping it with the Hoodie key fields and commit metadata.
 *
 * <p>The record key is also registered with {@code writeSupport} once the write has been
 * issued.
 *
 * @param avroRecord the Avro record to write; it is cast to {@code GenericRecord}
 * @param record the Hoodie record supplying the record key and partition path
 * @throws IOException declared by this method; see {@code super.write}
 */
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
 // The sequence id combines the commit time, the task partition id and a per-writer counter.
 final String sequenceId = HoodieRecord.generateSequenceId(
   commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
 final GenericRecord generic = (GenericRecord) avroRecord;
 final String recordKey = record.getRecordKey();
 HoodieAvroUtils.addHoodieKeyToRecord(generic, recordKey, record.getPartitionPath(), file.getName());
 HoodieAvroUtils.addCommitMetadataToRecord(generic, commitTime, sequenceId);
 super.write(avroRecord);
 writeSupport.add(recordKey);
}

/**
 * Writes one Avro record after stamping it with the Hoodie key fields and commit metadata.
 *
 * <p>The record key is also registered with {@code writeSupport} once the write has been
 * issued.
 *
 * @param avroRecord the Avro record to write; it is cast to {@code GenericRecord}
 * @param record the Hoodie record supplying the record key and partition path
 * @throws IOException declared by this method; see {@code super.write}
 */
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
 // The sequence id combines the commit time, the task partition id and a per-writer counter.
 final String sequenceId = HoodieRecord.generateSequenceId(
   commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
 final GenericRecord generic = (GenericRecord) avroRecord;
 final String recordKey = record.getRecordKey();
 HoodieAvroUtils.addHoodieKeyToRecord(generic, recordKey, record.getPartitionPath(), file.getName());
 HoodieAvroUtils.addCommitMetadataToRecord(generic, commitTime, sequenceId);
 super.write(avroRecord);
 writeSupport.add(recordKey);
}

/**
 * Tags every incoming record with a location computed from its record key.
 *
 * <p>The file id of the location is the value returned by {@code getBucket} for the
 * record key; the commit time is the fixed placeholder {@code "000"}.
 *
 * @param recordRDD records to tag
 * @param jsc Spark context (not used by this implementation)
 * @param hoodieTable table being indexed (not used by this implementation)
 * @return the same records with their current location set
 * @throws HoodieIndexException declared by the index contract
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
  HoodieTable<T> hoodieTable)
  throws HoodieIndexException {
 return recordRDD.map(incoming -> {
  // HACK(vc): "000" is a commit that does not exist; it only serves as a placeholder.
  HoodieRecordLocation bucketLocation =
    new HoodieRecordLocation("000", getBucket(incoming.getRecordKey()));
  incoming.setCurrentLocation(bucketLocation);
  return incoming;
 });
}

/**
 * Tags every incoming record with a location computed from its record key.
 *
 * <p>The file id of the location is the value returned by {@code getBucket} for the
 * record key; the commit time is the fixed placeholder {@code "000"}.
 *
 * @param recordRDD records to tag
 * @param jsc Spark context (not used by this implementation)
 * @param hoodieTable table being indexed (not used by this implementation)
 * @return the same records with their current location set
 * @throws HoodieIndexException declared by the index contract
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
  HoodieTable<T> hoodieTable)
  throws HoodieIndexException {
 return recordRDD.map(incoming -> {
  // HACK(vc): "000" is a commit that does not exist; it only serves as a placeholder.
  HoodieRecordLocation bucketLocation =
    new HoodieRecordLocation("000", getBucket(incoming.getRecordKey()));
  incoming.setCurrentLocation(bucketLocation);
  return incoming;
 });
}

/**
 * Verifies that no record key appears more than once within the same partition path.
 *
 * <p>Fails through {@code assertTrue} on the first repeated key, naming the key and its
 * partition in the failure message.
 *
 * @param records List of Hoodie records
 */
void assertNodupesWithinPartition(List<HoodieRecord> records) {
 Map<String, Set<String>> seenKeysByPartition = new HashMap<>();
 for (HoodieRecord current : records) {
  String recordKey = current.getRecordKey();
  String partition = current.getPartitionPath();
  Set<String> seenKeys = seenKeysByPartition.computeIfAbsent(partition, p -> new HashSet<>());
  // Set.add returns false exactly when the key was already recorded for this partition.
  assertTrue("key " + recordKey + " is duplicate within partition " + partition,
    seenKeys.add(recordKey));
 }
}

/**
 * Arranges the de-duplicated records for bulk insert, writes them, and hands the resulting
 * write statuses to {@code updateIndexAndCommitIfNeeded}.
 *
 * <p>When a user-defined partitioner is supplied it decides how records are repartitioned;
 * otherwise records are sorted by {@code "partitionPath+recordKey"}. Both paths use the
 * configured bulk-insert shuffle parallelism.
 *
 * @param dedupedRecords records to insert
 * @param commitTime commit time the write is performed under
 * @param table target table
 * @param bulkInsertPartitioner optional custom partitioner
 * @return the value returned by {@code updateIndexAndCommitIfNeeded} for the write statuses
 */
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords,
  String commitTime, HoodieTable<T> table,
  Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
 final JavaRDD<HoodieRecord<T>> arrangedRecords;
 if (!bulkInsertPartitioner.isDefined()) {
  // No custom partitioner: sort on "partitionPath + key". Per the original author's note,
  // Spark then splits records evenly across RDD partitions, so small partitions fit into
  // one RDD partition while big ones spread evenly across several.
  arrangedRecords = dedupedRecords.sortBy(
    rec -> String.format("%s+%s", rec.getPartitionPath(), rec.getRecordKey()),
    true, config.getBulkInsertShuffleParallelism());
 } else {
  arrangedRecords = bulkInsertPartitioner.get()
    .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
 }
 JavaRDD<WriteStatus> statuses = arrangedRecords
   .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
   .flatMap(batch -> batch.iterator());
 return updateIndexAndCommitIfNeeded(statuses, table, commitTime);
}

/**
 * Arranges the de-duplicated records for bulk insert, writes them, and hands the resulting
 * write statuses to {@code updateIndexAndCommitIfNeeded}.
 *
 * <p>When a user-defined partitioner is supplied it decides how records are repartitioned;
 * otherwise records are sorted by {@code "partitionPath+recordKey"}. Both paths use the
 * configured bulk-insert shuffle parallelism.
 *
 * @param dedupedRecords records to insert
 * @param commitTime commit time the write is performed under
 * @param table target table
 * @param bulkInsertPartitioner optional custom partitioner
 * @return the value returned by {@code updateIndexAndCommitIfNeeded} for the write statuses
 */
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords,
  String commitTime, HoodieTable<T> table,
  Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
 final JavaRDD<HoodieRecord<T>> arrangedRecords;
 if (!bulkInsertPartitioner.isDefined()) {
  // No custom partitioner: sort on "partitionPath + key". Per the original author's note,
  // Spark then splits records evenly across RDD partitions, so small partitions fit into
  // one RDD partition while big ones spread evenly across several.
  arrangedRecords = dedupedRecords.sortBy(
    rec -> String.format("%s+%s", rec.getPartitionPath(), rec.getRecordKey()),
    true, config.getBulkInsertShuffleParallelism());
 } else {
  arrangedRecords = bulkInsertPartitioner.get()
    .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
 }
 JavaRDD<WriteStatus> statuses = arrangedRecords
   .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
   .flatMap(batch -> batch.iterator());
 return updateIndexAndCommitIfNeeded(statuses, table, commitTime);
}

/**
 * Loads the new incoming records into a spillable map keyed by record key and returns
 * their partition path.
 *
 * <p>Each record is stamped with its new location (current commit time and the given
 * file id) before it is stored in the map.
 *
 * @param fileId file id recorded as the new location of every incoming record
 * @param newRecordsItr iterator over the incoming records
 * @return the partition path of the last record read, or {@code null} if the iterator
 *         was empty
 * @throws HoodieIOException if the spillable map cannot be created
 */
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
 try {
  // Load the new records in a map
  logger.info("MaxMemoryPerPartitionMerge => " + config.getMaxMemoryPerPartitionMerge());
  this.keyToNewRecords = new ExternalSpillableMap<>(config.getMaxMemoryPerPartitionMerge(),
    config.getSpillableMapBasePath(), new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema));
 } catch (IOException io) {
  throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
 }
 String partitionPath = null;
 while (newRecordsItr.hasNext()) {
  HoodieRecord<T> record = newRecordsItr.next();
  partitionPath = record.getPartitionPath();
  // Update the new location of the record BEFORE storing it, so we know where to find it
  // next. NOTE(review): the map can spill entries to disk; presumably a spilled entry is a
  // serialized copy, so a mutation made after put() would not be reflected in it.
  record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  keyToNewRecords.put(record.getRecordKey(), record);
 }
 ExternalSpillableMap spillableMap = (ExternalSpillableMap) keyToNewRecords;
 // Separate the individual metrics so the log line stays readable.
 logger.info("Number of entries in MemoryBasedMap => "
   + spillableMap.getInMemoryMapNumEntries()
   + ", Total size in bytes of MemoryBasedMap => "
   + spillableMap.getCurrentInMemoryMapSize()
   + ", Number of entries in DiskBasedMap => "
   + spillableMap.getDiskBasedMapNumEntries()
   + ", Size of file spilled to disk => "
   + spillableMap.getSizeOfFileOnDiskInBytes());
 return partitionPath;
}

/**
 * Upserts 100 generated records into a spillable map with a 16-byte in-memory budget and
 * checks that every record read back through the iterator carries one of the upserted keys.
 *
 * <p>The checks throw {@link AssertionError} explicitly instead of using the {@code assert}
 * keyword, so they are enforced even when the JVM runs with assertions disabled.
 */
@Test
public void simpleInsertTest() throws IOException, URISyntaxException {
 Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
 String payloadClazz = HoodieAvroPayload.class.getName();
 ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
   new ExternalSpillableMap<>(16L, BASE_OUTPUT_PATH,
     new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B
 List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
 List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
 if (recordKeys.size() != 100) {
  throw new AssertionError("Expected 100 upserted record keys but got " + recordKeys.size());
 }
 Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
 while (itr.hasNext()) {
  HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
  if (!recordKeys.contains(rec.getRecordKey())) {
   throw new AssertionError("Unexpected record key read back from the map: " + rec.getRecordKey());
  }
 }
}

/**
 * Tags incoming records with the location found for them by the index lookup.
 *
 * <p>The flow is: optionally persist the input, project it to (partitionPath, recordKey)
 * pairs, run {@code lookupIndex} on those pairs, and join the lookup result back onto the
 * input records via {@code tagLocationBacktoRecords}. When caching is enabled in the
 * config, both the input and the lookup result are persisted for the duration of the call
 * and unpersisted before returning.
 *
 * @param recordRDD records to tag
 * @param jsc Spark context passed through to the lookup
 * @param hoodieTable table passed through to the lookup
 * @return the RDD produced by {@code tagLocationBacktoRecords}
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
  HoodieTable<T> hoodieTable) {
 // The input is read twice (for the lookup and for the final join), so cache it if asked to.
 if (config.getBloomIndexUseCaching()) {
  recordRDD.persist(config.getBloomIndexInputStorageLevel());
 }
 // Project every record down to a thin (partitionPath, recordKey) pair.
 JavaPairRDD<String, String> partitionAndKeyRDD = recordRDD.mapToPair(
   rec -> new Tuple2<>(rec.getPartitionPath(), rec.getRecordKey()));
 // Look up the index for all partition/recordKey pairs.
 JavaPairRDD<String, String> keyToFilenameRDD = lookupIndex(partitionAndKeyRDD, jsc, hoodieTable);
 // Keep the lookup result around for the stages that follow.
 if (config.getBloomIndexUseCaching()) {
  keyToFilenameRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
 }
 if (logger.isDebugEnabled()) {
  // count() runs a Spark action, so it is only evaluated when debug logging is on.
  logger.debug("Number of update records (ones tagged with a fileID): " + keyToFilenameRDD.count());
 }
 // Join the lookup result with the incoming records to tag them.
 JavaRDD<HoodieRecord<T>> tagged = tagLocationBacktoRecords(keyToFilenameRDD, recordRDD);
 if (config.getBloomIndexUseCaching()) {
  recordRDD.unpersist(); // release the cached input
  keyToFilenameRDD.unpersist();
 }
 return tagged;
}

/**
 * Test helper that appends the given records to log files, one Avro data block per
 * current record location.
 *
 * <p>Records are grouped by {@code getCurrentLocation()}. For each group a log writer is
 * built under {@code basePath/partitionPath} (partition path taken from the first record of
 * the group) for the location's file id and commit time, a single data block with all
 * records of the group is appended, and the writer is closed. Any exception, including a
 * failure to obtain a record's insert value, fails the test through {@code fail}.
 *
 * @param fs file system the log files are written to
 * @param basePath base path of the dataset
 * @param schema Avro schema used to materialize the records and stored in the block header
 * @param updatedRecords records to write; each is expected to have a current location
 */
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
  List<HoodieRecord> updatedRecords) {
 Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream().collect(
   Collectors.groupingBy(HoodieRecord::getCurrentLocation));
 groupedUpdated.entrySet().forEach(s -> {
  HoodieRecordLocation location = s.getKey();
  String partitionPath = s.getValue().get(0).getPartitionPath();
  Writer logWriter;
  try {
   logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
     .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
     .overBaseCommit(location.getCommitTime()).withFs(fs).build();
   Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
   header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
   header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
   logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
    try {
     GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
     HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
     return (IndexedRecord) val;
    } catch (IOException e) {
     // Do not hide the failure behind a null list element; surface it to the outer
     // catch so the test fails with the real cause.
     throw new java.io.UncheckedIOException(e);
    }
   }).collect(Collectors.toList()), header));
   logWriter.close();
  } catch (Exception e) {
   fail(e.toString());
  }
 });
}

/**
 * Tags incoming records with the location found for them by the index lookup.
 *
 * <p>The flow is: optionally persist the input, project it to (partitionPath, recordKey)
 * pairs, run {@code lookupIndex} on those pairs, and join the lookup result back onto the
 * input records via {@code tagLocationBacktoRecords}. When caching is enabled in the
 * config, both the input and the lookup result are persisted for the duration of the call
 * and unpersisted before returning.
 *
 * @param recordRDD records to tag
 * @param jsc Spark context passed through to the lookup
 * @param hoodieTable table passed through to the lookup
 * @return the RDD produced by {@code tagLocationBacktoRecords}
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
  HoodieTable<T> hoodieTable) {
 // The input is read twice (for the lookup and for the final join), so cache it if asked to.
 if (config.getBloomIndexUseCaching()) {
  recordRDD.persist(config.getBloomIndexInputStorageLevel());
 }
 // Project every record down to a thin (partitionPath, recordKey) pair.
 JavaPairRDD<String, String> partitionAndKeyRDD = recordRDD.mapToPair(
   rec -> new Tuple2<>(rec.getPartitionPath(), rec.getRecordKey()));
 // Look up the index for all partition/recordKey pairs.
 JavaPairRDD<String, String> keyToFilenameRDD = lookupIndex(partitionAndKeyRDD, jsc, hoodieTable);
 // Keep the lookup result around for the stages that follow.
 if (config.getBloomIndexUseCaching()) {
  keyToFilenameRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
 }
 if (logger.isDebugEnabled()) {
  // count() runs a Spark action, so it is only evaluated when debug logging is on.
  logger.debug("Number of update records (ones tagged with a fileID): " + keyToFilenameRDD.count());
 }
 // Join the lookup result with the incoming records to tag them.
 JavaRDD<HoodieRecord<T>> tagged = tagLocationBacktoRecords(keyToFilenameRDD, recordRDD);
 if (config.getBloomIndexUseCaching()) {
  recordRDD.unpersist(); // release the cached input
  keyToFilenameRDD.unpersist();
 }
 return tagged;
}

/**
 * Folds one record into the {@code records} map, keyed by record key.
 *
 * <p>A key seen for the first time is stored as is. For a key that is already present, the
 * stored payload is combined with the incoming payload through {@code preCombine} and the
 * result replaces the stored record.
 *
 * @param hoodieRecord the next record to process
 * @throws IOException declared by the overridden method
 */
@Override
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException {
 final String recordKey = hoodieRecord.getRecordKey();
 if (!records.containsKey(recordKey)) {
  // First time this key is seen: keep the record untouched.
  records.put(recordKey, hoodieRecord);
  return;
 }
 // Merge with what is already stored. The HoodieRecordPayload implementation is free to
 // decide what happens when a delete (empty payload) arrives before or after an
 // insert/update.
 HoodieRecordPayload merged = records.get(recordKey).getData().preCombine(hoodieRecord.getData());
 HoodieKey mergedKey = new HoodieKey(recordKey, hoodieRecord.getPartitionPath());
 records.put(recordKey, new HoodieRecord<>(mergedKey, merged));
}

/**
 * Joins the {@code (rowKey, filename)} lookup result back onto the original HoodieRecord RDD.
 *
 * <p>Records with a non-empty matching filename get their current location set from the
 * commit time and file id parsed out of that filename; all other records are returned
 * unchanged.
 *
 * @param rowKeyFilenamePairRDD lookup result mapping record key to filename
 * @param recordRDD the original incoming records
 * @return the incoming records, tagged where a filename was found
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
  JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
 JavaPairRDD<String, HoodieRecord<T>> keyedRecords = recordRDD
   .mapToPair(rec -> new Tuple2<>(rec.getRecordKey(), rec));
 // recordRDD may hold keys that are absent from the lookup result (no file id known for
 // them), hence a left outer join.
 return keyedRecords.leftOuterJoin(rowKeyFilenamePairRDD).values().map(joined -> {
  HoodieRecord<T> tagged = joined._1();
  if (!joined._2().isPresent()) {
   return tagged;
  }
  String filename = joined._2().get();
  if (filename == null || filename.isEmpty()) {
   return tagged;
  }
  // A record found in several files of the same partition shows up once per filename, each
  // time backed by the very same in-memory HoodieRecord. Setting currentLocation a second
  // time on that instance would fail, so work on a fresh copy when one is already set.
  if (tagged.getCurrentLocation() != null) {
   tagged = new HoodieRecord<T>(tagged.getKey(), tagged.getData());
  }
  tagged.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
    FSUtils.getFileId(filename)));
  return tagged;
 });
}

/**
 * Joins the {@code (rowKey, filename)} lookup result back onto the original HoodieRecord RDD.
 *
 * <p>Records with a non-empty matching filename get their current location set from the
 * commit time and file id parsed out of that filename; all other records are returned
 * unchanged.
 *
 * @param rowKeyFilenamePairRDD lookup result mapping record key to filename
 * @param recordRDD the original incoming records
 * @return the incoming records, tagged where a filename was found
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
  JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
 JavaPairRDD<String, HoodieRecord<T>> keyedRecords = recordRDD
   .mapToPair(rec -> new Tuple2<>(rec.getRecordKey(), rec));
 // recordRDD may hold keys that are absent from the lookup result (no file id known for
 // them), hence a left outer join.
 return keyedRecords.leftOuterJoin(rowKeyFilenamePairRDD).values().map(joined -> {
  HoodieRecord<T> tagged = joined._1();
  if (!joined._2().isPresent()) {
   return tagged;
  }
  String filename = joined._2().get();
  if (filename == null || filename.isEmpty()) {
   return tagged;
  }
  // A record found in several files of the same partition shows up once per filename, each
  // time backed by the very same in-memory HoodieRecord. Setting currentLocation a second
  // time on that instance would fail, so work on a fresh copy when one is already set.
  if (tagged.getCurrentLocation() != null) {
   tagged = new HoodieRecord<T>(tagged.getKey(), tagged.getData());
  }
  tagged.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
    FSUtils.getFileId(filename)));
  return tagged;
 });
}

while (itr.hasNext()) {
 HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
 assert recordKeys.contains(rec.getRecordKey());

Popular methods of HoodieRecord

setNewLocation

Popular in Java

Updating database using SQL prepared statement
getSystemService (Context)
putExtra (Intent)
runOnUiThread (Activity)
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
DateFormat (java.text)
Formats or parses dates and times. This class provides factories for obtaining instances configured f
CountDownLatch (java.util.concurrent)
A synchronization aid that allows one or more threads to wait until a set of operations being perfor
Executors (java.util.concurrent)
Factory and utility methods for Executor, ExecutorService, ScheduledExecutorService, ThreadFactory,
Rectangle (java.awt)
A Rectangle specifies an area in a coordinate space that is enclosed by the Rectangle object's top-
ImageIO (javax.imageio)
Best IntelliJ plugins

How to use the getRecordKey method in com.uber.hoodie.common.model.HoodieRecord

Best Java code snippets using com.uber.hoodie.common.model.HoodieRecord.getRecordKey (Showing top 20 results out of 315)

How to use
getRecordKey
method
in
com.uber.hoodie.common.model.HoodieRecord