/**
 * Extracts the row-key portion of a cell record's key.
 *
 * <p>The row key is the part of the record key that precedes the first occurrence of
 * {@code StringTypes.HASHTAG}.
 *
 * @param cellRecord record whose key is split; must not be null
 * @return the prefix of the record key before the first {@code StringTypes.HASHTAG}
 * @throws StringIndexOutOfBoundsException if the record key does not contain
 *     {@code StringTypes.HASHTAG} ({@code indexOf} yields -1, which is an invalid end index)
 */
public static String getRowKeyFromCellRecord(@NonNull final HoodieRecord cellRecord) {
  final String fullKey = cellRecord.getRecordKey();
  final int delimiterPos = fullKey.indexOf(StringTypes.HASHTAG);
  return fullKey.substring(0, delimiterPos);
}
private void writeToBuffer(HoodieRecord<T> record) { // update the new location of the record, so we know where to find it next record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); Optional<IndexedRecord> indexedRecord = getIndexedRecord(record); if (indexedRecord.isPresent()) { recordList.add(indexedRecord.get()); } else { keysToDelete.add(record.getRecordKey()); } numberOfRecords++; }
/**
 * Creates an exception describing a record that could not be found in its partition.
 *
 * <p>The message includes the record key, the partition path and the record's current location.
 *
 * @param record the record that was expected but not found
 */
public HoodieRecordMissingException(HoodieRecord record) {
  super(new StringBuilder("Record ")
      .append(record.getRecordKey())
      .append(" with partition path ")
      .append(record.getPartitionPath())
      .append(" in current location ")
      .append(record.getCurrentLocation())
      .append(" is not found in the partition")
      .toString());
}
}
/**
 * Collects the distinct record keys of the given records.
 *
 * @param hoodieRecords records to read keys from
 * @return a new mutable set holding the record key of every element
 */
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
  final Set<String> distinctKeys = new HashSet<>();
  hoodieRecords.forEach(hoodieRecord -> distinctKeys.add(hoodieRecord.getRecordKey()));
  return distinctKeys;
}
/**
 * Writes an Avro record after stamping it with hoodie key and commit metadata.
 *
 * <p>A sequence id is generated from the commit time, the task partition id and a running record
 * index. The record key is also registered with {@code writeSupport} once the write has been
 * issued.
 *
 * @param avroRecord the Avro record to write; cast to {@code GenericRecord} for the metadata calls
 * @param record the hoodie record supplying the record key and partition path
 * @throws IOException declared by this method's signature
 */
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
  final String sequenceId = HoodieRecord.generateSequenceId(
      commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());

  final GenericRecord genericRecord = (GenericRecord) avroRecord;
  final String recordKey = record.getRecordKey();

  HoodieAvroUtils.addHoodieKeyToRecord(genericRecord, recordKey, record.getPartitionPath(),
      file.getName());
  HoodieAvroUtils.addCommitMetadataToRecord(genericRecord, commitTime, sequenceId);
  super.write(avroRecord);
  writeSupport.add(recordKey);
}
/**
 * Writes an Avro record after stamping it with hoodie key and commit metadata.
 *
 * <p>A sequence id is generated from the commit time, the task partition id and a running record
 * index. The record key is also registered with {@code writeSupport} once the write has been
 * issued.
 *
 * @param avroRecord the Avro record to write; cast to {@code GenericRecord} for the metadata calls
 * @param record the hoodie record supplying the record key and partition path
 * @throws IOException declared by this method's signature
 */
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
  final String sequenceId = HoodieRecord.generateSequenceId(
      commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());

  final GenericRecord genericRecord = (GenericRecord) avroRecord;
  final String recordKey = record.getRecordKey();

  HoodieAvroUtils.addHoodieKeyToRecord(genericRecord, recordKey, record.getPartitionPath(),
      file.getName());
  HoodieAvroUtils.addCommitMetadataToRecord(genericRecord, commitTime, sequenceId);
  super.write(avroRecord);
  writeSupport.add(recordKey);
}
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) throws HoodieIndexException { return recordRDD.map(record -> { String bucket = getBucket(record.getRecordKey()); //HACK(vc) a non-existent commit is provided here. record.setCurrentLocation(new HoodieRecordLocation("000", bucket)); return record; }); }
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) throws HoodieIndexException { return recordRDD.map(record -> { String bucket = getBucket(record.getRecordKey()); //HACK(vc) a non-existent commit is provided here. record.setCurrentLocation(new HoodieRecordLocation("000", bucket)); return record; }); }
/**
 * Asserts that no record key occurs more than once within the same partition path.
 *
 * @param records list of Hoodie records to check
 */
void assertNodupesWithinPartition(List<HoodieRecord> records) {
  Map<String, Set<String>> keysSeenByPartition = new HashMap<>();
  for (HoodieRecord hoodieRecord : records) {
    String recordKey = hoodieRecord.getRecordKey();
    String partition = hoodieRecord.getPartitionPath();
    Set<String> keysSeen = keysSeenByPartition.computeIfAbsent(partition, p -> new HashSet<>());
    // Set.add returns false when the key was already present, i.e. for a duplicate.
    assertTrue("key " + recordKey + " is duplicate within partition " + partition,
        keysSeen.add(recordKey));
  }
}
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime, HoodieTable<T> table, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) { final JavaRDD<HoodieRecord<T>> repartitionedRecords; if (bulkInsertPartitioner.isDefined()) { repartitionedRecords = bulkInsertPartitioner.get() .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism()); } else { // Now, sort the records and line them up nicely for loading. repartitionedRecords = dedupedRecords.sortBy(record -> { // Let's use "partitionPath + key" as the sort key. Spark, will ensure // the records split evenly across RDD partitions, such that small partitions fit // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey()); }, true, config.getBulkInsertShuffleParallelism()); } JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true) .flatMap(writeStatuses -> writeStatuses.iterator()); return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); }
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime, HoodieTable<T> table, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) { final JavaRDD<HoodieRecord<T>> repartitionedRecords; if (bulkInsertPartitioner.isDefined()) { repartitionedRecords = bulkInsertPartitioner.get() .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism()); } else { // Now, sort the records and line them up nicely for loading. repartitionedRecords = dedupedRecords.sortBy(record -> { // Let's use "partitionPath + key" as the sort key. Spark, will ensure // the records split evenly across RDD partitions, such that small partitions fit // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey()); }, true, config.getBulkInsertShuffleParallelism()); } JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true) .flatMap(writeStatuses -> writeStatuses.iterator()); return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); }
/** * Load the new incoming records in a map and return partitionPath */ private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) { try { // Load the new records in a map logger.info("MaxMemoryPerPartitionMerge => " + config.getMaxMemoryPerPartitionMerge()); this.keyToNewRecords = new ExternalSpillableMap<>(config.getMaxMemoryPerPartitionMerge(), config.getSpillableMapBasePath(), new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); } catch (IOException io) { throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io); } String partitionPath = null; while (newRecordsItr.hasNext()) { HoodieRecord<T> record = newRecordsItr.next(); partitionPath = record.getPartitionPath(); keyToNewRecords.put(record.getRecordKey(), record); // update the new location of the record, so we know where to find it next record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); } logger.info("Number of entries in MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries() + "Total size in bytes of MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); return partitionPath; }
@Test public void simpleInsertTest() throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, BASE_OUTPUT_PATH, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); assert (recordKeys.size() == 100); Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator(); List<HoodieRecord> oRecords = new ArrayList<>(); while (itr.hasNext()) { HoodieRecord<? extends HoodieRecordPayload> rec = itr.next(); oRecords.add(rec); assert recordKeys.contains(rec.getRecordKey()); } }
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) { // Step 0: cache the input record RDD if (config.getBloomIndexUseCaching()) { recordRDD.persist(config.getBloomIndexInputStorageLevel()); } // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); // Lookup indexes for all the partition/recordkey pair JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable); // Cache the result, for subsequent stages. if (config.getBloomIndexUseCaching()) { rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } if (logger.isDebugEnabled()) { long totalTaggedRecords = rowKeyFilenamePairRDD.count(); logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); } // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys // Cost: 4 sec. JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); if (config.getBloomIndexUseCaching()) { recordRDD.unpersist(); // unpersist the input Record RDD rowKeyFilenamePairRDD.unpersist(); } return taggedRecordRDD; }
/**
 * Writes the given updated records to delta log files, one Avro data block per record location.
 *
 * <p>Records are grouped by their current location. For each group a log writer is opened under
 * {@code basePath/partitionPath} (the partition path is taken from the first record of the
 * group), a single data block holding the group's insert values is appended, and the writer is
 * closed. Any failure is reported through {@code fail(...)}.
 *
 * @param fs file system the log files are written to
 * @param basePath base path under which the partition directories live
 * @param schema Avro schema used to obtain insert values and written into the block header
 * @param updatedRecords records to write; each is grouped by its current location
 */
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
    List<HoodieRecord> updatedRecords) {
  Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream().collect(
      Collectors.groupingBy(HoodieRecord::getCurrentLocation));

  groupedUpdated.forEach((location, locationRecords) -> {
    String partitionPath = locationRecords.get(0).getPartitionPath();
    Writer logWriter = null;
    try {
      logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
          .overBaseCommit(location.getCommitTime()).withFs(fs).build();

      Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());

      List<IndexedRecord> avroRecords = locationRecords.stream().map(r -> {
        try {
          GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
          HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
          return (IndexedRecord) val;
        } catch (IOException e) {
          // Surface the failure instead of silently putting a null record into the block;
          // the enclosing catch reports it through fail(...).
          throw new RuntimeException("Failed to get insert value for record " + r.getRecordKey(), e);
        }
      }).collect(Collectors.toList());

      logWriter.appendBlock(new HoodieAvroDataBlock(avroRecords, header));
    } catch (Exception e) {
      fail(e.toString());
    } finally {
      // Close the writer on the failure path too, so it is not leaked when appendBlock throws.
      if (logWriter != null) {
        try {
          logWriter.close();
        } catch (Exception e) {
          fail(e.toString());
        }
      }
    }
  });
}
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) { // Step 0: cache the input record RDD if (config.getBloomIndexUseCaching()) { recordRDD.persist(config.getBloomIndexInputStorageLevel()); } // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); // Lookup indexes for all the partition/recordkey pair JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable); // Cache the result, for subsequent stages. if (config.getBloomIndexUseCaching()) { rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } if (logger.isDebugEnabled()) { long totalTaggedRecords = rowKeyFilenamePairRDD.count(); logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); } // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys // Cost: 4 sec. JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); if (config.getBloomIndexUseCaching()) { recordRDD.unpersist(); // unpersist the input Record RDD rowKeyFilenamePairRDD.unpersist(); } return taggedRecordRDD; }
@Override protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException { String key = hoodieRecord.getRecordKey(); if (records.containsKey(key)) { // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be // done when a delete (empty payload) is encountered before or after an insert/update. HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData()); records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue)); } else { // Put the record as is records.put(key, hoodieRecord); } }
/** * Tag the <rowKey, filename> back to the original HoodieRecord RDD. */ private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) { JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> { HoodieRecord<T> record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); if (filename != null && !filename.isEmpty()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD will have 2 // entries with the same exact in memory copy of the HoodieRecord and the 2 separate filenames that the // record is found in. This will result in setting currentLocation 2 times and it will fail the second time. // This check will create a new in memory copy of the hoodie record. if (record.getCurrentLocation() != null) { record = new HoodieRecord<T>(record.getKey(), record.getData()); } record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename))); } } return record; }); }
/** * Tag the <rowKey, filename> back to the original HoodieRecord RDD. */ private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) { JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> { HoodieRecord<T> record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); if (filename != null && !filename.isEmpty()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD will have 2 // entries with the same exact in memory copy of the HoodieRecord and the 2 separate filenames that the // record is found in. This will result in setting currentLocation 2 times and it will fail the second time. // This check will create a new in memory copy of the hoodie record. if (record.getCurrentLocation() != null) { record = new HoodieRecord<T>(record.getKey(), record.getData()); } record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename))); } } return record; }); }
while (itr.hasNext()) { HoodieRecord<? extends HoodieRecordPayload> rec = itr.next(); assert recordKeys.contains(rec.getRecordKey());