private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime,
    HoodieTable<T> table, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  final JavaRDD<HoodieRecord<T>> repartitionedRecords;
  if (bulkInsertPartitioner.isDefined()) {
    // A user-supplied partitioner decides how records are laid out across Spark partitions.
    repartitionedRecords = bulkInsertPartitioner.get()
        .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
  } else {
    // Now, sort the records and line them up nicely for loading.
    repartitionedRecords = dedupedRecords.sortBy(record -> {
      // Let's use "partitionPath + key" as the sort key. Spark will ensure
      // the records split evenly across RDD partitions, such that small partitions fit
      // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions.
      return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
    }, true, config.getBulkInsertShuffleParallelism());
  }
  JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
      .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
      .flatMap(writeStatuses -> writeStatuses.iterator());
  return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
}
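// For illustration only: a minimal sketch of a custom UserDefinedBulkInsertPartitioner
// that could be passed into bulkInsertInternal above. The interface name and the
// repartitionRecords(records, outputSparkPartitions) call are taken from the code above;
// the class name, the record-key-only sort policy, and the HoodieRecordPayload type bound
// are assumptions, and the exact package/imports depend on the Hudi version in use.
class RecordKeySortPartitioner<T extends HoodieRecordPayload> implements UserDefinedBulkInsertPartitioner<T> {
  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
    // Globally sort by record key so each output Spark partition holds a contiguous key range
    // (a hypothetical policy; the default path above sorts by partitionPath + key instead).
    return records.sortBy(HoodieRecord::getRecordKey, true, outputSparkPartitions);
  }
}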