@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  if (numPartitions < 0) {
    numPartitions = 1;
  }
  return input.repartitionAndSortWithinPartitions(new HashPartitioner(numPartitions));
}
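For context, a minimal self-contained sketch of the same single-argument overload, runnable against a local master (the class name and data here are illustrative, not taken from the snippet above):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class RepartitionSortExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("repartition-sort").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>(3, "c"), new Tuple2<>(1, "a"),
          new Tuple2<>(4, "d"), new Tuple2<>(2, "b")));

      // Shuffle into 2 hash partitions; keys are sorted within each
      // partition, but there is no ordering guarantee across partitions.
      JavaPairRDD<Integer, String> sorted =
          pairs.repartitionAndSortWithinPartitions(new HashPartitioner(2));

      // glom() exposes each partition as a list, making the
      // per-partition ordering visible.
      List<List<Tuple2<Integer, String>>> partitions = sorted.glom().collect();
      System.out.println(partitions);
    }
  }
}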
JavaPairRDD<Integer, Integer> repartitioned =
    rdd.repartitionAndSortWithinPartitions(partitioner);
assertTrue(repartitioned.partitioner().isPresent());
assertEquals(repartitioned.partitioner().get(), partitioner);
hfilerdd
    .repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
        RowKeyWritable.RowKeyComparator.INSTANCE)
    .mapToPair(new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  JavaPairRDD<HiveKey, BytesWritable> rdd;
  if (totalOrder) {
    if (numPartitions > 0) {
      if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) {
        // sortByKey runs a sampling job to compute range boundaries,
        // so cache the input to avoid recomputing it.
        input.persist(StorageLevel.DISK_ONLY());
        sparkPlan.addCachedRDDId(input.id());
      }
      rdd = input.sortByKey(true, numPartitions);
    } else {
      rdd = input.sortByKey(true);
    }
  } else {
    Partitioner partitioner = new HashPartitioner(numPartitions);
    rdd = input.repartitionAndSortWithinPartitions(partitioner);
  }
  return rdd;
}
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  JavaPairRDD<HiveKey, BytesWritable> rdd;
  if (totalOrder) {
    if (numPartitions > 0) {
      if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) {
        // sortByKey runs a sampling job to compute range boundaries,
        // so cache the input to avoid recomputing it.
        input.persist(StorageLevel.DISK_ONLY());
        sparkPlan.addCachedRDDId(input.id());
      }
      rdd = input.sortByKey(true, numPartitions);
    } else {
      rdd = input.sortByKey(true);
    }
  } else {
    Partitioner partitioner = new HashPartitioner(numPartitions);
    rdd = input.repartitionAndSortWithinPartitions(partitioner);
  }
  if (shuffleSerializer != null) {
    // Both branches produce a ShuffledRDD underneath; install the custom
    // shuffle serializer on it when one is configured.
    if (rdd.rdd() instanceof ShuffledRDD) {
      ((ShuffledRDD) rdd.rdd()).setSerializer(shuffleSerializer);
    }
  }
  return rdd;
}
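The totalOrder branch in the two Hive shufflers above chooses between a global sort and a partition-local one. A hedged sketch of the difference (illustrative data and class name, local master):

import java.util.Arrays;

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TotalOrderContrast {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("total-order").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>(3, "c"), new Tuple2<>(1, "a"),
          new Tuple2<>(4, "d"), new Tuple2<>(2, "b")));

      // sortByKey range-partitions: partitions are ordered relative to each
      // other, so concatenating them yields a total order.
      System.out.println(pairs.sortByKey(true, 2).glom().collect());

      // repartitionAndSortWithinPartitions hash-partitions: keys are sorted
      // only inside each partition, with no ordering across partitions.
      System.out.println(
          pairs.repartitionAndSortWithinPartitions(new HashPartitioner(2)).glom().collect());
    }
  }
}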
JavaPairRDD<Integer, Integer> repartitioned =
    rdd.repartitionAndSortWithinPartitions(partitioner);
List<List<Tuple2<Integer, Integer>>> partitions = repartitioned.glom().collect();
= valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions),
    TupleComparatorDescending.INSTANCE);
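The fragment above uses the two-argument overload, which takes an explicit Comparator for the keys. A minimal sketch of that overload with a descending comparator (the Descending class is illustrative, not the TupleComparatorDescending from the fragment):

import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class DescendingSortExample {
  // Spark ships the comparator to executors, so it must be Serializable.
  static class Descending implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer a, Integer b) {
      return b.compareTo(a);
    }
  }

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("desc-sort").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>(1, "a"), new Tuple2<>(3, "c"), new Tuple2<>(2, "b")));

      // Keys come out in descending order within each partition.
      JavaPairRDD<Integer, String> sorted = pairs.repartitionAndSortWithinPartitions(
          new HashPartitioner(2), new Descending());
      System.out.println(sorted.glom().collect());
    }
  }
}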
@Override
protected void prepareKeyValues(final ImportKeyValueJavaPairRDDToAccumulo operation,
    final AccumuloKeyRangePartitioner partitioner) throws OperationException {
  // Partition and sort the key-value pairs to match the table's key ranges,
  // then write files suitable for Accumulo bulk import.
  final JavaPairRDD<Key, Value> rdd =
      operation.getInput().repartitionAndSortWithinPartitions(partitioner);
  rdd.saveAsNewAPIHadoopFile(operation.getOutputPath(), Key.class, Value.class,
      AccumuloFileOutputFormat.class, getConfiguration(operation));
}
public static RDD<Tuple> handleSecondarySort(
    RDD<Tuple2<IndexedKey, Tuple>> rdd, POPackage pkgOp) {
  JavaPairRDD<IndexedKey, Tuple> pairRDD = JavaPairRDD.fromRDD(rdd,
      SparkUtil.getManifest(IndexedKey.class),
      SparkUtil.getManifest(Tuple.class));

  int partitionNums = pairRDD.partitions().size();
  // Repartition so that tuples with the same IndexedKey land in the same partition.
  JavaPairRDD<IndexedKey, Tuple> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new IndexedKeyPartitioner(partitionNums));
  // Package tuples with the same IndexedKey as the result: (key, (val1, val2, val3, ...)).
  return sorted.mapPartitions(new AccumulateByKey(pkgOp), true).rdd();
}
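IndexedKeyPartitioner above (like HFilePartitioner and AccumuloKeyRangePartitioner elsewhere in this list) is a custom org.apache.spark.Partitioner. A minimal sketch of the shape such a class takes; the name and hashing scheme are assumptions, not the Pig implementation:

import org.apache.spark.Partitioner;

// Hypothetical partitioner: routes each key to a partition by its hash code,
// mirroring the role IndexedKeyPartitioner plays above.
public class FieldHashPartitioner extends Partitioner {
  private final int numPartitions;

  public FieldHashPartitioner(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    // Non-negative modulo so negative hash codes still map into range.
    int h = key.hashCode() % numPartitions;
    return h < 0 ? h + numPartitions : h;
  }
}

Spark also compares partitioners with equals to decide whether a dataset is already partitioned the right way, so overriding equals and hashCode on a custom partitioner can avoid unnecessary shuffles.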
@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  JavaPairRDD<HiveKey, BytesWritable> rdd;
  if (totalOrder) {
    if (numPartitions > 0) {
      rdd = input.sortByKey(true, numPartitions);
    } else {
      rdd = input.sortByKey(true);
    }
  } else {
    Partitioner partitioner = new HashPartitioner(numPartitions);
    rdd = input.repartitionAndSortWithinPartitions(partitioner);
  }
  return rdd.mapPartitionsToPair(new ShuffleFunction());
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));

  // First sort the tuples by secondary key, since useSecondaryKey sort is enabled.
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));

  // First sort the tuples by secondary key, since useSecondaryKey sort is enabled.
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}
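Both handleSecondarySort variants implement the usual secondary-sort recipe: move the whole record into the key with a null placeholder value, repartition and sort with a composite-key comparator, then keep only the keys. A stripped-down sketch of that recipe with plain tuples; every name here is illustrative rather than Pig's:

import java.io.Serializable;
import java.util.Comparator;

import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public class SecondarySortSketch {

  // Orders composite keys by primary field ascending, then secondary descending.
  // Comparators are shipped to executors, so Serializable is required.
  static class CompositeComparator
      implements Comparator<Tuple2<String, Integer>>, Serializable {
    @Override
    public int compare(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
      int byPrimary = a._1.compareTo(b._1);
      return byPrimary != 0 ? byPrimary : b._2.compareTo(a._2);
    }
  }

  // Routes by the primary field only, so all records sharing a primary key
  // land in the same partition regardless of the secondary field.
  static class PrimaryFieldPartitioner extends Partitioner {
    private final int numPartitions;

    PrimaryFieldPartitioner(int numPartitions) {
      this.numPartitions = numPartitions;
    }

    @Override
    public int numPartitions() {
      return numPartitions;
    }

    @Override
    public int getPartition(Object key) {
      @SuppressWarnings("unchecked")
      Tuple2<String, Integer> k = (Tuple2<String, Integer>) key;
      int h = k._1.hashCode() % numPartitions;
      return h < 0 ? h + numPartitions : h;
    }
  }

  static JavaRDD<Tuple2<String, Integer>> secondarySort(
      JavaPairRDD<String, Integer> input, int parallelism) {
    // Fold the whole record into the key with a null placeholder value,
    // mirroring the ToKeyNullValueFunction step above.
    JavaPairRDD<Tuple2<String, Integer>, Void> keyed =
        input.mapToPair(t -> new Tuple2<Tuple2<String, Integer>, Void>(t, null));
    JavaPairRDD<Tuple2<String, Integer>, Void> sorted =
        keyed.repartitionAndSortWithinPartitions(
            new PrimaryFieldPartitioner(parallelism), new CompositeComparator());
    // Discard the placeholders, as sorted.keys() does in the Pig code.
    return sorted.keys();
  }
}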
.union(rightPair)
.setName(operator.getName() + "::union-inputs")
.repartitionAndSortWithinPartitions(partitioner, comparator)
.setName(operator.getName() + "::sort-by-key-and-side")
.mapPartitions(
.repartitionAndSortWithinPartitions(groupingPartitioner, comparator)
.setName(operator.getName() + "::sort");
.mapToPair(t -> new Tuple2<>(new KeyedWindowValue<>(t._1, t._2), Empty.get()))
.setName(operator.getName() + "::create-composite-key")
.repartitionAndSortWithinPartitions(
    partitioner, new SecondarySortComparator<>(operator.getValueComparator()))
.setName(operator.getName() + "::secondary-sort")
.repartitionAndSortWithinPartitions(partitioner)
.setName(operator.getName() + "::sort")
.mapPartitionsToPair(ReduceByKeyIterator::new)