@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  if (numPartitions > 0) {
    return input.groupByKey(numPartitions);
  }
  return input.groupByKey();
}
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true,
                                            allData.partitions().size());

  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
    String user = datum[0];
    String item = datum[1];
    Boolean delete = datum[2].isEmpty();
    return knownItems ?
        new Tuple2<>(user, new Tuple2<>(item, delete)) :
        new Tuple2<>(item, new Tuple2<>(user, delete));
  });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return tuples.groupByKey().mapValues(idDeletes -> {
    Collection<String> ids = new HashSet<>();
    for (Tuple2<String,Boolean> idDelete : idDeletes) {
      if (idDelete._2()) {
        ids.remove(idDelete._1());
      } else {
        ids.add(idDelete._1());
      }
    }
    return ids;
  });
}
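The TODO above exists because add/delete events for the same key must be applied in timestamp order, which rules out order-insensitive combiners. A hypothetical illustration (jsc is an assumed JavaSparkContext; the data is made up):

// Hypothetical input showing why ordering matters. Rows are
// {user, item, value, timestamp}; an empty value field marks a delete.
JavaRDD<String[]> events = jsc.parallelize(Arrays.<String[]>asList(
    new String[] {"u1", "item1", "1", "100"},   // t=100: add item1
    new String[] {"u1", "item1", "",  "200"},   // t=200: delete item1
    new String[] {"u1", "item1", "1", "300"}    // t=300: re-add item1
));
// knownsRDD(events, true) should yield ("u1", {"item1"}). An order-insensitive
// combiner could apply the t=200 delete last and wrongly yield ("u1", {}),
// which is why the snippet sorts by the timestamp field datum[3] first.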
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
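An illustrative consumer of fetchClusteredPoints (assumed, not from the original source), counting assignments per cluster with Guava's Iterables:

// Hypothetical usage: how many points were assigned to each cluster ID.
Map<Integer, Long> pointsPerCluster = fetchClusteredPoints(evalData)
    .mapValues(points -> (long) Iterables.size(points))
    .collectAsMap();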
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
if (model.isImplicit()) {
  aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
} else {
        new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));
JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
    new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(rating ->
      new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }
  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));
  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
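For reference, a minimal sketch of the aggregateByKey alternative the TODO gestures at, usable inside aggregateScores in place of the groupByKey line. It assumes MLFunctions.SUM_WITH_NAN does nothing beyond summing, relying on the NaN delete marker propagating through double addition; whether that assumption holds is exactly what the TODO questions.

// Hypothetical replacement for tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN):
// sum per key without materializing each key's values. Java double addition
// propagates NaN (NaN + x == NaN), so a NaN "delete" marker survives the fold,
// assuming SUM_WITH_NAN has no other semantics.
JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregatedAlt =
    tuples.aggregateByKey(
        0.0,
        (sum, value) -> sum + value,    // merge a value into a partition-local sum
        (sum1, sum2) -> sum1 + sum2);   // merge partial sums across partitions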
positiveUserProducts.groupByKey().flatMapToPair(
    new PairFlatMapFunction<Tuple2<Integer,Iterable<Integer>>,Integer,Integer>() {
      private final RandomGenerator random = RandomManager.getRandom();
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);

  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts =
      // input: copa scores for all studies
      // output: (rankedProduct, N)
      groupedByGeneRDD.mapValues((Iterable<Long> values) -> {
        int N = 0;
        long products = 1;
        for (Long v : values) {
          products *= v;
          N++;
        }
        double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
        return new Tuple2<Double, Integer>(rankedProduct, N);
      });

  return rankedProducts;
}
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);

  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
      new Function<
          Iterable<Long>,          // input: copa scores for all studies
          Tuple2<Double, Integer>  // output: (rankedProduct, N)
      >() {
        @Override
        public Tuple2<Double, Integer> call(Iterable<Long> values) {
          int N = 0;
          long products = 1;
          for (Long v : values) {
            products *= v;
            N++;
          }
          double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
          return new Tuple2<Double, Integer>(rankedProduct, N);
        }
      });

  return rankedProducts;
}
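A hypothetical driver for computeRankedProducts, called from the same class; the gene names and ranks are illustrative only:

// Two per-study rank RDDs (gene -> rank), as computeRankedProducts expects.
JavaSparkContext context = new JavaSparkContext("local[*]", "ranked-products");
@SuppressWarnings("unchecked")
JavaPairRDD<String, Long>[] ranks = new JavaPairRDD[] {
    context.parallelizePairs(Arrays.asList(
        new Tuple2<>("geneA", 1L), new Tuple2<>("geneB", 2L))),
    context.parallelizePairs(Arrays.asList(
        new Tuple2<>("geneA", 3L), new Tuple2<>("geneB", 4L)))
};
// geneA -> (pow(1*3, 1/2), 2) = (1.73..., 2); geneB -> (pow(2*4, 1/2), 2) = (2.82..., 2)
computeRankedProducts(context, ranks).collect().forEach(System.out::println);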
JavaPairRDD<String, Iterable<Double>> groupedByGene = genes.groupByKey();
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
    final JavaPairRDD<K, V> mapOrCombineRDD,
    final MapReduce<K, V, OK, OV, ?> mapReduce,
    final Configuration graphComputerConfiguration) {
  JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
    KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
    return new ReduceIterator<>(
        MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(
            HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration),
        partitionIterator);
  });
  if (mapReduce.getReduceKeySort().isPresent())
    reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
  return reduceRDD;
}
JavaPairRDD<String, Iterable<String>> anagrams = rdd.groupByKey();
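A minimal sketch of how the pair RDD feeding this groupByKey is typically built (assumed setup; sc is a JavaSparkContext and the words are illustrative): key each word by its sorted letters, so that anagrams share a key.

JavaRDD<String> words = sc.parallelize(Arrays.asList("listen", "silent", "enlist", "google"));
JavaPairRDD<String, String> rdd = words.mapToPair(word -> {
  char[] letters = word.toCharArray();
  Arrays.sort(letters);                      // "listen" -> "eilnst"
  return new Tuple2<>(new String(letters), word);
});
// groupByKey then yields ("eilnst", ["listen", "silent", "enlist"]) and ("eggloo", ["google"])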
JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
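A minimal sketch of what the getNewCentroids helper might do (assumed, not from the original source; Vector/Vectors taken to be org.apache.spark.mllib.linalg): average the vectors assigned to each cluster ID.

static Map<Integer, Vector> getNewCentroids(JavaPairRDD<Integer, Iterable<Vector>> pointsGroup) {
  return pointsGroup.mapValues(points -> {
    double[] sum = null;
    int count = 0;
    for (Vector point : points) {
      double[] arr = point.toArray();
      if (sum == null) {
        sum = new double[arr.length];  // dimension taken from the first point
      }
      for (int i = 0; i < arr.length; i++) {
        sum[i] += arr[i];
      }
      count++;
    }
    for (int i = 0; i < sum.length; i++) {
      sum[i] /= count;                 // component-wise mean
    }
    return Vectors.dense(sum);
  }).collectAsMap();
}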
JavaPairRDD<String, Iterable<Tuple2<Integer,String>>> groupedByWord = wordAsKey.groupByKey();
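A plausible construction of wordAsKey, inferred from its value type of (line number, file name): emit one pair per word occurrence so that groupByKey builds an inverted index. The names and the single-file setup are illustrative only, and Spark 2.x semantics are assumed (flatMapToPair returns an Iterator).

String fileName = "input.txt";  // hypothetical single-file case
JavaPairRDD<String, Tuple2<Integer, String>> wordAsKey =
    sc.textFile(fileName)
      .zipWithIndex()  // (line text, 0-based line number)
      .flatMapToPair(lineAndNumber -> {
        List<Tuple2<String, Tuple2<Integer, String>>> out = new ArrayList<>();
        int lineNumber = lineAndNumber._2().intValue() + 1;
        for (String word : lineAndNumber._1().split("\\s+")) {
          out.add(new Tuple2<>(word, new Tuple2<>(lineNumber, fileName)));
        }
        return out.iterator();
      });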