@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  if (numPartitions > 0) {
    return input.groupByKey(numPartitions);
  }
  return input.groupByKey();
}
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true,
                                            allData.partitions().size());

  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
    String user = datum[0];
    String item = datum[1];
    Boolean delete = datum[2].isEmpty();
    return knownItems ?
        new Tuple2<>(user, new Tuple2<>(item, delete)) :
        new Tuple2<>(item, new Tuple2<>(user, delete));
  });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return tuples.groupByKey().mapValues(idDeletes -> {
    Collection<String> ids = new HashSet<>();
    for (Tuple2<String,Boolean> idDelete : idDeletes) {
      if (idDelete._2()) {
        ids.remove(idDelete._1());
      } else {
        ids.add(idDelete._1());
      }
    }
    return ids;
  });
}
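The TODO above exists because add/delete events for the same key must be applied in timestamp order, which rules out order-insensitive combiners. A hypothetical illustration (jsc is an assumed JavaSparkContext; the data is made up):

// Hypothetical input showing why ordering matters. Rows are
// {user, item, value, timestamp}; an empty value field marks a delete.
JavaRDD<String[]> events = jsc.parallelize(Arrays.<String[]>asList(
    new String[] {"u1", "item1", "1", "100"},   // t=100: add item1
    new String[] {"u1", "item1", "",  "200"},   // t=200: delete item1
    new String[] {"u1", "item1", "1", "300"}    // t=300: re-add item1
));
// knownsRDD(events, true) should yield ("u1", {"item1"}). An order-insensitive
// combiner could apply the t=200 delete last and wrongly yield ("u1", {}),
// which is why the snippet sorts by the timestamp field datum[3] first.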
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
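An illustrative consumer of fetchClusteredPoints (assumed, not from the original source), counting assignments per cluster with Guava's Iterables:

// Hypothetical usage: how many points were assigned to each cluster ID.
Map<Integer, Long> pointsPerCluster = fetchClusteredPoints(evalData)
    .mapValues(points -> (long) Iterables.size(points))
    .collectAsMap();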
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
if (model.isImplicit()) {
  aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
} else {
        new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));
JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
    new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(rating ->
      new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }
  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));
  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
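For reference, a minimal sketch of the aggregateByKey alternative the TODO gestures at, usable inside aggregateScores in place of the groupByKey line. It assumes MLFunctions.SUM_WITH_NAN does nothing beyond summing, relying on the NaN delete marker propagating through double addition; whether that assumption holds is exactly what the TODO questions.

// Hypothetical replacement for tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN):
// sum per key without materializing each key's values. Java double addition
// propagates NaN (NaN + x == NaN), so a NaN "delete" marker survives the fold,
// assuming SUM_WITH_NAN has no other semantics.
JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregatedAlt =
    tuples.aggregateByKey(
        0.0,
        (sum, value) -> sum + value,    // merge a value into a partition-local sum
        (sum1, sum2) -> sum1 + sum2);   // merge partial sums across partitions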
positiveUserProducts.groupByKey().flatMapToPair(
    new PairFlatMapFunction<Tuple2<Integer,Iterable<Integer>>,Integer,Integer>() {
      private final RandomGenerator random = RandomManager.getRandom();
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);

  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts =
      // input: copa scores for all studies
      // output: (rankedProduct, N)
      groupedByGeneRDD.mapValues((Iterable<Long> values) -> {
        int N = 0;
        long products = 1;
        for (Long v : values) {
          products *= v;
          N++;
        }
        double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
        return new Tuple2<Double, Integer>(rankedProduct, N);
      });

  return rankedProducts;
}
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);

  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
      new Function<
          Iterable<Long>,          // input: copa scores for all studies
          Tuple2<Double, Integer>  // output: (rankedProduct, N)
      >() {
        @Override
        public Tuple2<Double, Integer> call(Iterable<Long> values) {
          int N = 0;
          long products = 1;
          for (Long v : values) {
            products *= v;
            N++;
          }
          double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
          return new Tuple2<Double, Integer>(rankedProduct, N);
        }
      });

  return rankedProducts;
}
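A hypothetical driver for computeRankedProducts, called from the same class; the gene names and ranks are illustrative only:

// Two per-study rank RDDs (gene -> rank), as computeRankedProducts expects.
JavaSparkContext context = new JavaSparkContext("local[*]", "ranked-products");
@SuppressWarnings("unchecked")
JavaPairRDD<String, Long>[] ranks = new JavaPairRDD[] {
    context.parallelizePairs(Arrays.asList(
        new Tuple2<>("geneA", 1L), new Tuple2<>("geneB", 2L))),
    context.parallelizePairs(Arrays.asList(
        new Tuple2<>("geneA", 3L), new Tuple2<>("geneB", 4L)))
};
// geneA -> (pow(1*3, 1/2), 2) = (1.73..., 2); geneB -> (pow(2*4, 1/2), 2) = (2.82..., 2)
computeRankedProducts(context, ranks).collect().forEach(System.out::println);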
JavaPairRDD<String, Iterable<Double>> groupedByGene = genes.groupByKey();
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
    final JavaPairRDD<K, V> mapOrCombineRDD,
    final MapReduce<K, V, OK, OV, ?> mapReduce,
    final Configuration graphComputerConfiguration) {
  JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
    KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
    return new ReduceIterator<>(
        MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(
            HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration),
        partitionIterator);
  });
  if (mapReduce.getReduceKeySort().isPresent())
    reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
  return reduceRDD;
}
JavaPairRDD<String, Iterable<String>> anagrams = rdd.groupByKey();
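A minimal sketch of how the pair RDD feeding this groupByKey is typically built (assumed setup; sc is a JavaSparkContext and the words are illustrative): key each word by its sorted letters, so that anagrams share a key.

JavaRDD<String> words = sc.parallelize(Arrays.asList("listen", "silent", "enlist", "google"));
JavaPairRDD<String, String> rdd = words.mapToPair(word -> {
  char[] letters = word.toCharArray();
  Arrays.sort(letters);                      // "listen" -> "eilnst"
  return new Tuple2<>(new String(letters), word);
});
// groupByKey then yields ("eilnst", ["listen", "silent", "enlist"]) and ("eggloo", ["google"])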
JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
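A minimal sketch of what the getNewCentroids helper might do (assumed, not from the original source; Vector/Vectors taken to be org.apache.spark.mllib.linalg): average the vectors assigned to each cluster ID.

static Map<Integer, Vector> getNewCentroids(JavaPairRDD<Integer, Iterable<Vector>> pointsGroup) {
  return pointsGroup.mapValues(points -> {
    double[] sum = null;
    int count = 0;
    for (Vector point : points) {
      double[] arr = point.toArray();
      if (sum == null) {
        sum = new double[arr.length];  // dimension taken from the first point
      }
      for (int i = 0; i < arr.length; i++) {
        sum[i] += arr[i];
      }
      count++;
    }
    for (int i = 0; i < sum.length; i++) {
      sum[i] /= count;                 // component-wise mean
    }
    return Vectors.dense(sum);
  }).collectAsMap();
}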
JavaPairRDD<String, Iterable<Tuple2<Integer,String>>> groupedByWord = wordAsKey.groupByKey();
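A plausible construction of wordAsKey, inferred from its value type of (line number, file name): emit one pair per word occurrence so that groupByKey builds an inverted index. The names and the single-file setup are illustrative only, and Spark 2.x semantics are assumed (flatMapToPair returns an Iterator).

String fileName = "input.txt";  // hypothetical single-file case
JavaPairRDD<String, Tuple2<Integer, String>> wordAsKey =
    sc.textFile(fileName)
      .zipWithIndex()  // (line text, 0-based line number)
      .flatMapToPair(lineAndNumber -> {
        List<Tuple2<String, Tuple2<Integer, String>>> out = new ArrayList<>();
        int lineNumber = lineAndNumber._2().intValue() + 1;
        for (String word : lineAndNumber._1().split("\\s+")) {
          out.add(new Tuple2<>(word, new Tuple2<>(lineNumber, fileName)));
        }
        return out.iterator();
      });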