/**
 * Saves a small set of (word, count) pairs as a Hadoop SequenceFile.
 * Usage: BasicSaveSequenceFile [sparkMaster] [output]
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];
  // Fixed copy-pasted app name: this is the *save* example, not the load one.
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicsavesequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  // Parameterized types instead of raw ArrayList/Tuple2 (avoids unchecked warnings).
  List<Tuple2<String, Integer>> input = new ArrayList<>();
  input.add(new Tuple2<>("coffee", 1));
  input.add(new Tuple2<>("coffee", 2));
  input.add(new Tuple2<>("pandas", 3));
  JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
  // Convert native Java key/value types to Hadoop Writable types before saving.
  JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
  result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
}
/**
 * Loads a SequenceFile of (Text, IntWritable) records, converts them to
 * native (String, Integer) pairs, and prints each record on the driver.
 * Usage: BasicLoadSequenceFile [sparkMaster] [input]
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
  }
  String master = args[0];
  String fileName = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  // Read the file as Writable key/value pairs, then map to native Java types.
  JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
  JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
  for (Tuple2<String, Integer> record : result.collect()) {
    System.out.println(record);
  }
}
}
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
    JavaPairRDD<WritableComparable, Writable> input) {
  // MapInput is a source node: it produces the Hadoop RDD itself and must
  // never be handed an upstream RDD, so a non-null input is a caller error.
  Preconditions.checkArgument(input == null,
      "AssertionError: MapInput doesn't take any input");
  JavaPairRDD<WritableComparable, Writable> result = hadoopRDD;
  if (toCache) {
    // Copy each record before caching — presumably because Hadoop reuses its
    // Writable instances (TODO confirm against CopyFunction) — register the
    // cached RDD id with the plan, then persist at MEMORY_AND_DISK.
    result = hadoopRDD.mapToPair(new CopyFunction());
    sparkPlan.addCachedRDDId(result.id());
    result = result.persist(StorageLevel.MEMORY_AND_DISK());
  }
  result.setName(this.name);
  return result;
}
/**
 * Converts a Java pair RDD of (string ID, float[] feature vector) into a Scala
 * {@code RDD} of (integer index, double[]) for ALS: each ID is translated to
 * its index via the broadcast mapping and the features are widened to doubles.
 * The returned RDD is persisted at MEMORY_AND_DISK.
 */
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
    JavaPairRDD<String,float[]> javaRDD,
    Broadcast<? extends Map<String,Integer>> bIdToIndex) {
  RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
      // Replace the string ID with its integer index from the broadcast map.
      // NOTE(review): an ID absent from the map would produce a null key —
      // presumably callers guarantee every ID is present; confirm upstream.
      new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
  ).mapValues(f -> {
    // Widen the float features to double element-by-element.
    double[] d = new double[f.length];
    for (int i = 0; i < d.length; i++) {
      d[i] = f[i];
    }
    return d;
  }
  ).rdd();
  // This mimics the persistence level established by ALS training methods
  scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());
  // Viewing Integer keys as Object keys is safe after erasure; the double cast
  // through RDD<?> sidesteps an inconvertible-types compile error.
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
  return objKeyRDD;
}
// Removed the unnecessary @SuppressWarnings("unchecked"): the lambda-based
// mapToPair calls below produce no unchecked warnings (the annotation was a
// leftover from anonymous PairFunction implementations).
@Test
public void mapOnPairRDD() {
  JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  // Key each integer by itself, with its parity as the value.
  JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
  // Swap key and value so the parity becomes the key.
  JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
  assertEquals(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4)),
      rdd3.collect());
}
// Removed the unnecessary @SuppressWarnings("unchecked"): the lambda-based
// mapToPair calls below produce no unchecked warnings (the annotation was a
// leftover from anonymous PairFunction implementations).
@Test
public void mapOnPairRDD() {
  JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  // Key each integer by itself, with its parity as the value.
  JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
  // Swap key and value so the parity becomes the key.
  JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
  assertEquals(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4)),
      rdd3.collect());
}
// Removed the unnecessary @SuppressWarnings("unchecked"): the lambda-based
// mapToPair calls below produce no unchecked warnings (the annotation was a
// leftover from anonymous PairFunction implementations).
@Test
public void mapOnPairRDD() {
  JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  // Key each integer by itself, with its parity as the value.
  JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
  // Swap key and value so the parity becomes the key.
  JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
  assertEquals(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4)),
      rdd3.collect());
}
@Test
public void mapOnPairRDD() {
  // Start from a plain RDD of ints and key each element by its parity.
  JavaRDD<Integer> ints = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Integer> byValue = ints.mapToPair(n -> new Tuple2<>(n, n % 2));
  // Swap so the parity becomes the key and the original value the payload.
  JavaPairRDD<Integer, Integer> byParity = byValue.mapToPair(p -> new Tuple2<>(p._2(), p._1()));
  List<Tuple2<Integer, Integer>> expected = Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4));
  Assert.assertEquals(expected, byParity.collect());
}
@Test
public void mapOnPairRDD() {
  // Start from a plain RDD of ints and key each element by its parity.
  JavaRDD<Integer> ints = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Integer> byValue = ints.mapToPair(n -> new Tuple2<>(n, n % 2));
  // Swap so the parity becomes the key and the original value the payload.
  JavaPairRDD<Integer, Integer> byParity = byValue.mapToPair(p -> new Tuple2<>(p._2(), p._1()));
  List<Tuple2<Integer, Integer>> expected = Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4));
  Assert.assertEquals(expected, byParity.collect());
}
@Test
public void mapOnPairRDD() {
  // Start from a plain RDD of ints and key each element by its parity.
  JavaRDD<Integer> ints = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Integer> byValue = ints.mapToPair(n -> new Tuple2<>(n, n % 2));
  // Swap so the parity becomes the key and the original value the payload.
  JavaPairRDD<Integer, Integer> byParity = byValue.mapToPair(p -> new Tuple2<>(p._2(), p._1()));
  List<Tuple2<Integer, Integer>> expected = Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(0, 2),
      new Tuple2<>(1, 3),
      new Tuple2<>(0, 4));
  Assert.assertEquals(expected, byParity.collect());
}
// Round-trips pairs through a SequenceFile using the old (mapred) Hadoop API.
// The unchecked suppression is needed for the raw input-format class literal.
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
  String outputDir = new File(tempDir, "output").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Write the pairs out as Writable key/value types.
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output =
      sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
// Same round-trip as hadoopFile, but writes with DefaultCodec compression.
// The unchecked suppression is needed for the raw input-format class literal.
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
  String outputDir = new File(tempDir, "output_compressed").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Write the pairs out compressed, as Writable key/value types.
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsHadoopFile(outputDir, IntWritable.class, Text.class,
          SequenceFileOutputFormat.class, DefaultCodec.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output =
      sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
// Round-trips pairs through a SequenceFile using the old (mapred) Hadoop API.
// The unchecked suppression is needed for the raw input-format class literal.
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
  String outputDir = new File(tempDir, "output").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Write the pairs out as Writable key/value types.
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output =
      sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
// Writes with the new (mapreduce) Hadoop API, reads back via sequenceFile.
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
  String outputDir = new File(tempDir, "output").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Save using the new-API SequenceFileOutputFormat (fully qualified to avoid
  // clashing with the old-API class of the same name).
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class,
          org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
// Same round-trip as hadoopFile, but writes with DefaultCodec compression.
// The unchecked suppression is needed for the raw input-format class literal.
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
  String outputDir = new File(tempDir, "output_compressed").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Write the pairs out compressed, as Writable key/value types.
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsHadoopFile(outputDir, IntWritable.class, Text.class,
          SequenceFileOutputFormat.class, DefaultCodec.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output =
      sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
// Round-trips pairs through a SequenceFile using the old (mapred) Hadoop API.
// The unchecked suppression is needed for the raw input-format class literal.
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
  String outputDir = new File(tempDir, "output").getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  // Write the pairs out as Writable key/value types.
  sc.parallelizePairs(pairs)
      .mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
      .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
  // Read back and compare the string renderings of both sides.
  JavaPairRDD<IntWritable, Text> output =
      sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
  assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
@SuppressWarnings("unchecked") @Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair( item -> Collections.singletonList(item.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.mapToPair(Tuple2::swap).collect(); }
@SuppressWarnings("unchecked") @Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair( item -> Collections.singletonList(item.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.mapToPair(Tuple2::swap).collect(); }
@SuppressWarnings("unchecked") @Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair( item -> Collections.singletonList(item.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.mapToPair(Tuple2::swap).collect(); }
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
    JavaPairRDD<WritableComparable, Writable> input) {
  // MapInput is a source node: it produces the Hadoop RDD itself and must
  // never be handed an upstream RDD, so a non-null input is a caller error.
  Preconditions.checkArgument(input == null,
      "AssertionError: MapInput doesn't take any input");
  JavaPairRDD<WritableComparable, Writable> result = hadoopRDD;
  if (toCache) {
    // Copy each record before caching — presumably because Hadoop reuses its
    // Writable instances (TODO confirm against CopyFunction) — register the
    // cached RDD id with the plan, then persist at MEMORY_AND_DISK.
    result = hadoopRDD.mapToPair(new CopyFunction());
    sparkPlan.addCachedRDDId(result.id());
    result = result.persist(StorageLevel.MEMORY_AND_DISK());
  }
  return result;
}