@SuppressWarnings("unchecked") @Test public void readWithNewAPIHadoopFile() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.newAPIHadoopFile(outputDir, org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class, IntWritable.class, Text.class, Job.getInstance().getConfiguration()); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
@SuppressWarnings("unchecked") @Test public void readWithNewAPIHadoopFile() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.newAPIHadoopFile(outputDir, org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class, IntWritable.class, Text.class, Job.getInstance().getConfiguration()); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
@SuppressWarnings("unchecked") @Test public void readWithNewAPIHadoopFile() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.newAPIHadoopFile(outputDir, org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class, IntWritable.class, Text.class, Job.getInstance().getConfiguration()); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
JavaPairRDD<LongWritable, Text> fastqRDD = ctx.newAPIHadoopFile(
    inputPath,
    FastqInputFormat.class,
    LongWritable.class,          // key class, matching the declared RDD type
    Text.class,                  // value class, matching the declared RDD type
    ctx.hadoopConfiguration());  // the original snippet is truncated here; the configuration argument is assumed
SparkConf conf = new SparkConf().setMaster(""); // fill in a master URL here, e.g. "local[*]"
JavaSparkContext jsc = new JavaSparkContext(conf);

// read the content of the file using the new Hadoop input format API
JavaPairRDD<LongWritable, Text> data = jsc.newAPIHadoopFile(
    "file_path",            // input path
    TextInputFormat.class,  // input format class
    LongWritable.class,     // class of the key (byte offset of the line)
    Text.class,             // class of the value (the line itself)
    new Configuration());

JavaRDD<String> mapped = data.map(new Function<Tuple2<LongWritable, Text>, String>() {
  @Override
  public String call(Tuple2<LongWritable, Text> tuple) throws Exception {
    // each line arrives as a tuple (offset, text)
    long pos = tuple._1().get();          // extract the offset
    String line = tuple._2().toString();  // extract the text
    return pos + " " + line;
  }
});
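The same transformation can also be expressed with a Java 8 lambda; a minimal sketch that reuses the data RDD from the snippet above.

// equivalent mapping written as a lambda instead of an anonymous Function
JavaRDD<String> mappedWithLambda =
    data.map(tuple -> tuple._1().get() + " " + tuple._2().toString());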
public class Utils {

  public static <T> JavaPairRDD<String, T> loadAvroFile(JavaSparkContext sc, String avroPath) {
    JavaPairRDD<AvroKey, NullWritable> records = sc.newAPIHadoopFile(avroPath,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sc.hadoopConfiguration());
    return records.keys()
        .map(x -> (GenericRecord) x.datum())
        .mapToPair(pair -> new Tuple2<>((String) pair.get("key"), (T) pair.get("value")));
  }
}
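A minimal call-site sketch for the helper above; the application name, master URL, Avro path, and the Long value type are placeholders, and the records are assumed to carry "key" and "value" fields, as the helper itself expects.

JavaSparkContext sc = new JavaSparkContext(
    new SparkConf().setAppName("avro-load").setMaster("local[*]"));
// hypothetical Avro directory; substitute a real path
JavaPairRDD<String, Long> byKey = Utils.<Long>loadAvroFile(sc, "/path/to/avro/dir");
byKey.take(10).forEach(t -> System.out.println(t._1() + " -> " + t._2()));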
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<Text, FileInfoWritable> rdd = sc.newAPIHadoopFile(inputPath,
    RichFileInputFormat.class, Text.class, FileInfoWritable.class,
    new Configuration());
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
    }

    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(
        fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
        String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
@SuppressWarnings( {"rawtypes", "unchecked"}) private Dataset<Row> readInputFormat(String path) throws Exception { LOG.debug("Reading InputFormat[{}]: {}", inputType, path); Class<? extends InputFormat> typeClazz = Class.forName(inputType).asSubclass(InputFormat.class); Class<?> keyClazz = Class.forName(keyType); Class<?> valueClazz = Class.forName(valueType); @SuppressWarnings("resource") JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext()); JavaPairRDD<?, ?> rdd = context.newAPIHadoopFile(path, typeClazz, keyClazz, valueClazz, new Configuration()); TranslateFunction translateFunction = new TranslateFunction(translatorConfig); return Contexts.getSparkSession().createDataFrame(rdd.flatMap(translateFunction), translateFunction.getSchema()); }
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      return new ImmutablePair<>(Optional.empty(),
          lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
    }

    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(
        fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
        String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
/**
 * Returns a Java RDD filled with records of the specified type (avroRecordClass).
 * The records are read from the Avro datastore directory given by avroDatastorePath.
 */
public <T extends GenericRecord> JavaRDD<T> loadJavaRDD(JavaSparkContext sc, String avroDatastorePath,
    Class<T> avroRecordClass) {
  Preconditions.checkNotNull(sc);
  Preconditions.checkNotNull(avroDatastorePath);
  Preconditions.checkNotNull(avroRecordClass);

  Schema schema = AvroUtils.toSchema(avroRecordClass.getName());
  Job job = getJob(schema);

  @SuppressWarnings("unchecked")
  JavaPairRDD<AvroKey<T>, NullWritable> inputRecords = (JavaPairRDD<AvroKey<T>, NullWritable>) sc.newAPIHadoopFile(
      avroDatastorePath, AvroKeyInputFormat.class, avroRecordClass, NullWritable.class, job.getConfiguration());

  // Hadoop's RecordReader reuses the same Writable object for all records,
  // which may lead to undesired behavior when caching the RDD.
  // Cloning the records solves this problem.
  JavaRDD<T> input = inputRecords.map(tuple -> AvroUtils.cloneAvroRecord(tuple._1.datum()));
  return input;
}
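A minimal call-site sketch for the loader above; the avroDataStore instance, the MyRecord generated Avro class, and the input path are hypothetical placeholders and not part of the original code.

// MyRecord stands in for an Avro-generated class; the path is a placeholder
JavaRDD<MyRecord> records = avroDataStore.loadJavaRDD(sc, "hdfs:///data/avro/my_records", MyRecord.class);
long recordCount = records.count();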
/**
 * ShapefileRDD.
 *
 * @param sparkContext the spark context
 * @param filePath     the file path
 */
public ShapefileRDD(JavaSparkContext sparkContext, String filePath) {
  boundBox = new BoundBox();
  JavaPairRDD<ShapeKey, PrimitiveShape> shapePrimitiveRdd = sparkContext.newAPIHadoopFile(
      filePath,
      ShapeInputFormat.class,
      ShapeKey.class,
      PrimitiveShape.class,
      sparkContext.hadoopConfiguration()
  );
  shapeRDD = shapePrimitiveRdd.map(PrimitiveToShape);
}
@Override
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
      AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
      sparkContext.hadoopConfiguration());
  return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
}
/**
 * Reads and merges the bound boxes of all shapefiles the user passed in; returns null if there are none.
 */
public static BoundBox readBoundBox(JavaSparkContext sc, String inputPath) {
  // read bound boxes into memory
  JavaPairRDD<Long, BoundBox> bounds = sc.newAPIHadoopFile(
      inputPath,
      BoundaryInputFormat.class,
      Long.class,
      BoundBox.class,
      sc.hadoopConfiguration()
  );

  // merge all bound boxes into one
  bounds = bounds.reduceByKey(new Function2<BoundBox, BoundBox, BoundBox>() {
    @Override
    public BoundBox call(BoundBox box1, BoundBox box2) throws Exception {
      return BoundBox.mergeBoundBox(box1, box2);
    }
  });

  // if there is a result, return it as the merged bound box
  if (bounds.count() > 0) {
    return new BoundBox(bounds.collect().get(0)._2());
  } else {
    return null;
  }
}
/**
 * Reads and merges the bound boxes of all shapefiles the user passed in; returns null if there are none.
 */
public BoundBox getBoundBox(JavaSparkContext sc, String inputPath) {
  // read bound boxes into memory
  JavaPairRDD<Long, BoundBox> bounds = sc.newAPIHadoopFile(
      inputPath,
      BoundaryInputFormat.class,
      Long.class,
      BoundBox.class,
      sc.hadoopConfiguration()
  );

  // merge all bound boxes into one
  bounds = bounds.reduceByKey(new Function2<BoundBox, BoundBox, BoundBox>() {
    @Override
    public BoundBox call(BoundBox box1, BoundBox box2) throws Exception {
      return BoundBox.mergeBoundBox(box1, box2);
    }
  });

  // if there is a result, return it as the merged bound box
  if (bounds.count() > 0) {
    return new BoundBox(bounds.collect().get(0)._2());
  } else {
    return null;
  }
}
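With Java 8 lambdas, the reduceByKey step in the two bounding-box readers above can be written more compactly; a sketch, assuming the same static BoundBox.mergeBoundBox helper.

// method reference replacing the anonymous Function2
bounds = bounds.reduceByKey(BoundBox::mergeBoundBox);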