Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
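For context, a minimal end-to-end sketch of the same read, assuming an existing SQLContext named sqlContext (Spark 2.x) and the people.parquet file from above; the temp-view name is an assumption:

Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
// Register the result as a temp view so it can be queried with SQL.
parquetFile.createOrReplaceTempView("people");
sqlContext.sql("SELECT * FROM people").show();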
private Dataset<Row> readParquet(String path) {
    LOG.debug("Reading Parquet: {}", path);
    return Contexts.getSparkSession().read().parquet(path);
}
@Override
public Dataset<Row> parquet(final String path) {
    final boolean userTriggered = initializeFunction(path);
    final Dataset<Row> result = Dataset.from(super.parquet(path));
    this.setIsUserTriggered(userTriggered);
    return result;
}

@Override
public Dataset<Row> parquet(final String... paths) {
    final boolean userTriggered = initializeFunction(paths);
    final Dataset<Row> result = Dataset.from(super.parquet(paths));
    this.setIsUserTriggered(userTriggered);
    return result;
}

@Override
public Dataset<Row> parquet(final scala.collection.Seq<String> paths) {
    final boolean userTriggered = initializeFunction(paths);
    final Dataset<Row> result = Dataset.from(super.parquet(paths));
    this.setIsUserTriggered(userTriggered);
    return result;
}
public static Dataset<Row> loadFile(String inputFormat, String inputPath, SparkSession spark) {
    if (inputFormat == null || inputFormat.isEmpty() || inputFormat.equalsIgnoreCase("text")) {
        return spark.read().text(inputPath);
    } else if (inputFormat.equalsIgnoreCase("parquet")) {
        return spark.read().parquet(inputPath);
    } else if (inputFormat.equalsIgnoreCase("csv")) {
        return spark.read().option("header", "false").csv(inputPath);
    } else if (inputFormat.equalsIgnoreCase("csv_with_header")) {
        return spark.read().option("header", "true").csv(inputPath);
    } else if (inputFormat.equalsIgnoreCase("json")) {
        return spark.read().json(inputPath);
    } else {
        throw new RuntimeException(String.format("Unsupported inputFormat: %s, %s", inputFormat, inputPath));
    }
}
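A short usage sketch for the loadFile dispatcher above; the session setup and the CSV path are assumptions for illustration:

SparkSession spark = SparkSession.builder().appName("FileLoader").getOrCreate();
// "csv_with_header" selects the header-aware CSV branch of loadFile.
Dataset<Row> df = loadFile("csv_with_header", "/tmp/input.csv", spark);
df.printSchema();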
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists,
     * which corresponds to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
public void run() {
    Dataset<Row> file = sqlContext.read().parquet("hdfs://localhost:20112/khanh/test_parquet/file.parquet");
    Dataset<Row> select = file.select("id", "value_s");
}
/**
 * Obtain all new data written into the Hoodie dataset since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
            .getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath =
            getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        return sqlContext.read()
            .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
            .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commit timestamp: " + lastCommitTime, e);
    }
}
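A hedged sketch of driving readSince, reusing the Hudi meta-client calls that appear later in this section; the base path and the commit-timestamp literal are assumptions:

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true);
HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
// Pull only rows committed after the given timestamp.
Dataset<Row> newRows = readSince(basePath, sqlContext, commitTimeline, "20190101000000");
newRows.show();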
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
    assertSqlContext();
    JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
        index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
    List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
        .map(keyFileTuple -> keyFileTuple._2().get()).collect();
    // Record locations might be the same for multiple keys, so we need a unique list.
    Set<String> uniquePaths = new HashSet<>(paths);
    Dataset<Row> originalDF = sqlContextOpt.get().read()
        .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
    StructType schema = originalDF.schema();
    JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
        HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
            row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
        return new Tuple2<>(key, row);
    });
    // Now filter further, keeping only rows that match the supplied hoodie keys.
    JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
    return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String commitTime) {
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
    if (!commitTimeline.containsInstant(commitInstant)) {
        throw new HoodieException("No commit exists at " + commitTime);
    }
    try {
        HashMap<String, String> paths =
            getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
        System.out.println("Path: " + paths.values());
        return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
            .filter(String.format("%s = '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
    } catch (Exception e) {
        throw new HoodieException("Error reading commit " + commitTime, e);
    }
}
/**
 * Reads the paths under a hoodie dataset out as a DataFrame.
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext,
    FileSystem fs, String... paths) {
    List<String> filteredPaths = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
        for (String path : paths) {
            TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
                metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(),
                fs.globStatus(new Path(path)));
            List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
            for (HoodieDataFile file : latestFiles) {
                filteredPaths.add(file.getPath());
            }
        }
        return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
    }
}
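A hedged usage sketch of the read-optimized read above; the JavaSparkContext, base path, and partition globs are assumptions:

FileSystem fs = FileSystem.get(jsc.hadoopConfiguration());
// Hypothetical partition globs under the dataset's base path.
Dataset<Row> latest = read(jsc, basePath, sqlContext, fs,
    basePath + "/2019/01/*", basePath + "/2019/02/*");
latest.show();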
public void testWriteUtilTable(@NonNull final Class type, @NonNull final JavaRDD utilRecords,
    @NotEmpty final String destFolder, final boolean isDatePartitioned) throws IOException {
    final String basePath = FileTestUtil.getTempFolder();
    final Path destPath = new Path(basePath, destFolder);
    final UtilTable utilTable = new UtilTable(type, utilRecords, destPath, isDatePartitioned, spark.get());
    final Path destWritePath = utilTable.getDestWritePath();
    assertEquals(Long.valueOf(10), utilTable.size());
    assertFalse(this.fileSystem.get().exists(destPath));
    utilTable.show();
    utilTable.writeParquet();
    final FileStatus[] destPathChildren = this.fileSystem.get().listStatus(destWritePath);
    log.debug("Destination folder content:");
    Stream.of(destPathChildren).forEach(f -> log.debug(f.getPath().toString()));
    assertTrue(this.fileSystem.get().exists(destWritePath));
    assertTrue(destPathChildren.length > 0);
    final Dataset<Row> outputDataset = spark.get().read().parquet(destWritePath.toString());
    log.debug("Output dataset content:");
    outputDataset.show();
    final List<String> datasetFieldNames = Arrays.asList(outputDataset.schema().fieldNames());
    final List<String> requiredFieldNames =
        Arrays.asList("application_id", "job_name", "job_start_timestamp", "timestamp");
    assertTrue(datasetFieldNames.containsAll(requiredFieldNames));
}
public void compact(String inputPath, String outputPath) throws IOException {
    this.setCompressionAndSerializationOptions(inputPath, outputPath);
    this.outputCompressionProperties(this.outputCompression);
    // Define a Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro.
        // Note that when using compression, the compression codec extension is not added to the file name.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
public void compact(String[] args) throws IOException {
    this.setCompressionAndSerializationOptions(this.parseCli(args));
    this.outputCompressionProperties(this.outputCompression);
    // Define a Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(this.inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(this.outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(this.inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(this.outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(this.inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(this.outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
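A brief hedged driver for the two-argument overload above; the Compaction class name and both HDFS paths are assumptions:

// Hypothetical invocation; the compaction source and target paths are illustrative only.
Compaction job = new Compaction();
job.compact("hdfs:///data/raw/events", "hdfs:///data/compacted/events");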