Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
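For context, a minimal end-to-end sketch of the same read, assuming an existing SQLContext named sqlContext (Spark 2.x) and the people.parquet file from above; the temp-view name is an assumption:

Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
// Register the result as a temp view so it can be queried with SQL.
parquetFile.createOrReplaceTempView("people");
sqlContext.sql("SELECT * FROM people").show();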
private Dataset<Row> readParquet(String path) {
    LOG.debug("Reading Parquet: {}", path);
    return Contexts.getSparkSession().read().parquet(path);
}
@Override
public Dataset<Row> parquet(final String path) {
    final boolean userTriggered = initializeFunction(path);
    final Dataset<Row> result = Dataset.from(super.parquet(path));
    this.setIsUserTriggered(userTriggered);
    return result;
}

@Override
public Dataset<Row> parquet(final String... paths) {
    final boolean userTriggered = initializeFunction(paths);
    final Dataset<Row> result = Dataset.from(super.parquet(paths));
    this.setIsUserTriggered(userTriggered);
    return result;
}

@Override
public Dataset<Row> parquet(final scala.collection.Seq<String> paths) {
    final boolean userTriggered = initializeFunction(paths);
    final Dataset<Row> result = Dataset.from(super.parquet(paths));
    this.setIsUserTriggered(userTriggered);
    return result;
}
public static Dataset<Row> loadFile(String inputFormat, String inputPath, SparkSession spark) {
    if (inputFormat == null || inputFormat.isEmpty() || inputFormat.equalsIgnoreCase("text")) {
        return spark.read().text(inputPath);
    } else if (inputFormat.equalsIgnoreCase("parquet")) {
        return spark.read().parquet(inputPath);
    } else if (inputFormat.equalsIgnoreCase("csv")) {
        return spark.read().option("header", "false").csv(inputPath);
    } else if (inputFormat.equalsIgnoreCase("csv_with_header")) {
        return spark.read().option("header", "true").csv(inputPath);
    } else if (inputFormat.equalsIgnoreCase("json")) {
        return spark.read().json(inputPath);
    } else {
        throw new RuntimeException(String.format("Unsupported inputFormat: %s, %s", inputFormat, inputPath));
    }
}
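A short usage sketch for the loadFile dispatcher above; the session setup and the CSV path are assumptions for illustration:

SparkSession spark = SparkSession.builder().appName("FileLoader").getOrCreate();
// "csv_with_header" selects the header-aware CSV branch of loadFile.
Dataset<Row> df = loadFile("csv_with_header", "/tmp/input.csv", spark);
df.printSchema();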
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists,
     * which corresponds to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
public void run() {
    Dataset<Row> file = sqlContext.read().parquet("hdfs://localhost:20112/khanh/test_parquet/file.parquet");
    Dataset<Row> select = file.select("id", "value_s");
}
/**
 * Obtain all new data written into the Hoodie dataset since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
            .getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath =
            getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        return sqlContext.read()
            .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
            .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commit timestamp: " + lastCommitTime, e);
    }
}
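A hedged sketch of driving readSince, reusing the Hudi meta-client calls that appear later in this section; the base path and the commit-timestamp literal are assumptions:

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true);
HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
// Pull only rows committed after the given timestamp.
Dataset<Row> newRows = readSince(basePath, sqlContext, commitTimeline, "20190101000000");
newRows.show();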
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
    assertSqlContext();
    JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
        index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
    List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
        .map(keyFileTuple -> keyFileTuple._2().get()).collect();
    // Record locations might be the same for multiple keys, so we need a unique list.
    Set<String> uniquePaths = new HashSet<>(paths);
    Dataset<Row> originalDF = sqlContextOpt.get().read()
        .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
    StructType schema = originalDF.schema();
    JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
        HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
            row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
        return new Tuple2<>(key, row);
    });
    // Now filter further, keeping only rows that match the supplied hoodie keys.
    JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
    return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String commitTime) {
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
    if (!commitTimeline.containsInstant(commitInstant)) {
        throw new HoodieException("No commit exists at " + commitTime);
    }
    try {
        HashMap<String, String> paths =
            getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
        System.out.println("Path: " + paths.values());
        return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
            .filter(String.format("%s = '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
    } catch (Exception e) {
        throw new HoodieException("Error reading commit " + commitTime, e);
    }
}
/**
 * Reads the paths under a hoodie dataset out as a DataFrame.
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext,
    FileSystem fs, String... paths) {
    List<String> filteredPaths = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
        for (String path : paths) {
            TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
                metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(),
                fs.globStatus(new Path(path)));
            List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
            for (HoodieDataFile file : latestFiles) {
                filteredPaths.add(file.getPath());
            }
        }
        return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
    }
}
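A hedged usage sketch of the read-optimized read above; the JavaSparkContext, base path, and partition globs are assumptions:

FileSystem fs = FileSystem.get(jsc.hadoopConfiguration());
// Hypothetical partition globs under the dataset's base path.
Dataset<Row> latest = read(jsc, basePath, sqlContext, fs,
    basePath + "/2019/01/*", basePath + "/2019/02/*");
latest.show();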
public void testWriteUtilTable(@NonNull final Class type, @NonNull final JavaRDD utilRecords,
    @NotEmpty final String destFolder, final boolean isDatePartitioned) throws IOException {
    final String basePath = FileTestUtil.getTempFolder();
    final Path destPath = new Path(basePath, destFolder);
    final UtilTable utilTable = new UtilTable(type, utilRecords, destPath, isDatePartitioned, spark.get());
    final Path destWritePath = utilTable.getDestWritePath();
    assertEquals(Long.valueOf(10), utilTable.size());
    assertFalse(this.fileSystem.get().exists(destPath));
    utilTable.show();
    utilTable.writeParquet();
    final FileStatus[] destPathChildren = this.fileSystem.get().listStatus(destWritePath);
    log.debug("Destination folder content:");
    Stream.of(destPathChildren).forEach(f -> log.debug(f.getPath().toString()));
    assertTrue(this.fileSystem.get().exists(destWritePath));
    assertTrue(destPathChildren.length > 0);
    final Dataset<Row> outputDataset = spark.get().read().parquet(destWritePath.toString());
    log.debug("Output dataset content:");
    outputDataset.show();
    final List<String> datasetFieldNames = Arrays.asList(outputDataset.schema().fieldNames());
    final List<String> requiredFieldNames =
        Arrays.asList("application_id", "job_name", "job_start_timestamp", "timestamp");
    assertTrue(datasetFieldNames.containsAll(requiredFieldNames));
}
public void compact(String inputPath, String outputPath) throws IOException {
    this.setCompressionAndSerializationOptions(inputPath, outputPath);
    this.outputCompressionProperties(this.outputCompression);
    // Define a Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro.
        // Note that when using compression, the compression codec extension is not added to the file name.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
public void compact(String[] args) throws IOException {
    this.setCompressionAndSerializationOptions(this.parseCli(args));
    this.outputCompressionProperties(this.outputCompression);
    // Define a Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(this.inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(this.outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(this.inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(this.outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(this.inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(this.outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
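A brief hedged driver for the two-argument overload above; the Compaction class name and both HDFS paths are assumptions:

// Hypothetical invocation; the compaction source and target paths are illustrative only.
Compaction job = new Compaction();
job.compact("hdfs:///data/raw/events", "hdfs:///data/compacted/events");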