import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

SQLContext sqlContext = new SQLContext(sc);

// Read a CSV file with a header row, inferring column types.
DataFrame df = sqlContext.read()
    .format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("cars.csv");

// Write selected columns back out as gzip-compressed CSV.
df.select("year", "model").write()
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
    .save("newcars.csv");
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

SQLContext sqlContext = new SQLContext(sc);

// Same round trip as above, but reading a tab-delimited file.
DataFrame df = sqlContext.read()
    .format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .option("delimiter", "\t")
    .load("cars.csv");

df.select("year", "model").write()
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .save("newcars.csv");
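// The two snippets above rely on the external com.databricks.spark.csv package, which
// was needed on Spark 1.x. From Spark 2.0 onward a csv data source ships with Spark
// itself; a minimal sketch of the same round trip, assuming a Spark 2.x SparkSession
// (the application name here is an assumption for illustration):
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder().appName("csv-example").getOrCreate();
Dataset<Row> df = spark.read()
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("cars.csv");
df.select("year", "model").write()
    .option("header", "true")
    .csv("newcars.csv");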
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class TestClass {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "test spark-mongodb java");
        SQLContext sqlContext = new SQLContext(sc);

        // Connection options for the Stratio spark-mongodb data source.
        Map<String, String> options = new HashMap<>();
        options.put("host", "localhost:27017");
        options.put("database", "test");
        options.put("collection", "mycol");

        DataFrame df = sqlContext.read().format("com.stratio.datasource.mongodb").options(options).load();
        df.registerTempTable("mycol");

        DataFrame result = sqlContext.sql("SELECT * FROM mycol");
        result.show();
    }
}
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;

import static org.apache.spark.sql.functions.*;

public class App {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf());
        // HiveContext is required on Spark 1.x for collect_list.
        SQLContext sqlContext = new HiveContext(sc);

        List<String> data = Arrays.asList(
            "{\"id\": 1, \"vs\": [\"a\", \"b\"]}",
            "{\"id\": 1, \"vs\": [\"c\", \"d\"]}",
            "{\"id\": 2, \"vs\": [\"e\", \"f\"]}",
            "{\"id\": 2, \"vs\": [\"g\", \"h\"]}"
        );
        DataFrame df = sqlContext.read().json(sc.parallelize(data));

        // Explode each vs array into one row per element, then regroup by id.
        df.withColumn("vs", explode(col("vs")))
            .groupBy(col("id"))
            .agg(collect_list(col("vs")))
            .show();
    }
}
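// For reference, the job above explodes each vs array and regroups by id, so the
// expected output is roughly the following (element order within a group is not
// guaranteed, and the exact column header depends on the Spark version):
// +---+----------------+
// | id|collect_list(vs)|
// +---+----------------+
// |  1|    [a, b, c, d]|
// |  2|    [e, f, g, h]|
// +---+----------------+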
public static DataSpark open(SQLContext sqlContext, String path, String formatFile) throws Exception {
    // Load the data and store it into an object of class DataFrame.
    DataFrame df = sqlContext.read().format(formatFile).load(path);
    // Create an AMIDST object for managing the data.
    return loadSparkDataFrame(df);
}
DataFrame df = sqlContext.read().schema(schema).json(sc.parallelize(Arrays.asList(data)));
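// The snippet above assumes a StructType named schema is already in scope. A minimal
// sketch of constructing one for JSON records like {"id": 1, "name": "a"} — the field
// names and types here are assumptions for illustration:
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

StructType schema = DataTypes.createStructType(new StructField[]{
    DataTypes.createStructField("id", DataTypes.IntegerType, true),
    DataTypes.createStructField("name", DataTypes.StringType, true)
});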
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists,
     * corresponding to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);

    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);

    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
/**
 * Obtain all new data written into the Hoodie dataset since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
            .getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata, and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath =
            getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        return sqlContext.read()
            .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
            .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commit timestamp: " + lastCommitTime, e);
    }
}
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext,
    HoodieTimeline commitTimeline, String commitTime) {
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
    if (!commitTimeline.containsInstant(commitInstant)) {
        throw new HoodieException("No commit exists at " + commitTime);
    }
    try {
        HashMap<String, String> paths =
            getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
        System.out.println("Paths: " + paths.values());
        return sqlContext.read()
            .parquet(paths.values().toArray(new String[paths.size()]))
            .filter(String.format("%s = '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
    } catch (Exception e) {
        throw new HoodieException("Error reading commit " + commitTime, e);
    }
}
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
    assertSqlContext();
    JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
        index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
    List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
        .map(keyFileTuple -> keyFileTuple._2().get()).collect();

    // Record locations might be the same for multiple keys, so we need a unique list.
    Set<String> uniquePaths = new HashSet<>(paths);
    Dataset<Row> originalDF = sqlContextOpt.get().read()
        .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
    StructType schema = originalDF.schema();

    JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
        HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
            row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
        return new Tuple2<>(key, row);
    });

    // Now, filter further so that only rows matching the supplied hoodie keys remain.
    JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
    return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
@Before
public void setUp() throws IOException {
    hc = TestHive$.MODULE$;
    List<String> jsonObjects = new ArrayList<>(10);
    for (int i = 0; i < 10; i++) {
        jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}");
    }
    df = hc.read().json(hc.createDataset(jsonObjects, Encoders.STRING()));
    df.createOrReplaceTempView("window_table");
}
/**
 * Reads the paths under a hoodie dataset out as a DataFrame.
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext,
    FileSystem fs, String... paths) {
    List<String> filteredPaths = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
        for (String path : paths) {
            TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
                metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(),
                fs.globStatus(new Path(path)));
            List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
            for (HoodieDataFile file : latestFiles) {
                filteredPaths.add(file.getPath());
            }
        }
        return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
    }
}
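// A minimal usage sketch for the read() helper above, assuming jsc, sqlContext, and
// basePath are already in scope; the partition glob is an assumption for illustration:
FileSystem fs = FileSystem.get(jsc.hadoopConfiguration());
Dataset<Row> latest = read(jsc, basePath, sqlContext, fs, basePath + "/2017/*/*/*");
latest.show();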
public void compact(String inputPath, String outputPath) throws IOException {
    this.setCompressionAndSerializationOptions(inputPath, outputPath);
    this.outputCompressionProperties(this.outputCompression);

    // Define the Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the input files must end in .avro. Note also that when
        // compression is used, the codec extension is not added to the file name.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
public void compact(String[] args) throws IOException {
    // inputPath and outputPath are assumed to be instance fields populated while
    // parsing the command line below.
    this.setCompressionAndSerializationOptions(this.parseCli(args));
    this.outputCompressionProperties(this.outputCompression);

    // Define the Spark context with a generic Spark configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(this.inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(this.outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(this.inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(this.outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the input files must end in .avro.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
            .load(this.concatInputPath(this.inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(this.outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
            + this.outputSerialization);
    }
}
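// A minimal sketch of driving the two compact() overloads above. The class name
// SparkCompaction and the CLI flag names are assumptions for illustration; consult
// parseCli() for the flags the tool actually accepts:
SparkCompaction compaction = new SparkCompaction();
// Path-based overload:
compaction.compact("hdfs:///data/in", "hdfs:///data/out");
// CLI-based overload:
compaction.compact(new String[]{"--input-path", "hdfs:///data/in", "--output-path", "hdfs:///data/out"});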
@Before
public void setUp() throws IOException {
    sqlContext = TestHive$.MODULE$;
    sc = new JavaSparkContext(sqlContext.sparkContext());
    path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile();
    if (path.exists()) {
        path.delete();
    }
    HiveSessionCatalog catalog = (HiveSessionCatalog) sqlContext.sessionState().catalog();
    hiveManagedPath = new Path(catalog.defaultTablePath(new TableIdentifier("javaSavedTable")));
    fs = hiveManagedPath.getFileSystem(sc.hadoopConfiguration());
    fs.delete(hiveManagedPath, true);
    List<String> jsonObjects = new ArrayList<>(10);
    for (int i = 0; i < 10; i++) {
        jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
    }
    Dataset<String> ds = sqlContext.createDataset(jsonObjects, Encoders.STRING());
    df = sqlContext.read().json(ds);
    df.createOrReplaceTempView("jsonTable");
}