@Test
public void saveAndLoad() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());

  df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();

  Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
  checkAnswer(loadedDF, df.collectAsList());
}
SparkSession spark = SparkSession
    .builder()
    .appName("SparkSQLRelativeFrequency")
    .config(sparkConf)
    .getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

int neighborWindow = Integer.parseInt(args[0]);
String input = args[1];

Dataset<Row> rfDataset = spark.createDataFrame(rowRDD, rfSchema);
rfDataset.createOrReplaceTempView("rfTable");

Dataset<Row> sqlResult = spark.sql(query);
sqlResult.show(); // print the first 20 records on the console
sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects
sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // plain text output, easy to inspect with the cat command
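A quick read-back of the Parquet output can confirm the write; this sketch is not part of the original example and assumes the same SparkSession and "output" path used above.

// Hedged sketch: re-read the Parquet files written above for a sanity check.
Dataset<Row> parquetCheck = spark.read().parquet(output + "/parquetFormat");
parquetCheck.printSchema();
parquetCheck.show();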
// "expected" is a List<SimpleRecord> built earlier in the test (its construction is elided in this snippet)
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

df.select("id", "data").write()
    .format("iceberg")
    .mode("append")
    .save(location.toString());

Dataset<Row> result = spark.read()
    .format("iceberg")
    .load(location.toString());

List<SimpleRecord> actual = result.orderBy("id")
    .as(Encoders.bean(SimpleRecord.class))
    .collectAsList();
DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
    .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
    .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
    .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie table type
    // the operation value is truncated in this snippet; an insert is a plausible choice for the first write
    .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // this is the record key
    .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
    .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // used to combine duplicate records in the input / with the value on disk
    .option(HoodieWriteConfig.TABLE_NAME, tableName) // used by Hive sync and queries
    .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
        nonPartitionedTable
            ? NonpartitionedKeyGenerator.class.getCanonicalName()
            : SimpleKeyGenerator.class.getCanonicalName()) // add key extractor
    // the save mode is truncated in this snippet; Overwrite is a plausible choice for the initial commit
    .mode(SaveMode.Overwrite);

writer.save(tablePath); // ultimately where the dataset will be placed

String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
logger.info("First commit at instant time :" + commitInstantTime1);

Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
writer = inputDF2.write().format("com.uber.hoodie")
    .option("hoodie.insert.shuffle.parallelism", "2")
    .option("hoodie.upsert.shuffle.parallelism", "2")
    .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie table type
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key"); // remaining options for the second write are truncated in this snippet
this.spark.sql("describe table " + conceptMapTable); this.spark.emptyDataset(conceptMapEncoder) .withColumn("timestamp", lit(null).cast("timestamp")) .write() .format("parquet") .partitionBy("timestamp") .saveAsTable(conceptMapTable); .sql("SELECT url, version FROM " + conceptMapTable) .distinct() .as(URL_AND_VERSION_ENCODER); this.conceptMaps.write() .mode(SaveMode.ErrorIfExists) .insertInto(conceptMapTable);
@Test
public void saveTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(options)
      .saveAsTable("javaSavedTable");

  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books")
      .master("local")
      .getOrCreate();

  Dataset<Row> authorsDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .option("dateFormat", "MM/dd/yy") // "MM" (months), not "mm" (minutes)
      .load(filename);
  authorsDf.show();

  Dataset<Row> booksDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .load(filename);
  booksDf.show();

  // the receiver of this join is elided in the snippet; the later calls show it is the joined dataset
  Dataset<Row> libraryDf = authorsDf
      .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
      .withColumn("bookId", booksDf.col("id"))
      .drop(booksDf.col("id"));

  libraryDf.printSchema();
  libraryDf.write().json("data/library.json");
}
SQLContext sqlContext = new SQLContext(ctx);

Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");

Dataset<Row> teenagers = sqlContext.sql(
    "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    // the body is elided in the snippet; returning the name column is the usual pattern
    return "Name: " + row.getString(0);
  }
}).collect();

schemaPeople.write().parquet("people.parquet");
Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");

Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);

JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
@Test
public void testOptionsAPI() {
  HashMap<String, String> map = new HashMap<String, String>();
  map.put("e", "1");
  spark
      .read()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .text()
      .write()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .format("org.apache.spark.sql.test")
      .save();
}
// the enclosing method name is truncated in the snippet; "writeAndQuery" is a placeholder
public void writeAndQuery(Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {

  inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
  // commitInstantTime1 is captured from the first write (its lookup is elided in this snippet)
  logger.info("First commit at instant time :" + commitInstantTime1);

  inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);

  Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
      .load(tablePath + "/*/*/*/*");
  hoodieROViewDF.registerTempTable("hoodie_ro");
  spark.sql("describe hoodie_ro").show();

  // all trips whose fare was greater than 2
  spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
      .show();

  Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
      .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
          DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
      // completion assumed: the snippet is truncated here; incremental reads typically
      // pass a begin instant time and then load the table path
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
      .load(tablePath);
// disjoint fragments from the same class; ff, df, st, dfs, and request are defined elsewhere
sparkSession.close();

sparkSession.read()
    .option("delimiter", dfs.getColumnDelimiter())
    .option("nullValue", dfs.getNullToken())
    .option("mode", "FAILFAST")
    .schema(st)
    .format("csv")
    .load(copyRequest.getSrcDataFile().toURI().toString());

ff.write()
    .mode(SaveMode.Overwrite)
    .format(copyRequest.getFormat().sparkFormat())
    .save(copyRequest.getDstFile().toURI().toString());

SparkCopyFromHadoopRequest copyRequest = (SparkCopyFromHadoopRequest) request;
DataFrameReader dfr = sparkSession.read().format(copyRequest.getFormat().sparkFormat());

df.write().format("csv")
    .option("delimiter", dfs.getColumnDelimiter())
    .option("nullValue", dfs.getNullToken())
    .option("dateFormat", "yyyy-MM-dd")
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSS")
    .mode(SaveMode.Overwrite)
    .save(copyRequest.getDstDataDir().toURI().toString());

StructType schema = df.schema();
public void compact(String[] args) throws IOException {
  this.setCompressionAndSerializationOptions(this.parseCli(args));
  this.outputCompressionProperties(this.outputCompression);

  // Defining Spark Context with a generic Spark Configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work the files must end in .avro
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
        + this.outputSerialization);
  }
}
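The coalesce(this.splitSize) calls above rely on a precomputed target partition count. The sketch below shows one plausible way such a count could be derived from the total input size and the HDFS block size; the helper class, its name, and this approach are assumptions for illustration, not taken from the tool.

// Hedged sketch: estimate a compaction partition count so each output file
// stays at or below one filesystem block.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class SplitSizeEstimator {
  static int estimateSplitSize(String inputPath) throws java.io.IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    ContentSummary summary = fs.getContentSummary(new Path(inputPath));
    long blockSize = fs.getDefaultBlockSize(new Path(inputPath));
    // round up, and never go below one partition
    return (int) Math.max(1, (summary.getLength() + blockSize - 1) / blockSize);
  }
}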
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
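callUDF(udf, ...) in the copy step resolves a function that has already been registered on the SparkSession. The sketch below shows how such a registration could look in Java; the function name "part_udf" and its identity behaviour are illustrative assumptions, not taken from the test.

// Hedged sketch: register a UDF so it can be referenced by name via callUDF.
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

spark.udf().register("part_udf", (UDF1<Integer, Integer>) value -> value, DataTypes.IntegerType);

// the registered name is then what would be passed to the builder, e.g.
// buildPartitionedTable("partitioned_by_id", spec, "part_udf", "id");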
/**
 * Writes ancestor records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
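The explicit select(...) above matters because insertInto resolves columns by position rather than by name. The sketch below illustrates the failure mode the Javadoc warns about; the alternative column ordering is hypothetical.

// Hedged illustration: with a different column order, values would silently land
// in the wrong table columns, since insertInto is positional.
Dataset<Row> misordered = ancestors.select("uri", "version",
    "descendantSystem", "descendantValue", "ancestorSystem", "ancestorValue");

// compiles and runs, but "uri" values would be written into the table's first
// column (descendantSystem) because no name-based matching is performed
misordered.write().insertInto(tableName);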