@Test
public void saveAndLoad() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());

  df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();

  Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
  checkAnswer(loadedDF, df.collectAsList());
}
SparkSession spark = SparkSession
    .builder()
    .appName("SparkSQLRelativeFrequency")
    .config(sparkConf)
    .getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

int neighborWindow = Integer.parseInt(args[0]);
String input = args[1];

Dataset<Row> rfDataset = spark.createDataFrame(rowRDD, rfSchema);
rfDataset.createOrReplaceTempView("rfTable");

Dataset<Row> sqlResult = spark.sql(query);
sqlResult.show(); // print the first 20 records on the console
sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects
sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // plain text output, easy to inspect with the cat command
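A quick read-back of the Parquet output can confirm the write; this sketch is not part of the original example and assumes the same SparkSession and "output" path used above.

// Hedged sketch: re-read the Parquet files written above for a sanity check.
Dataset<Row> parquetCheck = spark.read().parquet(output + "/parquetFormat");
parquetCheck.printSchema();
parquetCheck.show();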
// "expected" is a List<SimpleRecord> built earlier in the test (its construction is elided in this snippet)
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

df.select("id", "data").write()
    .format("iceberg")
    .mode("append")
    .save(location.toString());

Dataset<Row> result = spark.read()
    .format("iceberg")
    .load(location.toString());

List<SimpleRecord> actual = result.orderBy("id")
    .as(Encoders.bean(SimpleRecord.class))
    .collectAsList();
DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
    .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
    .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
    .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie table type
    // the operation value is truncated in this snippet; an insert is a plausible choice for the first write
    .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // this is the record key
    .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
    .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // used to combine duplicate records in the input / with the value on disk
    .option(HoodieWriteConfig.TABLE_NAME, tableName) // used by Hive sync and queries
    .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
        nonPartitionedTable
            ? NonpartitionedKeyGenerator.class.getCanonicalName()
            : SimpleKeyGenerator.class.getCanonicalName()) // add key extractor
    // the save mode is truncated in this snippet; Overwrite is a plausible choice for the initial commit
    .mode(SaveMode.Overwrite);

writer.save(tablePath); // ultimately where the dataset will be placed

String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
logger.info("First commit at instant time :" + commitInstantTime1);

Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
writer = inputDF2.write().format("com.uber.hoodie")
    .option("hoodie.insert.shuffle.parallelism", "2")
    .option("hoodie.upsert.shuffle.parallelism", "2")
    .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie table type
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key"); // remaining options for the second write are truncated in this snippet
this.spark.sql("describe table " + conceptMapTable); this.spark.emptyDataset(conceptMapEncoder) .withColumn("timestamp", lit(null).cast("timestamp")) .write() .format("parquet") .partitionBy("timestamp") .saveAsTable(conceptMapTable); .sql("SELECT url, version FROM " + conceptMapTable) .distinct() .as(URL_AND_VERSION_ENCODER); this.conceptMaps.write() .mode(SaveMode.ErrorIfExists) .insertInto(conceptMapTable);
@Test
public void saveTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(options)
      .saveAsTable("javaSavedTable");

  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books")
      .master("local")
      .getOrCreate();

  Dataset<Row> authorsDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .option("dateFormat", "MM/dd/yy") // "MM" (months), not "mm" (minutes)
      .load(filename);
  authorsDf.show();

  Dataset<Row> booksDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .load(filename);
  booksDf.show();

  // the receiver of this join is elided in the snippet; the later calls show it is the joined dataset
  Dataset<Row> libraryDf = authorsDf
      .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
      .withColumn("bookId", booksDf.col("id"))
      .drop(booksDf.col("id"));

  libraryDf.printSchema();
  libraryDf.write().json("data/library.json");
}
SQLContext sqlContext = new SQLContext(ctx);

Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");

Dataset<Row> teenagers = sqlContext.sql(
    "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    // the body is elided in the snippet; returning the name column is the usual pattern
    return "Name: " + row.getString(0);
  }
}).collect();

schemaPeople.write().parquet("people.parquet");
Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");

Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);

JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
@Test
public void testOptionsAPI() {
  HashMap<String, String> map = new HashMap<String, String>();
  map.put("e", "1");
  spark
      .read()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .text()
      .write()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .format("org.apache.spark.sql.test")
      .save();
}
// the enclosing method name is truncated in the snippet; "writeAndQuery" is a placeholder
public void writeAndQuery(Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {

  inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
  // commitInstantTime1 is captured from the first write (its lookup is elided in this snippet)
  logger.info("First commit at instant time :" + commitInstantTime1);

  inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);

  Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
      .load(tablePath + "/*/*/*/*");
  hoodieROViewDF.registerTempTable("hoodie_ro");
  spark.sql("describe hoodie_ro").show();

  // all trips whose fare was greater than 2
  spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
      .show();

  Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
      .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
          DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
      // completion assumed: the snippet is truncated here; incremental reads typically
      // pass a begin instant time and then load the table path
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
      .load(tablePath);
// disjoint fragments from the same class; ff, df, st, dfs, and request are defined elsewhere
sparkSession.close();

sparkSession.read()
    .option("delimiter", dfs.getColumnDelimiter())
    .option("nullValue", dfs.getNullToken())
    .option("mode", "FAILFAST")
    .schema(st)
    .format("csv")
    .load(copyRequest.getSrcDataFile().toURI().toString());

ff.write()
    .mode(SaveMode.Overwrite)
    .format(copyRequest.getFormat().sparkFormat())
    .save(copyRequest.getDstFile().toURI().toString());

SparkCopyFromHadoopRequest copyRequest = (SparkCopyFromHadoopRequest) request;
DataFrameReader dfr = sparkSession.read().format(copyRequest.getFormat().sparkFormat());

df.write().format("csv")
    .option("delimiter", dfs.getColumnDelimiter())
    .option("nullValue", dfs.getNullToken())
    .option("dateFormat", "yyyy-MM-dd")
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSS")
    .mode(SaveMode.Overwrite)
    .save(copyRequest.getDstDataDir().toURI().toString());

StructType schema = df.schema();
public void compact(String[] args) throws IOException {
  this.setCompressionAndSerializationOptions(this.parseCli(args));
  this.outputCompressionProperties(this.outputCompression);

  // Defining Spark Context with a generic Spark Configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work the files must end in .avro
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
        + this.outputSerialization);
  }
}
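The coalesce(this.splitSize) calls above rely on a precomputed target partition count. The sketch below shows one plausible way such a count could be derived from the total input size and the HDFS block size; the helper class, its name, and this approach are assumptions for illustration, not taken from the tool.

// Hedged sketch: estimate a compaction partition count so each output file
// stays at or below one filesystem block.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class SplitSizeEstimator {
  static int estimateSplitSize(String inputPath) throws java.io.IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    ContentSummary summary = fs.getContentSummary(new Path(inputPath));
    long blockSize = fs.getDefaultBlockSize(new Path(inputPath));
    // round up, and never go below one partition
    return (int) Math.max(1, (summary.getLength() + blockSize - 1) / blockSize);
  }
}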
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
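callUDF(udf, ...) in the copy step resolves a function that has already been registered on the SparkSession. The sketch below shows how such a registration could look in Java; the function name "part_udf" and its identity behaviour are illustrative assumptions, not taken from the test.

// Hedged sketch: register a UDF so it can be referenced by name via callUDF.
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

spark.udf().register("part_udf", (UDF1<Integer, Integer>) value -> value, DataTypes.IntegerType);

// the registered name is then what would be passed to the builder, e.g.
// buildPartitionedTable("partitioned_by_id", spec, "part_udf", "id");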
/**
 * Writes ancestor records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
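The explicit select(...) above matters because insertInto resolves columns by position rather than by name. The sketch below illustrates the failure mode the Javadoc warns about; the alternative column ordering is hypothetical.

// Hedged illustration: with a different column order, values would silently land
// in the wrong table columns, since insertInto is positional.
Dataset<Row> misordered = ancestors.select("uri", "version",
    "descendantSystem", "descendantValue", "ancestorSystem", "ancestorValue");

// compiles and runs, but "uri" values would be written into the table's first
// column (descendantSystem) because no name-based matching is performed
misordered.write().insertInto(tableName);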