schemaPeople.write().parquet("people.parquet");
sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects
sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // to see output via cat command
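The one-liners above all follow the same shape: take a Dataset<Row> (or DataFrame), call write() to obtain a DataFrameWriter, and call parquet(path). A minimal, self-contained sketch of that round trip follows; the session settings, schema, sample rows, and output path are illustrative assumptions, not taken from the snippets above.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class ParquetRoundTripSketch {
    public static void main(String[] args) {
        // Local session for illustration only; point the master at your cluster in practice.
        SparkSession spark = SparkSession.builder()
                .appName("parquet-round-trip-sketch")
                .master("local[*]")
                .getOrCreate();

        // A tiny in-memory Dataset<Row> stands in for schemaPeople / sqlResult above.
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("age", DataTypes.IntegerType, false)));
        List<Row> rows = Arrays.asList(RowFactory.create("alice", 30), RowFactory.create("bob", 25));
        Dataset<Row> people = spark.createDataFrame(rows, schema);

        // Write compressed Parquet; SaveMode.Overwrite replaces output from a previous run.
        people.write()
              .mode(SaveMode.Overwrite)
              .option("compression", "snappy")
              .parquet("people.parquet");

        // Read the files back to confirm the round trip.
        spark.read().parquet("people.parquet").show();

        spark.stop();
    }
}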
public static void writeParquet(Dataset<Row> df, String outputPath, SaveMode saveMode, int numPartitions) {
    logger.info(String.format("Saving parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
    String hdfsOutputPath = outputPath;
    if (hdfsOutputPath.toLowerCase().startsWith(HDFS_PREFIX_LOWERCASE)) {
        hdfsOutputPath = hdfsOutputPath.substring(HDFS_PREFIX_LOWERCASE.length());
    }
    df.coalesce(numPartitions).write().mode(saveMode).parquet(hdfsOutputPath);
    logger.info(String.format("Saved parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
}
@Test
public void testParquetAPI() {
    spark.read().schema(schema).parquet();
    spark.read().schema(schema).parquet(input);
    spark.read().schema(schema).parquet(input, input, input);
    spark.read().schema(schema).parquet(new String[] { input, input })
            .write().parquet(output);
}
public void writeParquet() throws IOException {
    // TODO: Consider having a configuration to limit the number of records written out
    this.dataset.write().mode(SaveMode.Append).parquet(getDestWritePath().toString());
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();
    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
}
        .parquet(outputDir);
} else {
    LOGGER.debug("Skipping the sorting and aggregation of group: {}, due to no data existing in the temporary files directory: {}",
            group, tempFileDir);
case PARQUET_FORMAT:
    LOG.debug("Writing Parquet: {}", path);
    writer.parquet(path);
    break;
case CSV_FORMAT:
    break;
case PARQUET:
    output.write().parquet(outputFileName);
    break;
case TEXT:
@Override
public void saveImpl(String path) {
    // save metadata and params
    DefaultParamsWriter.saveMetadata(instance, path, sc(),
            DefaultParamsWriter.saveMetadata$default$4(), DefaultParamsWriter.saveMetadata$default$5());

    // save model data: markovOrder, numLabels, weights
    Data data = new Data();
    data.setMarkovOrder(contextExtractor.getMarkovOrder().ordinal() + 1);
    data.setWeights(weights);
    data.setTagDictionary(tagDictionary);
    List<Data> list = new LinkedList<Data>();
    list.add(data);
    String dataPath = new Path(path, "data").toString();
    sqlContext().createDataFrame(list, Data.class).write().parquet(dataPath);

    // save pipeline model
    try {
        String pipelinePath = new Path(path, "pipelineModel").toString();
        pipelineModel.write().overwrite().save(pipelinePath);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
.write()
.option("compression", "gzip")
.parquet(outputDir);
df.write()
    .partitionBy("ipAddress", "method", "responseCode")
    .mode(SaveMode.Append)
    .parquet(Flags.getInstance().getParquetFile());
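The partitionBy call above lays out one output directory per distinct (ipAddress, method, responseCode) combination. A minimal sketch of reading such a layout back follows, assuming an existing SparkSession named spark and an illustrative path; neither comes from the snippet above.

// Reads a layout produced by partitionBy("ipAddress", "method", "responseCode").
// Filtering on a partition column lets Spark prune the non-matching directories.
Dataset<Row> logs = spark.read().parquet("/data/access_logs_parquet");
logs.filter("responseCode = 404").show();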
public void compact(String inputPath, String outputPath) throws IOException {
    this.setCompressionAndSerializationOptions(inputPath, outputPath);
    this.outputCompressionProperties(this.outputCompression);

    // Defining Spark Context with a generic Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro
        // Another issue is that when using compression, the compression codec extension is not added to the file name.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: " + this.outputSerialization);
    }
}
public void compact(String[] args) throws IOException {
    // inputPath and outputPath are presumably instance fields populated by parseCli(args).
    this.setCompressionAndSerializationOptions(this.parseCli(args));
    this.outputCompressionProperties(this.outputCompression);

    // Defining Spark Context with a generic Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: " + this.outputSerialization);
    }
}
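Both compact() variants above use the Spark 1.x SQLContext/DataFrame API. A minimal sketch of the same Parquet branch written against the Spark 2.x SparkSession API follows; the concatInputPath helper, the splitSize field, and the path variables are carried over from the snippets as assumptions.

// Spark 2.x form of the PARQUET branch: read the small input files,
// coalesce to the target number of output files, and write them back out.
SparkSession spark = SparkSession.builder().appName("Spark Compaction").getOrCreate();
Dataset<Row> parquetFile = spark.read().parquet(this.concatInputPath(inputPath));
parquetFile.coalesce(this.splitSize).write().parquet(outputPath);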