schemaPeople.write().parquet("people.parquet");
sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects
sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // to see output via cat command
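The one-liners above all follow the same shape: take a Dataset<Row> (or DataFrame), call write() to obtain a DataFrameWriter, and call parquet(path). A minimal, self-contained sketch of that round trip follows; the session settings, schema, sample rows, and output path are illustrative assumptions, not taken from the snippets above.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class ParquetRoundTripSketch {
    public static void main(String[] args) {
        // Local session for illustration only; point the master at your cluster in practice.
        SparkSession spark = SparkSession.builder()
                .appName("parquet-round-trip-sketch")
                .master("local[*]")
                .getOrCreate();

        // A tiny in-memory Dataset<Row> stands in for schemaPeople / sqlResult above.
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("age", DataTypes.IntegerType, false)));
        List<Row> rows = Arrays.asList(RowFactory.create("alice", 30), RowFactory.create("bob", 25));
        Dataset<Row> people = spark.createDataFrame(rows, schema);

        // Write compressed Parquet; SaveMode.Overwrite replaces output from a previous run.
        people.write()
              .mode(SaveMode.Overwrite)
              .option("compression", "snappy")
              .parquet("people.parquet");

        // Read the files back to confirm the round trip.
        spark.read().parquet("people.parquet").show();

        spark.stop();
    }
}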
public static void writeParquet(Dataset<Row> df, String outputPath, SaveMode saveMode, int numPartitions) {
    logger.info(String.format("Saving parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
    String hdfsOutputPath = outputPath;
    if (hdfsOutputPath.toLowerCase().startsWith(HDFS_PREFIX_LOWERCASE)) {
        hdfsOutputPath = hdfsOutputPath.substring(HDFS_PREFIX_LOWERCASE.length());
    }
    df.coalesce(numPartitions).write().mode(saveMode).parquet(hdfsOutputPath);
    logger.info(String.format("Saved parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
}
@Test
public void testParquetAPI() {
    spark.read().schema(schema).parquet();
    spark.read().schema(schema).parquet(input);
    spark.read().schema(schema).parquet(input, input, input);
    spark.read().schema(schema).parquet(new String[] { input, input })
            .write().parquet(output);
}
public void writeParquet() throws IOException {
    // TODO: Consider having a configuration to limit the number of records written out
    this.dataset.write().mode(SaveMode.Append).parquet(getDestWritePath().toString());
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();
    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
}
        .parquet(outputDir);
} else {
    LOGGER.debug("Skipping the sorting and aggregation of group: {}, due to no data existing in the temporary files directory: {}",
            group, tempFileDir);
case PARQUET_FORMAT:
    LOG.debug("Writing Parquet: {}", path);
    writer.parquet(path);
    break;
case CSV_FORMAT:
    break;
case PARQUET:
    output.write().parquet(outputFileName);
    break;
case TEXT:
@Override
public void saveImpl(String path) {
    // save metadata and params
    DefaultParamsWriter.saveMetadata(instance, path, sc(),
            DefaultParamsWriter.saveMetadata$default$4(), DefaultParamsWriter.saveMetadata$default$5());

    // save model data: markovOrder, numLabels, weights
    Data data = new Data();
    data.setMarkovOrder(contextExtractor.getMarkovOrder().ordinal() + 1);
    data.setWeights(weights);
    data.setTagDictionary(tagDictionary);
    List<Data> list = new LinkedList<Data>();
    list.add(data);
    String dataPath = new Path(path, "data").toString();
    sqlContext().createDataFrame(list, Data.class).write().parquet(dataPath);

    // save pipeline model
    try {
        String pipelinePath = new Path(path, "pipelineModel").toString();
        pipelineModel.write().overwrite().save(pipelinePath);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
.write()
.option("compression", "gzip")
.parquet(outputDir);
df.write()
    .partitionBy("ipAddress", "method", "responseCode")
    .mode(SaveMode.Append)
    .parquet(Flags.getInstance().getParquetFile());
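The partitionBy call above lays out one output directory per distinct (ipAddress, method, responseCode) combination. A minimal sketch of reading such a layout back follows, assuming an existing SparkSession named spark and an illustrative path; neither comes from the snippet above.

// Reads a layout produced by partitionBy("ipAddress", "method", "responseCode").
// Filtering on a partition column lets Spark prune the non-matching directories.
Dataset<Row> logs = spark.read().parquet("/data/access_logs_parquet");
logs.filter("responseCode = 404").show();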
public void compact(String inputPath, String outputPath) throws IOException {
    this.setCompressionAndSerializationOptions(inputPath, outputPath);
    this.outputCompressionProperties(this.outputCompression);

    // Defining Spark Context with a generic Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro
        // Another issue is that when using compression, the compression codec extension is not added to the file name.
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: " + this.outputSerialization);
    }
}
public void compact(String[] args) throws IOException {
    // inputPath and outputPath are presumably instance fields populated by parseCli(args).
    this.setCompressionAndSerializationOptions(this.parseCli(args));
    this.outputCompressionProperties(this.outputCompression);

    // Defining Spark Context with a generic Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    if (this.outputSerialization.equals(TEXT)) {
        JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
        textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
    } else if (this.outputSerialization.equals(PARQUET)) {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
        parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
    } else if (this.outputSerialization.equals(AVRO)) {
        // For this to work the files must end in .avro
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
        avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
    } else {
        System.out.println("Did not match any serialization type: text, parquet, or avro. Received: " + this.outputSerialization);
    }
}
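Both compact() variants above use the Spark 1.x SQLContext/DataFrame API. A minimal sketch of the same Parquet branch written against the Spark 2.x SparkSession API follows; the concatInputPath helper, the splitSize field, and the path variables are carried over from the snippets as assumptions.

// Spark 2.x form of the PARQUET branch: read the small input files,
// coalesce to the target number of output files, and write them back out.
SparkSession spark = SparkSession.builder().appName("Spark Compaction").getOrCreate();
Dataset<Row> parquetFile = spark.read().parquet(this.concatInputPath(inputPath));
parquetFile.coalesce(this.splitSize).write().parquet(outputPath);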