@Test
public void testOptionsAPI() {
  HashMap<String, String> map = new HashMap<String, String>();
  map.put("e", "1");
  spark
      .read()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .text()
      .write()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .format("org.apache.spark.sql.test")
      .save();
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement,
                      CredentialProvider credentialManager) {
  String filePath = actionStatement.getParamValues().get(0).getValue().toString();
  String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
  String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

  SaveMode saveMode = SaveMode.valueOf(saveModeStr);

  String sql = String.format("select * from %s", dfTableName);
  logger.info(String.format("Running sql [%s] to get data and then save it", sql));
  Dataset<Row> df = sparkSession.sql(sql);

  logger.info(String.format("Saving to csv %s, saveMode: %s", filePath, saveMode));
  df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);
  logger.info(String.format("Saved to csv %s, saveMode: %s", filePath, saveMode));

  return null;
}
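The method above is tied to framework-specific types (ActionStatement, CredentialProvider). For readers who only need the Spark part, here is a minimal standalone sketch of the same write path; the table name, output path, and save mode are hypothetical stand-ins for the values the action parameters would normally supply:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class CsvSaveSketch {
  public static void main(String[] args) {
    // Hypothetical inputs; in the snippet above they come from the ActionStatement parameters.
    String dfTableName = "my_table";
    String filePath = "/tmp/csv_out";
    SaveMode saveMode = SaveMode.Overwrite;

    SparkSession spark = SparkSession.builder().appName("csv-save-sketch").getOrCreate();

    // Select the registered table/view and write it as a single, headerless CSV part file.
    Dataset<Row> df = spark.sql(String.format("select * from %s", dfTableName));
    df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);

    spark.stop();
  }
}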
/**
 * Setup configs for syncing to Hive.
 *
 * @param writer the Hudi DataFrameWriter to enrich with Hive sync options
 * @return the writer with Hive sync options applied
 */
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
  if (enableHiveSync) {
    logger.info("Enabling Hive sync to " + hiveJdbcUrl);
    writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
        .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
        .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
        .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
        .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
        .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
    if (nonPartitionedTable) {
      writer = writer
          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
              NonPartitionedExtractor.class.getCanonicalName())
          .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "");
    } else if (useMultiPartitionKeys) {
      writer = writer
          .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
              MultiPartKeysValueExtractor.class.getCanonicalName());
    } else {
      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
    }
  }
  return writer;
}
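A hedged usage sketch for the helper above: build a Hudi writer, let updateHiveSyncConfig add the HIVE_* options when enableHiveSync is set, then save. Here df, tableName, and basePath are assumed names, not identifiers from the source:

DataFrameWriter<Row> writer = df.write()
    .format("com.uber.hoodie")
    .option(HoodieWriteConfig.TABLE_NAME, tableName);
writer = updateHiveSyncConfig(writer);        // adds the HIVE_* options only when enableHiveSync is true
writer.mode(SaveMode.Append).save(basePath);  // basePath is a placeholder for the dataset location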
.coalesce(numberOfOutputFiles)
    .write()
    .option("compression", "gzip")
    .parquet(outputDir);

.coalesce(1)
    .write()
    .option("compression", "gzip")
    .parquet(outputDir);
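Both fragments above start mid-chain. A minimal sketch of the surrounding code, assuming a SparkSession and placeholder inputDir/outputDir paths (numberOfOutputFiles is taken from the first fragment; the other names are assumptions), shows where the coalesce, compression, and Parquet calls sit:

static void writeGzipParquet(SparkSession spark, String inputDir, String outputDir,
                             int numberOfOutputFiles) {
  Dataset<Row> events = spark.read().parquet(inputDir);   // placeholder input path
  events
      .coalesce(numberOfOutputFiles)   // cap the number of output part files
      .write()
      .option("compression", "gzip")   // write gzip-compressed Parquet
      .parquet(outputDir);
}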
.option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor dataGen.generateUpdates("002"/* ignore */, 100)); Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2") .option("hoodie.upsert.shuffle.parallelism", "2") .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
results.repartition(1).write().format("com.databricks.spark.csv")
    .option("header", "true")
    .mode(SaveMode.Overwrite)
    .save(sparkSqlOptions.getCsvOutputFile());
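The com.databricks.spark.csv package targets Spark 1.x; on Spark 2.0 and later the same write can use the built-in CSV data source. A minimal equivalent sketch, with outputPath as a placeholder for sparkSqlOptions.getCsvOutputFile():

results.repartition(1)
    .write()
    .option("header", "true")
    .mode(SaveMode.Overwrite)
    .csv(outputPath);   // built-in CSV source (Spark 2.0+); outputPath is a placeholder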