@Override
public Dataset<Row> drop(final scala.collection.Seq<String> colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.drop(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);

  if (useIncludeFields) {
    if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
      throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS +
          " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    // select(String, String...) takes the first column separately from the rest
    String firstCol = includeFields.get(0);
    includeFields.remove(0);
    return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
    if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
      throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS +
          " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    // drop(Seq<String>) expects a Scala Seq, so convert the Java collection first
    return sourceStep.drop(
        JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
@Override
public Dataset<Row> drop(final Column col) {
  final boolean userTriggered = initializeFunction(col);
  final Dataset<Row> result = from(super.drop(col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> drop(final String... colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.drop(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> drop(final String colName) {
  final boolean userTriggered = initializeFunction(colName);
  final Dataset<Row> result = from(super.drop(colName));
  this.setIsUserTriggered(userTriggered);
  return result;
}
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
Set<String> mutationFields = Sets.newHashSet(mutation.schema().fieldNames());
for (String col : Sets.difference(mutationFields, kuduColumns)) {
  mutation = mutation.drop(col);
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); // registers a new internal UDF spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() { private static final long serialVersionUID = -5372447039252716846L; @Override public Integer call(Integer x) { return x * 2; } }, DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
  private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
      String oldColName = "_c" + i;
      String newColName = "C" + i;
      df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
  }
}
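The withColumn-plus-drop pair in the loop above copies each column under its new name and then removes the old one. Dataset.withColumnRenamed does the same renaming in a single call; a minimal sketch of the equivalent loop, reusing the df variable and the _c/C naming from the snippet above:

    // Same renaming as above, without creating a duplicate column first.
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
      df = df.withColumnRenamed("_c" + i, "C" + i);
    }
    df.show();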
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
"full_outer") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id"));
    .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")),
        "full_outer")
    .withColumn("bookId", booksDf.col("id"))
    .drop(booksDf.col("id"));
"left") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id")) .groupBy( authorsDf.col("id"),