@Override
public Dataset<Row> drop(final scala.collection.Seq<String> colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.drop(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);

  if (useIncludeFields) {
    if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
      throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS +
          " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    // select(String, String...) takes the first column separately from the rest
    String firstCol = includeFields.get(0);
    includeFields.remove(0);
    return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
    if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
      throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS +
          " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    // drop(Seq<String>) expects a Scala Seq, so convert the Java collection first
    return sourceStep.drop(
        JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
@Override
public Dataset<Row> drop(final Column col) {
  final boolean userTriggered = initializeFunction(col);
  final Dataset<Row> result = from(super.drop(col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> drop(final String... colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.drop(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> drop(final String colName) {
  final boolean userTriggered = initializeFunction(colName);
  final Dataset<Row> result = from(super.drop(colName));
  this.setIsUserTriggered(userTriggered);
  return result;
}
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
Set<String> mutationFields = Sets.newHashSet(mutation.schema().fieldNames());
for (String col : Sets.difference(mutationFields, kuduColumns)) {
  mutation = mutation.drop(col);
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); // registers a new internal UDF spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() { private static final long serialVersionUID = -5372447039252716846L; @Override public Integer call(Integer x) { return x * 2; } }, DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
  private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
      String oldColName = "_c" + i;
      String newColName = "C" + i;
      df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
  }
}
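The withColumn-plus-drop pair in the loop above copies each column under its new name and then removes the old one. Dataset.withColumnRenamed does the same renaming in a single call; a minimal sketch of the equivalent loop, reusing the df variable and the _c/C naming from the snippet above:

    // Same renaming as above, without creating a duplicate column first.
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
      df = df.withColumnRenamed("_c" + i, "C" + i);
    }
    df.show();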
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
"full_outer") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id"));
    .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")),
        "full_outer")
    .withColumn("bookId", booksDf.col("id"))
    .drop(booksDf.col("id"));
"left") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id")) .groupBy( authorsDf.col("id"),