@Override
public List<Tuple2<MutationType, Dataset<Row>>> planMutationsForSet(Dataset<Row> arriving) {
  // note: functions.lit() embeds a single generated UUID as a constant, so every
  // row in this arriving set receives the same key value
  if (setsKeyToUUID()) {
    arriving = arriving.withColumn(getKeyFieldNames().get(0),
        functions.lit(UUID.randomUUID().toString()));
  }

  if (hasLastUpdatedField()) {
    arriving = arriving.withColumn(getLastUpdatedFieldName(),
        functions.lit(currentTimestampString()));
  }

  List<Tuple2<MutationType, Dataset<Row>>> planned = Lists.newArrayList();
  planned.add(new Tuple2<MutationType, Dataset<Row>>(MutationType.INSERT, arriving));

  return planned;
}
@Override
public List<Tuple2<MutationType, Dataset<Row>>> planMutationsForSet(Dataset<Row> arriving) {
  if (hasLastUpdatedField()) {
    arriving = arriving.withColumn(getLastUpdatedFieldName(),
        functions.lit(currentTimestampString()));
  }

  List<Tuple2<MutationType, Dataset<Row>>> planned = Lists.newArrayList();
  planned.add(new Tuple2<MutationType, Dataset<Row>>(MutationType.UPSERT, arriving));

  return planned;
}
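// A minimal usage sketch (an assumption, not from the planner source) of how a
// caller might consume the planned pairs; `planner`, `arriving`, and the
// applyInsert/applyUpsert helpers are hypothetical names for illustration.
List<Tuple2<MutationType, Dataset<Row>>> planned = planner.planMutationsForSet(arriving);
for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
  switch (mutation._1()) {
    case INSERT:
      applyInsert(mutation._2());  // hypothetical output-side helper
      break;
    case UPSERT:
      applyUpsert(mutation._2());  // hypothetical output-side helper
      break;
    default:
      throw new IllegalArgumentException("Unsupported mutation type: " + mutation._1());
  }
}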
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
  final boolean userTriggered = initializeFunction(colName, col);
  final Dataset<Row> result = from(super.withColumn(colName, col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
protected C withConceptMaps(Dataset<T> newMaps, Dataset<Mapping> newMappings) {
  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newMaps);

  // Instantiating a new composite ConceptMaps requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<T> newMapsWithTimestamp = newMaps
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(conceptMapEncoder);

  return newInstance(spark,
      this.members.union(newMembers),
      this.conceptMaps.union(newMapsWithTimestamp),
      this.mappings.union(newMappings));
}
private ValueSets withValueSets(Dataset<ValueSet> newValueSets, Dataset<Value> newValues) {
  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newValueSets);

  // Instantiating a new composite ValueSets requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<ValueSet> newValueSetsWithTimestamp = newValueSets
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      this.members.union(newMembers),
      this.valueSets.union(newValueSetsWithTimestamp),
      this.values.union(newValues));
}
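// The timestamp-stamping idiom used by both methods above, in isolation (a
// sketch; `df` is any Dataset<Row>, and lit is statically imported from
// org.apache.spark.sql.functions). lit() embeds the driver-side timestamp
// string as a constant column, and cast("timestamp") converts it to Spark's
// TimestampType.
Timestamp now = new Timestamp(System.currentTimeMillis());
Dataset<Row> stamped = df.withColumn("timestamp", lit(now.toString()).cast("timestamp"));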
/**
 * Normalize by zero mean unit variance.
 *
 * @param frame the data to normalize
 * @param skipColumns the columns to leave unnormalized
 * @return a zero mean, unit variance centered dataset
 */
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) {
  List<String> columnsList = DataFrames.toList(frame.get().columns());
  columnsList.removeAll(skipColumns);
  String[] columnNames = DataFrames.toArray(columnsList);
  // first row is std, second row is mean; each column in a row is for a particular column
  List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames);
  for (int i = 0; i < columnNames.length; i++) {
    String columnName = columnNames[i];
    double std = ((Number) stdDevMean.get(0).get(i)).doubleValue();
    double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue();
    if (std == 0.0)
      std = 1; // all values identical -> (x - x) / 1 = 0
    frame = dataRows(frame.get().withColumn(columnName,
        frame.get().col(columnName).minus(mean).divide(std)));
  }
  return frame;
}
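// A hedged alternative to the helper-based flow above, using stock Spark SQL
// aggregates for a single column (illustrative names; assumes static imports
// of avg, stddev_pop, and col from org.apache.spark.sql.functions):
Row stats = df.agg(avg("value"), stddev_pop("value")).first();
double mean = ((Number) stats.get(0)).doubleValue();
double std = ((Number) stats.get(1)).doubleValue();
if (std == 0.0) std = 1; // constant-column guard, as in zeromeanUnitVariance
Dataset<Row> normalized = df.withColumn("value", col("value").minus(mean).divide(std));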
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
      .master("local").getOrCreate();

  spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2",
      callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
/**
 * Returns an empty ConceptMaps instance.
 *
 * @param spark the spark session
 * @return an empty ConceptMaps instance.
 */
public static ConceptMaps getEmpty(SparkSession spark) {
  Dataset<ConceptMap> emptyConceptMaps = spark.emptyDataset(CONCEPT_MAP_ENCODER)
      .withColumn("timestamp", lit(null).cast("timestamp"))
      .as(CONCEPT_MAP_ENCODER);

  return new ConceptMaps(spark,
      spark.emptyDataset(URL_AND_VERSION_ENCODER),
      emptyConceptMaps,
      spark.emptyDataset(MAPPING_ENCODER));
}
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
      .master("local").getOrCreate();

  // registers a new internal UDF
  spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {
    private static final long serialVersionUID = -5372447039252716846L;

    @Override
    public Integer call(Integer x) {
      return x * 2;
    }
  }, DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2",
      callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
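// On Java 8+, the same UDF can be registered with a lambda instead of the
// anonymous class above (equivalent sketch; the UDF1 cast keeps it serializable):
spark.udf().register("x2Multiplier",
    (UDF1<Integer, Integer>) x -> x * 2, DataTypes.IntegerType);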
/**
 * Returns an empty ValueSets instance.
 *
 * @param spark the spark session
 * @return an empty ValueSets instance.
 */
public static ValueSets getEmpty(SparkSession spark) {
  Dataset<ValueSet> emptyValueSets = spark.emptyDataset(VALUE_SET_ENCODER)
      .withColumn("timestamp", lit(null).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      spark.emptyDataset(URL_AND_VERSION_ENCODER),
      emptyValueSets,
      spark.emptyDataset(getValueEncoder()));
}
@Override
public Dataset<Row> transform(Dataset<?> dataset) {
  StructType schema = dataset.schema();
  StructType structSchema = getStructSchema(schema);

  Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

  Dataset<Row> result = dataset.toDF();

  StructField[] fields = structSchema.fields();
  for (StructField field : fields) {
    String name = field.name();

    Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));

    result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
  }

  return result;
}
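// For comparison, a sketch of the same struct flattening with stock Spark APIs
// only (no DatasetUtil escaping; assumes a struct column named "s" and a static
// import of org.apache.spark.sql.functions.col):
Dataset<Row> flattened = df;
StructType structType = (StructType) df.schema().apply("s").dataType();
for (String fieldName : structType.fieldNames()) {
  flattened = flattened.withColumn(fieldName, col("s").getField(fieldName));
}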
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
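// `udf` names a Spark SQL function registered elsewhere in the test setup. An
// illustrative call site (hypothetical, not the actual Iceberg test code) that
// registers a modulo-bucketing function and builds a matching partitioned table:
spark.udf().register("bucket4",
    (UDF1<Long, Integer>) id -> (int) (id % 4), DataTypes.IntegerType);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build();
File location = buildPartitionedTable("bucketed_by_id", spec, "bucket4", "id");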