@Override
public List<Tuple2<MutationType, Dataset<Row>>> planMutationsForSet(Dataset<Row> arriving) {
  // note: functions.lit() embeds a single generated UUID as a constant, so every
  // row in this arriving set receives the same key value
  if (setsKeyToUUID()) {
    arriving = arriving.withColumn(getKeyFieldNames().get(0),
        functions.lit(UUID.randomUUID().toString()));
  }

  if (hasLastUpdatedField()) {
    arriving = arriving.withColumn(getLastUpdatedFieldName(),
        functions.lit(currentTimestampString()));
  }

  List<Tuple2<MutationType, Dataset<Row>>> planned = Lists.newArrayList();
  planned.add(new Tuple2<MutationType, Dataset<Row>>(MutationType.INSERT, arriving));

  return planned;
}
@Override
public List<Tuple2<MutationType, Dataset<Row>>> planMutationsForSet(Dataset<Row> arriving) {
  if (hasLastUpdatedField()) {
    arriving = arriving.withColumn(getLastUpdatedFieldName(),
        functions.lit(currentTimestampString()));
  }

  List<Tuple2<MutationType, Dataset<Row>>> planned = Lists.newArrayList();
  planned.add(new Tuple2<MutationType, Dataset<Row>>(MutationType.UPSERT, arriving));

  return planned;
}
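// A minimal usage sketch (an assumption, not from the planner source) of how a
// caller might consume the planned pairs; `planner`, `arriving`, and the
// applyInsert/applyUpsert helpers are hypothetical names for illustration.
List<Tuple2<MutationType, Dataset<Row>>> planned = planner.planMutationsForSet(arriving);
for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
  switch (mutation._1()) {
    case INSERT:
      applyInsert(mutation._2());  // hypothetical output-side helper
      break;
    case UPSERT:
      applyUpsert(mutation._2());  // hypothetical output-side helper
      break;
    default:
      throw new IllegalArgumentException("Unsupported mutation type: " + mutation._1());
  }
}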
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
  final boolean userTriggered = initializeFunction(colName, col);
  final Dataset<Row> result = from(super.withColumn(colName, col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
protected C withConceptMaps(Dataset<T> newMaps, Dataset<Mapping> newMappings) {
  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newMaps);

  // Instantiating a new composite ConceptMaps requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<T> newMapsWithTimestamp = newMaps
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(conceptMapEncoder);

  return newInstance(spark,
      this.members.union(newMembers),
      this.conceptMaps.union(newMapsWithTimestamp),
      this.mappings.union(newMappings));
}
private ValueSets withValueSets(Dataset<ValueSet> newValueSets, Dataset<Value> newValues) {
  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newValueSets);

  // Instantiating a new composite ValueSets requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<ValueSet> newValueSetsWithTimestamp = newValueSets
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      this.members.union(newMembers),
      this.valueSets.union(newValueSetsWithTimestamp),
      this.values.union(newValues));
}
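// The timestamp-stamping idiom used by both methods above, in isolation (a
// sketch; `df` is any Dataset<Row>, and lit is statically imported from
// org.apache.spark.sql.functions). lit() embeds the driver-side timestamp
// string as a constant column, and cast("timestamp") converts it to Spark's
// TimestampType.
Timestamp now = new Timestamp(System.currentTimeMillis());
Dataset<Row> stamped = df.withColumn("timestamp", lit(now.toString()).cast("timestamp"));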
/**
 * Normalize by zero mean unit variance.
 *
 * @param frame the data to normalize
 * @param skipColumns the columns to leave unnormalized
 * @return a zero mean, unit variance centered dataset
 */
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) {
  List<String> columnsList = DataFrames.toList(frame.get().columns());
  columnsList.removeAll(skipColumns);
  String[] columnNames = DataFrames.toArray(columnsList);
  // first row is std, second row is mean; each column in a row is for a particular column
  List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames);
  for (int i = 0; i < columnNames.length; i++) {
    String columnName = columnNames[i];
    double std = ((Number) stdDevMean.get(0).get(i)).doubleValue();
    double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue();
    if (std == 0.0)
      std = 1; // all values identical -> (x - x) / 1 = 0
    frame = dataRows(frame.get().withColumn(columnName,
        frame.get().col(columnName).minus(mean).divide(std)));
  }
  return frame;
}
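// A hedged alternative to the helper-based flow above, using stock Spark SQL
// aggregates for a single column (illustrative names; assumes static imports
// of avg, stddev_pop, and col from org.apache.spark.sql.functions):
Row stats = df.agg(avg("value"), stddev_pop("value")).first();
double mean = ((Number) stats.get(0)).doubleValue();
double std = ((Number) stats.get(1)).doubleValue();
if (std == 0.0) std = 1; // constant-column guard, as in zeromeanUnitVariance
Dataset<Row> normalized = df.withColumn("value", col("value").minus(mean).divide(std));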
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
      .master("local").getOrCreate();

  spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2",
      callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
/**
 * Returns an empty ConceptMaps instance.
 *
 * @param spark the spark session
 * @return an empty ConceptMaps instance.
 */
public static ConceptMaps getEmpty(SparkSession spark) {
  Dataset<ConceptMap> emptyConceptMaps = spark.emptyDataset(CONCEPT_MAP_ENCODER)
      .withColumn("timestamp", lit(null).cast("timestamp"))
      .as(CONCEPT_MAP_ENCODER);

  return new ConceptMaps(spark,
      spark.emptyDataset(URL_AND_VERSION_ENCODER),
      emptyConceptMaps,
      spark.emptyDataset(MAPPING_ENCODER));
}
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
      .master("local").getOrCreate();

  // registers a new internal UDF
  spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {
    private static final long serialVersionUID = -5372447039252716846L;

    @Override
    public Integer call(Integer x) {
      return x * 2;
    }
  }, DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2",
      callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
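// On Java 8+, the same UDF can be registered with a lambda instead of the
// anonymous class above (equivalent sketch; the UDF1 cast keeps it serializable):
spark.udf().register("x2Multiplier",
    (UDF1<Integer, Integer>) x -> x * 2, DataTypes.IntegerType);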
/**
 * Returns an empty ValueSets instance.
 *
 * @param spark the spark session
 * @return an empty ValueSets instance.
 */
public static ValueSets getEmpty(SparkSession spark) {
  Dataset<ValueSet> emptyValueSets = spark.emptyDataset(VALUE_SET_ENCODER)
      .withColumn("timestamp", lit(null).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      spark.emptyDataset(URL_AND_VERSION_ENCODER),
      emptyValueSets,
      spark.emptyDataset(getValueEncoder()));
}
@Override
public Dataset<Row> transform(Dataset<?> dataset) {
  StructType schema = dataset.schema();
  StructType structSchema = getStructSchema(schema);

  Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

  Dataset<Row> result = dataset.toDF();

  StructField[] fields = structSchema.fields();
  for (StructField field : fields) {
    String name = field.name();

    Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));

    result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
  }

  return result;
}
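// For comparison, a sketch of the same struct flattening with stock Spark APIs
// only (no DatasetUtil escaping; assumes a struct column named "s" and a static
// import of org.apache.spark.sql.functions.col):
Dataset<Row> flattened = df;
StructType structType = (StructType) df.schema().apply("s").dataType();
for (String fieldName : structType.fieldNames()) {
  flattened = flattened.withColumn(fieldName, col("s").getField(fieldName));
}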
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // do not combine splits because the tests expect a split per partition
  byId.updateProperties().set("read.split.target-size", "1").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
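// `udf` names a Spark SQL function registered elsewhere in the test setup. An
// illustrative call site (hypothetical, not the actual Iceberg test code) that
// registers a modulo-bucketing function and builds a matching partitioned table:
spark.udf().register("bucket4",
    (UDF1<Long, Integer>) id -> (int) (id % 4), DataTypes.IntegerType);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build();
File location = buildPartitionedTable("bucketed_by_id", spec, "bucket4", "id");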