@Test
public void testSetOperation() {
  List<String> data = Arrays.asList("abc", "abc", "xyz");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());

  Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList()));

  List<String> data2 = Arrays.asList("xyz", "foo", "foo");
  Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING());

  Dataset<String> intersected = ds.intersect(ds2);
  Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList());

  Dataset<String> unioned = ds.union(ds2).union(ds);
  Assert.assertEquals(
      Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"),
      unioned.collectAsList());

  Dataset<String> subtracted = ds.except(ds2);
  Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList());
}
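Note that Dataset.union follows SQL UNION ALL semantics: duplicate rows are preserved (hence the nine elements expected from the double union above), so deduplication requires an explicit distinct() call, as in the first assertion.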
private ValueSets withValueSets(Dataset<ValueSet> newValueSets, Dataset<Value> newValues) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newValueSets);

  // Instantiating a new composite ValueSets requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<ValueSet> newValueSetsWithTimestamp = newValueSets
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      this.members.union(newMembers),
      this.valueSets.union(newValueSetsWithTimestamp),
      this.values.union(newValues));
}
protected C withConceptMaps(Dataset<T> newMaps, Dataset<Mapping> newMappings) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newMaps);

  // Instantiating a new composite ConceptMaps requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<T> newMapsWithTimestamp = newMaps
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(conceptMapEncoder);

  return newInstance(spark,
      this.members.union(newMembers),
      this.conceptMaps.union(newMapsWithTimestamp),
      this.mappings.union(newMappings));
}
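Both helpers follow the same immutable-update pattern: rather than modifying the existing composite in place, they stamp the incoming rows with a fresh timestamp and union them into each underlying Dataset, returning a new instance that wraps the combined data.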
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.isEmpty()) {
    throw new RuntimeException("Passthrough deriver requires at least one dependency");
  }

  Iterator<Dataset<Row>> dependencyIterator = dependencies.values().iterator();

  Dataset<Row> unioned = dependencyIterator.next();
  while (dependencyIterator.hasNext()) {
    Dataset<Row> next = dependencyIterator.next();

    if (!unioned.schema().equals(next.schema())) {
      throw new RuntimeException("All dependencies of the passthrough deriver must have the same schema");
    }

    unioned = unioned.union(next);
  }

  return unioned;
}
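Dataset.union aligns columns by position, not by name, so the explicit schema-equality check here prevents dependencies with reordered or differently typed columns from being silently misaligned in the result.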
/**
 * Returns a new hierarchies instance with the given hierarchies.
 *
 * @param hierarchies the hierarchies to add to this instance
 * @return a new instance of Hierarchies
 */
public Hierarchies withHierarchies(Hierarchies hierarchies) {

  Dataset<Ancestor> newAncestors = hierarchies.getAncestors();

  Dataset<UrlAndVersion> newMembers = hierarchies.getMembers();

  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add hierarchies having duplicate uri and version");
  }

  return new Hierarchies(this.spark,
      this.members.union(newMembers),
      this.ancestors.union(newAncestors));
}
@Override
public Dataset<T> union(final org.apache.spark.sql.Dataset<T> other) {
  final boolean userTriggered = initializeFunction(other);
  final Dataset<T> result = from(super.union(other));
  this.setIsUserTriggered(userTriggered);
  return result;
}
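This override comes from a Dataset subclass that decorates Spark's own type: super.union(other) performs the actual union, from(...) rewraps the result so callers keep working with the subclass, and the user-triggered flag captured before the call is restored afterwards.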
/**
 * Carries out a union of two {@link Dataset}s where the input
 * Datasets may contain a different number of columns.
 * The resulting Dataset will contain entries for all of the columns found in
 * the input Datasets, with null entries used as placeholders.
 *
 * @param ds1 the first Dataset
 * @param ds2 the second Dataset
 * @return the combined Dataset
 */
public static Dataset<Row> union(final Dataset<Row> ds1, final Dataset<Row> ds2) {
  Set<String> ds1Cols = Sets.newHashSet(ds1.columns());
  Set<String> ds2Cols = Sets.newHashSet(ds2.columns());
  final Set<String> total = Sets.newHashSet(ds1Cols);
  total.addAll(ds2Cols);

  return ds1.select(expr(ds1Cols, total)).union(ds2.select(expr(ds2Cols, total)));
}
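The expr helper is referenced but not shown in this snippet. Below is a minimal sketch of one plausible implementation, assuming it should produce one Column per name in the combined column set and substitute null literals for columns the given Dataset lacks; the sorted iteration is an added assumption to keep both select lists in the same order, since union aligns columns positionally rather than by name.

import java.util.Set;
import java.util.TreeSet;

import org.apache.spark.sql.Column;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

// Hypothetical helper (not part of the original snippet): for every column in
// the combined set, select the column if this Dataset has it, otherwise pad
// with a null literal under the same name. Sorting the names keeps the column
// order identical on both sides of the positional union.
private static Column[] expr(final Set<String> dsCols, final Set<String> total) {
  return new TreeSet<>(total).stream()
      .map(name -> dsCols.contains(name) ? col(name) : lit(null).alias(name))
      .toArray(Column[]::new);
}

On Spark 3.1 and later, ds1.unionByName(ds2, true) provides comparable null-padding for missing columns out of the box.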
// Accumulate results across rules: the first rule's results seed the Dataset,
// and each subsequent rule's results are unioned in. (The guard condition is
// inferred from this fragment; a null check is one plausible reading.)
if (theResults == null) {
  theResults = rule.check(theDataset, dependencies);
} else {
  theResults = theResults.union(rule.check(theDataset, dependencies));
}
/**
 * Returns a new hierarchies instance with the transitive ancestors computed from the given
 * dataset of {@link HierarchicalElement}.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 */
public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements);

  Dataset<UrlAndVersion> newMembers = newAncestors
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
  }

  return new Hierarchies(this.spark,
      this.members.union(newMembers),
      this.ancestors.union(newAncestors));
}
@Test
public void testUDAF() {
  Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value"));
  UserDefinedAggregateFunction udaf = new MyDoubleSum();
  UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf);

  // Create Columns for the UDAF. For now, callUDF does not take an argument
  // to specify whether we want to use distinct aggregation.
  Dataset<Row> aggregatedDF = df.groupBy()
      .agg(
          udaf.distinct(col("value")),
          udaf.apply(col("value")),
          registeredUDAF.apply(col("value")),
          callUDF("mydoublesum", col("value")));

  List<Row> expectedResult = new ArrayList<>();
  expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0));
  checkAnswer(aggregatedDF, expectedResult);
}
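The expected values follow from the doubled input: the distinct aggregation sums each of 0 through 99 once, giving 99 * 100 / 2 = 4950.0, while the other three aggregations see every value twice because the range is unioned with itself, giving 9900.0.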
final Dataset<Row> destinations = sparkSession
    .sql("select " + SchemaToStructTypeConverter.DST_COL_NAME + " as "
        + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where "
        + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);

final Dataset<Row> vertices = sources.union(destinations).distinct();
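Since union keeps duplicate rows, the trailing distinct() is what reduces the combined source and destination columns to a unique set of vertices.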