@Test
public void testSetOperation() {
  List<String> data = Arrays.asList("abc", "abc", "xyz");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());

  Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList()));

  List<String> data2 = Arrays.asList("xyz", "foo", "foo");
  Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING());

  Dataset<String> intersected = ds.intersect(ds2);
  Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList());

  Dataset<String> unioned = ds.union(ds2).union(ds);
  Assert.assertEquals(
      Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"),
      unioned.collectAsList());

  Dataset<String> subtracted = ds.except(ds2);
  Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList());
}
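Note that Dataset.union follows SQL UNION ALL semantics: duplicate rows are preserved (hence the nine elements expected from the double union above), so deduplication requires an explicit distinct() call, as in the first assertion.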
private ValueSets withValueSets(Dataset<ValueSet> newValueSets, Dataset<Value> newValues) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newValueSets);

  // Instantiating a new composite ValueSets requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<ValueSet> newValueSetsWithTimestamp = newValueSets
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(VALUE_SET_ENCODER);

  return new ValueSets(spark,
      this.members.union(newMembers),
      this.valueSets.union(newValueSetsWithTimestamp),
      this.values.union(newValues));
}
protected C withConceptMaps(Dataset<T> newMaps, Dataset<Mapping> newMappings) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(newMaps);

  // Instantiating a new composite ConceptMaps requires a new timestamp
  Timestamp timestamp = new Timestamp(System.currentTimeMillis());

  Dataset<T> newMapsWithTimestamp = newMaps
      .withColumn("timestamp", lit(timestamp.toString()).cast("timestamp"))
      .as(conceptMapEncoder);

  return newInstance(spark,
      this.members.union(newMembers),
      this.conceptMaps.union(newMapsWithTimestamp),
      this.mappings.union(newMappings));
}
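Both helpers follow the same immutable-update pattern: rather than modifying the existing composite in place, they stamp the incoming rows with a fresh timestamp and union them into each underlying Dataset, returning a new instance that wraps the combined data.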
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.isEmpty()) {
    throw new RuntimeException("Passthrough deriver requires at least one dependency");
  }

  Iterator<Dataset<Row>> dependencyIterator = dependencies.values().iterator();

  Dataset<Row> unioned = dependencyIterator.next();
  while (dependencyIterator.hasNext()) {
    Dataset<Row> next = dependencyIterator.next();

    if (!unioned.schema().equals(next.schema())) {
      throw new RuntimeException("All dependencies of the passthrough deriver must have the same schema");
    }

    unioned = unioned.union(next);
  }

  return unioned;
}
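Dataset.union aligns columns by position, not by name, so the explicit schema-equality check here prevents dependencies with reordered or differently typed columns from being silently misaligned in the result.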
/**
 * Returns a new hierarchies instance with the given hierarchies.
 *
 * @param hierarchies the hierarchies to add to this instance
 * @return a new instance of Hierarchies
 */
public Hierarchies withHierarchies(Hierarchies hierarchies) {

  Dataset<Ancestor> newAncestors = hierarchies.getAncestors();

  Dataset<UrlAndVersion> newMembers = hierarchies.getMembers();

  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add hierarchies having duplicate uri and version");
  }

  return new Hierarchies(this.spark,
      this.members.union(newMembers),
      this.ancestors.union(newAncestors));
}
@Override
public Dataset<T> union(final org.apache.spark.sql.Dataset<T> other) {
  final boolean userTriggered = initializeFunction(other);
  final Dataset<T> result = from(super.union(other));
  this.setIsUserTriggered(userTriggered);
  return result;
}
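This override comes from a Dataset subclass that decorates Spark's own type: super.union(other) performs the actual union, from(...) rewraps the result so callers keep working with the subclass, and the user-triggered flag captured before the call is restored afterwards.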
/**
 * Carries out a union of two {@link Dataset}s where the input
 * Datasets may contain a different number of columns.
 * The resulting Dataset will contain entries for all of the columns found in
 * the input Datasets, with null entries used as placeholders.
 *
 * @param ds1 the first Dataset
 * @param ds2 the second Dataset
 * @return the combined Dataset
 */
public static Dataset<Row> union(final Dataset<Row> ds1, final Dataset<Row> ds2) {
  Set<String> ds1Cols = Sets.newHashSet(ds1.columns());
  Set<String> ds2Cols = Sets.newHashSet(ds2.columns());
  final Set<String> total = Sets.newHashSet(ds1Cols);
  total.addAll(ds2Cols);

  return ds1.select(expr(ds1Cols, total)).union(ds2.select(expr(ds2Cols, total)));
}
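The expr helper is referenced but not shown in this snippet. Below is a minimal sketch of one plausible implementation, assuming it should produce one Column per name in the combined column set and substitute null literals for columns the given Dataset lacks; the sorted iteration is an added assumption to keep both select lists in the same order, since union aligns columns positionally rather than by name.

import java.util.Set;
import java.util.TreeSet;

import org.apache.spark.sql.Column;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

// Hypothetical helper (not part of the original snippet): for every column in
// the combined set, select the column if this Dataset has it, otherwise pad
// with a null literal under the same name. Sorting the names keeps the column
// order identical on both sides of the positional union.
private static Column[] expr(final Set<String> dsCols, final Set<String> total) {
  return new TreeSet<>(total).stream()
      .map(name -> dsCols.contains(name) ? col(name) : lit(null).alias(name))
      .toArray(Column[]::new);
}

On Spark 3.1 and later, ds1.unionByName(ds2, true) provides comparable null-padding for missing columns out of the box.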
// Accumulate results across rules: the first rule's results seed the Dataset,
// and each subsequent rule's results are unioned in. (The guard condition is
// inferred from this fragment; a null check is one plausible reading.)
if (theResults == null) {
  theResults = rule.check(theDataset, dependencies);
} else {
  theResults = theResults.union(rule.check(theDataset, dependencies));
}
/**
 * Returns a new hierarchies instance with the transitive ancestors computed from the given
 * dataset of {@link HierarchicalElement}.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 */
public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements);

  Dataset<UrlAndVersion> newMembers = newAncestors
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
  }

  return new Hierarchies(this.spark,
      this.members.union(newMembers),
      this.ancestors.union(newAncestors));
}
@Test
public void testUDAF() {
  Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value"));
  UserDefinedAggregateFunction udaf = new MyDoubleSum();
  UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf);

  // Create Columns for the UDAF. For now, callUDF does not take an argument
  // to specify whether we want to use distinct aggregation.
  Dataset<Row> aggregatedDF = df.groupBy()
      .agg(
          udaf.distinct(col("value")),
          udaf.apply(col("value")),
          registeredUDAF.apply(col("value")),
          callUDF("mydoublesum", col("value")));

  List<Row> expectedResult = new ArrayList<>();
  expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0));
  checkAnswer(aggregatedDF, expectedResult);
}
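The expected values follow from the doubled input: the distinct aggregation sums each of 0 through 99 once, giving 99 * 100 / 2 = 4950.0, while the other three aggregations see every value twice because the range is unioned with itself, giving 9900.0.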
final Dataset<Row> destinations = sparkSession
    .sql("select " + SchemaToStructTypeConverter.DST_COL_NAME + " as "
        + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where "
        + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);

final Dataset<Row> vertices = sources.union(destinations).distinct();
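Since union keeps duplicate rows, the trailing distinct() is what reduces the combined source and destination columns to a unique set of vertices.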