@Test
public void testSetOperation() {
  // distinct() collapses duplicate rows down to the unique values.
  Dataset<String> ds =
      spark.createDataset(Arrays.asList("abc", "abc", "xyz"), Encoders.STRING());
  Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList()));

  Dataset<String> ds2 =
      spark.createDataset(Arrays.asList("xyz", "foo", "foo"), Encoders.STRING());

  // intersect() keeps only the values present in both datasets.
  Dataset<String> intersected = ds.intersect(ds2);
  Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList());

  // union() concatenates rows without de-duplicating, preserving order.
  Dataset<String> unioned = ds.union(ds2).union(ds);
  Assert.assertEquals(
      Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"),
      unioned.collectAsList());

  // except() removes every row that also appears in the other dataset.
  Dataset<String> subtracted = ds.except(ds2);
  Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList());
}
// Verifies Dataset set semantics on small in-memory string datasets: distinct()
// de-duplicates, intersect() keeps common values, union() concatenates without
// de-duplicating (order preserved), except() removes values present in the other set.
// NOTE(review): this appears to duplicate an identical testSetOperation in this
// SOURCE — confirm whether the repetition is intended or a copy/paste artifact.
@Test public void testSetOperation() { List<String> data = Arrays.asList("abc", "abc", "xyz"); Dataset<String> ds = spark.createDataset(data, Encoders.STRING()); Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList())); List<String> data2 = Arrays.asList("xyz", "foo", "foo"); Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING()); Dataset<String> intersected = ds.intersect(ds2); Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList()); Dataset<String> unioned = ds.union(ds2).union(ds); Assert.assertEquals( Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"), unioned.collectAsList()); Dataset<String> subtracted = ds.except(ds2); Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList()); }
// Exercises the Dataset set operations (distinct / intersect / union / except)
// against hard-coded expected results; union keeps duplicates and ordering.
// NOTE(review): identical copies of this test appear multiple times in this
// SOURCE — likely a snippet-extraction duplicate; confirm before keeping all.
@Test public void testSetOperation() { List<String> data = Arrays.asList("abc", "abc", "xyz"); Dataset<String> ds = spark.createDataset(data, Encoders.STRING()); Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList())); List<String> data2 = Arrays.asList("xyz", "foo", "foo"); Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING()); Dataset<String> intersected = ds.intersect(ds2); Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList()); Dataset<String> unioned = ds.union(ds2).union(ds); Assert.assertEquals( Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"), unioned.collectAsList()); Dataset<String> subtracted = ds.except(ds2); Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList()); }
/**
 * Derives this step's output by de-duplicating the rows of its single dependency.
 *
 * @param dependencies the upstream datasets keyed by step name
 * @return the dependency's rows with duplicates removed
 * @throws Exception if validation of the dependencies fails
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  // Fail fast if the expected dependency is missing or malformed.
  validate(dependencies);

  Dataset<Row> dependency = dependencies.get(stepName);
  return dependency.distinct();
}
// Materializes the IN-list as an Object array.
// For REFERENCE lists, the values are (re)loaded on every call from the distinct
// values of the referenced step's field, collected to the driver and unwrapped
// from their Rows into plain values.
// NOTE(review): for non-REFERENCE list types this assumes the inList field was
// populated elsewhere; if it is still null, toArray() throws an NPE — confirm.
// NOTE(review): the REFERENCE branch recomputes on each call even though it also
// stores into the inList field — confirm whether caching was intended.
Object[] getInList(Map<String, Dataset<Row>> dependencies) {
  if (inListType == InListType.REFERENCE) {
    // Distinct values of the referenced field, collected to the driver.
    List<Row> t = dependencies.get(refStepName).select(getRefFieldName(dependencies)).distinct().collectAsList();
    inList = new ArrayList<Object>(t.size());
    for (Row r : t) {
      inList.add(r.get(0));
    }
  }
  return inList.toArray(new Object[0]);
}
/**
 * Extracts the unique (url, version) pairs present in the given value sets.
 *
 * @param valueSets the value sets to inspect
 * @return a dataset of distinct URL and version tuples
 */
protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> valueSets) {

  // Project down to the identifying columns, then de-duplicate before
  // converting the rows into strongly-typed UrlAndVersion values.
  return valueSets
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);
}
// Projects the value sets down to their identifying (url, version) columns,
// de-duplicates, and returns the rows as typed UrlAndVersion values.
// NOTE(review): identical to another getUrlAndVersions in this SOURCE — likely
// a snippet-extraction duplicate; confirm.
/** * Returns a dataset of distinct URL and version tuples. */ protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> valueSets) { return valueSets.select("url", "version") .distinct() .as(URL_AND_VERSION_ENCODER); }
// Overrides distinct() so the result stays wrapped in this Dataset subtype and
// the user-triggered bookkeeping survives the call: capture the flag returned by
// initializeFunction(), delegate to the parent distinct(), re-wrap via from(),
// then restore the flag before returning.
// NOTE(review): presumably initializeFunction()/from() can change the trigger
// state, which is why it is saved and restored around the call — confirm.
@Override
public Dataset<T> distinct() {
  final boolean userTriggered = initializeFunction();
  final Dataset<T> result = from(super.distinct());
  this.setIsUserTriggered(userTriggered);
  return result;
}
/**
 * Returns a simple dataset of URL and versions of concept maps.
 *
 * @param conceptMaps the concept maps to inspect
 * @return the distinct (url, version) tuples they contain
 */
protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> conceptMaps) {

  // Selecting columns by name is equivalent to functions.col(...) here.
  Dataset<UrlAndVersion> urlAndVersions = conceptMaps
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);

  return urlAndVersions;
}
// Projects concept maps down to their (url, version) columns, de-duplicates, and
// returns the rows as typed UrlAndVersion values.
// NOTE(review): identical to another getUrlAndVersions in this SOURCE — likely a
// snippet-extraction duplicate; confirm.
/** * Returns a simple dataset of URL and versions of concept maps. */ protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> conceptMaps) { return conceptMaps.select(functions.col("url"), functions.col("version")) .distinct() .as(URL_AND_VERSION_ENCODER); }
/**
 * Returns the collection of ancestors from the table in the given database.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  // NOTE(review): the database name is concatenated into the SQL text; it is
  // presumably trusted configuration, not user input — confirm.
  String query = String.format("SELECT * FROM %s.%s", database, ANCESTORS_TABLE);

  Dataset<Ancestor> ancestors = spark.sql(query).as(ANCESTOR_ENCODER);

  // Membership covers only the hierarchies this library manages, identified by
  // URIs carrying the hierarchy prefix.
  FilterFunction<Ancestor> isHierarchy =
      ancestor -> ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(isHierarchy)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
// Loads the ancestors table from the given database, derives the member
// (url, version) tuples from the hierarchy-prefixed URIs, and builds a
// Hierarchies instance from both datasets.
// NOTE(review): identical to another getFromDatabase in this SOURCE — likely a
// snippet-extraction duplicate; confirm.
/** * Returns the collection of ancestors from the table in the given database. * * @param spark the spark session * @param database name of the database containing the ancestors table * @return a Hierarchies instance. */ public static Hierarchies getFromDatabase(SparkSession spark, String database) { Dataset<Ancestor> ancestors = spark.sql("SELECT * FROM " + database + "." + ANCESTORS_TABLE) .as(ANCESTOR_ENCODER); Dataset<UrlAndVersion> members = ancestors.filter((FilterFunction<Ancestor>) ancestor -> ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX)) .select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER); return new Hierarchies(spark, members, ancestors); }
.distinct() .as(URI_AND_VERSION_ENCODER);
.distinct() .as(URI_AND_VERSION_ENCODER);
/**
 * Returns a new hierarchies instance with the transitive ancestors computed from the given
 * dataset of {@link HierarchicalElement}.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 */
public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  // Compute the transitive ancestor closure for the new hierarchy.
  Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements);

  // The (uri, version) pairs identify the hierarchies being added.
  Dataset<UrlAndVersion> newMembers = newAncestors
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  // Reject additions that would collide with hierarchies already present.
  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
  }

  Dataset<UrlAndVersion> allMembers = this.members.union(newMembers);
  Dataset<Ancestor> allAncestors = this.ancestors.union(newAncestors);

  return new Hierarchies(this.spark, allMembers, allAncestors);
}
// Expands the given elements into their transitive ancestors, derives the new
// member (url, version) tuples, rejects duplicates against the existing members,
// and returns a new Hierarchies combining old and new datasets.
// NOTE(review): identical to another withHierarchyElements in this SOURCE —
// likely a snippet-extraction duplicate; confirm.
/** * Returns a new hierarchies instance with the transitive ancestors computed from the given * dataset of {@link HierarchicalElement}. * * @param hierarchyUri the URI of the hierarchical system to add * @param hierarchyVersion the version of the hierarchical system to add * @param elements the elements from which to calculate the ancestors * @return an instance of Hierarchies with the ancestors computed from the given elements */ public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion, Dataset<HierarchicalElement> elements) { Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements); Dataset<UrlAndVersion> newMembers = newAncestors.select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER); if (hasDuplicateUriAndVersions(newMembers)) { throw new IllegalArgumentException( "Cannot add elements having duplicate hierarchyUri and hierarchyVersion"); } return new Hierarchies(this.spark, this.members.union(newMembers), this.ancestors.union(newAncestors)); }
.distinct() .as(URL_AND_VERSION_ENCODER);
/**
 * Returns all value sets that are disjoint with value sets stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {

  String valueSetsTable = database + "." + VALUE_SETS_TABLE;

  // The (url, version) pairs already stored in the target database.
  Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetsTable)
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  // A left-anti join keeps only loaded value sets whose (url, version) is
  // absent from the current members, i.e. the disjoint content.
  Dataset<T> disjointValueSets = valueSetDatasetFromDirectory(path)
      .alias("new")
      .join(
          currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(valueSetEncoder);

  return withValueSets(disjointValueSets);
}
// Loads value sets from the directory and keeps, via a left-anti join on
// (url, version), only those not already stored in the given database, then adds
// them to this collection.
// NOTE(review): identical to another withDisjointValueSetsFromDirectory in this
// SOURCE — likely a snippet-extraction duplicate; confirm.
/** * Returns all value sets that are disjoint with value sets stored in the given database and * adds them to our collection. The directory may be anything readable from a Spark path, * including local filesystems, HDFS, S3, or others. * * @param path a path from which disjoint value sets will be loaded * @param database the database to check value sets against * @return an instance of ValueSets that includes content from that directory that is disjoint * with content already contained in the given database. */ public C withDisjointValueSetsFromDirectory(String path, String database) { Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE) .select("url", "version") .distinct() .as(URL_AND_VERSION_ENCODER) .alias("current"); Dataset<T> valueSets = valueSetDatasetFromDirectory(path) .alias("new") .join(currentMembers, col("new.url").equalTo(col("current.url")) .and(col("new.version").equalTo(col("current.version"))), "leftanti") .as(valueSetEncoder); return withValueSets(valueSets); }
// Selects the destination column of every edge row (filtered to the configured
// edge groups) and exposes it under the vertex column name, mirroring the
// `sources` dataset built earlier in the enclosing method.
// NOTE(review): edgeGroups is concatenated directly into the SQL text — it is
// presumably a pre-formatted, trusted "(...)" list from the schema, not user
// input; confirm against the caller.
final Dataset<Row> destinations = sparkSession.sql("select " + SchemaToStructTypeConverter.DST_COL_NAME + " as " + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where " + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);
// The vertex set is the de-duplicated union of edge sources and destinations.
final Dataset<Row> vertices = sources.union(destinations).distinct();