@Test
public void testSetOperation() {
  // distinct() collapses duplicate rows down to the unique values.
  Dataset<String> ds =
      spark.createDataset(Arrays.asList("abc", "abc", "xyz"), Encoders.STRING());
  Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList()));

  Dataset<String> ds2 =
      spark.createDataset(Arrays.asList("xyz", "foo", "foo"), Encoders.STRING());

  // intersect() keeps only the values present in both datasets.
  Dataset<String> intersected = ds.intersect(ds2);
  Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList());

  // union() concatenates rows without de-duplicating, preserving order.
  Dataset<String> unioned = ds.union(ds2).union(ds);
  Assert.assertEquals(
      Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"),
      unioned.collectAsList());

  // except() removes every row that also appears in the other dataset.
  Dataset<String> subtracted = ds.except(ds2);
  Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList());
}
// Verifies Dataset set semantics on small in-memory string datasets: distinct()
// de-duplicates, intersect() keeps common values, union() concatenates without
// de-duplicating (order preserved), except() removes values present in the other set.
// NOTE(review): this appears to duplicate an identical testSetOperation in this
// SOURCE — confirm whether the repetition is intended or a copy/paste artifact.
@Test public void testSetOperation() { List<String> data = Arrays.asList("abc", "abc", "xyz"); Dataset<String> ds = spark.createDataset(data, Encoders.STRING()); Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList())); List<String> data2 = Arrays.asList("xyz", "foo", "foo"); Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING()); Dataset<String> intersected = ds.intersect(ds2); Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList()); Dataset<String> unioned = ds.union(ds2).union(ds); Assert.assertEquals( Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"), unioned.collectAsList()); Dataset<String> subtracted = ds.except(ds2); Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList()); }
// Exercises the Dataset set operations (distinct / intersect / union / except)
// against hard-coded expected results; union keeps duplicates and ordering.
// NOTE(review): identical copies of this test appear multiple times in this
// SOURCE — likely a snippet-extraction duplicate; confirm before keeping all.
@Test public void testSetOperation() { List<String> data = Arrays.asList("abc", "abc", "xyz"); Dataset<String> ds = spark.createDataset(data, Encoders.STRING()); Assert.assertEquals(asSet("abc", "xyz"), toSet(ds.distinct().collectAsList())); List<String> data2 = Arrays.asList("xyz", "foo", "foo"); Dataset<String> ds2 = spark.createDataset(data2, Encoders.STRING()); Dataset<String> intersected = ds.intersect(ds2); Assert.assertEquals(Arrays.asList("xyz"), intersected.collectAsList()); Dataset<String> unioned = ds.union(ds2).union(ds); Assert.assertEquals( Arrays.asList("abc", "abc", "xyz", "xyz", "foo", "foo", "abc", "abc", "xyz"), unioned.collectAsList()); Dataset<String> subtracted = ds.except(ds2); Assert.assertEquals(Arrays.asList("abc"), subtracted.collectAsList()); }
/**
 * Derives this step's output by de-duplicating the rows of its single dependency.
 *
 * @param dependencies the upstream datasets keyed by step name
 * @return the dependency's rows with duplicates removed
 * @throws Exception if validation of the dependencies fails
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  // Fail fast if the expected dependency is missing or malformed.
  validate(dependencies);

  Dataset<Row> dependency = dependencies.get(stepName);
  return dependency.distinct();
}
// Materializes the IN-list as an Object array.
// For REFERENCE lists, the values are (re)loaded on every call from the distinct
// values of the referenced step's field, collected to the driver and unwrapped
// from their Rows into plain values.
// NOTE(review): for non-REFERENCE list types this assumes the inList field was
// populated elsewhere; if it is still null, toArray() throws an NPE — confirm.
// NOTE(review): the REFERENCE branch recomputes on each call even though it also
// stores into the inList field — confirm whether caching was intended.
Object[] getInList(Map<String, Dataset<Row>> dependencies) {
  if (inListType == InListType.REFERENCE) {
    // Distinct values of the referenced field, collected to the driver.
    List<Row> t = dependencies.get(refStepName).select(getRefFieldName(dependencies)).distinct().collectAsList();
    inList = new ArrayList<Object>(t.size());
    for (Row r : t) {
      inList.add(r.get(0));
    }
  }
  return inList.toArray(new Object[0]);
}
/**
 * Extracts the unique (url, version) pairs present in the given value sets.
 *
 * @param valueSets the value sets to inspect
 * @return a dataset of distinct URL and version tuples
 */
protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> valueSets) {

  // Project down to the identifying columns, then de-duplicate before
  // converting the rows into strongly-typed UrlAndVersion values.
  return valueSets
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);
}
// Projects the value sets down to their identifying (url, version) columns,
// de-duplicates, and returns the rows as typed UrlAndVersion values.
// NOTE(review): identical to another getUrlAndVersions in this SOURCE — likely
// a snippet-extraction duplicate; confirm.
/** * Returns a dataset of distinct URL and version tuples. */ protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> valueSets) { return valueSets.select("url", "version") .distinct() .as(URL_AND_VERSION_ENCODER); }
// Overrides distinct() so the result stays wrapped in this Dataset subtype and
// the user-triggered bookkeeping survives the call: capture the flag returned by
// initializeFunction(), delegate to the parent distinct(), re-wrap via from(),
// then restore the flag before returning.
// NOTE(review): presumably initializeFunction()/from() can change the trigger
// state, which is why it is saved and restored around the call — confirm.
@Override
public Dataset<T> distinct() {
  final boolean userTriggered = initializeFunction();
  final Dataset<T> result = from(super.distinct());
  this.setIsUserTriggered(userTriggered);
  return result;
}
/**
 * Returns a simple dataset of URL and versions of concept maps.
 *
 * @param conceptMaps the concept maps to inspect
 * @return the distinct (url, version) tuples they contain
 */
protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> conceptMaps) {

  // Selecting columns by name is equivalent to functions.col(...) here.
  Dataset<UrlAndVersion> urlAndVersions = conceptMaps
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);

  return urlAndVersions;
}
// Projects concept maps down to their (url, version) columns, de-duplicates, and
// returns the rows as typed UrlAndVersion values.
// NOTE(review): identical to another getUrlAndVersions in this SOURCE — likely a
// snippet-extraction duplicate; confirm.
/** * Returns a simple dataset of URL and versions of concept maps. */ protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> conceptMaps) { return conceptMaps.select(functions.col("url"), functions.col("version")) .distinct() .as(URL_AND_VERSION_ENCODER); }
/**
 * Returns the collection of ancestors from the table in the given database.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  // NOTE(review): the database name is concatenated into the SQL text; it is
  // presumably trusted configuration, not user input — confirm.
  String query = String.format("SELECT * FROM %s.%s", database, ANCESTORS_TABLE);

  Dataset<Ancestor> ancestors = spark.sql(query).as(ANCESTOR_ENCODER);

  // Membership covers only the hierarchies this library manages, identified by
  // URIs carrying the hierarchy prefix.
  FilterFunction<Ancestor> isHierarchy =
      ancestor -> ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(isHierarchy)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
// Loads the ancestors table from the given database, derives the member
// (url, version) tuples from the hierarchy-prefixed URIs, and builds a
// Hierarchies instance from both datasets.
// NOTE(review): identical to another getFromDatabase in this SOURCE — likely a
// snippet-extraction duplicate; confirm.
/** * Returns the collection of ancestors from the table in the given database. * * @param spark the spark session * @param database name of the database containing the ancestors table * @return a Hierarchies instance. */ public static Hierarchies getFromDatabase(SparkSession spark, String database) { Dataset<Ancestor> ancestors = spark.sql("SELECT * FROM " + database + "." + ANCESTORS_TABLE) .as(ANCESTOR_ENCODER); Dataset<UrlAndVersion> members = ancestors.filter((FilterFunction<Ancestor>) ancestor -> ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX)) .select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER); return new Hierarchies(spark, members, ancestors); }
.distinct() .as(URI_AND_VERSION_ENCODER);
.distinct() .as(URI_AND_VERSION_ENCODER);
/**
 * Returns a new hierarchies instance with the transitive ancestors computed from the given
 * dataset of {@link HierarchicalElement}.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 */
public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  // Compute the transitive ancestor closure for the new hierarchy.
  Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements);

  // The (uri, version) pairs identify the hierarchies being added.
  Dataset<UrlAndVersion> newMembers = newAncestors
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  // Reject additions that would collide with hierarchies already present.
  if (hasDuplicateUriAndVersions(newMembers)) {
    throw new IllegalArgumentException(
        "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
  }

  Dataset<UrlAndVersion> allMembers = this.members.union(newMembers);
  Dataset<Ancestor> allAncestors = this.ancestors.union(newAncestors);

  return new Hierarchies(this.spark, allMembers, allAncestors);
}
// Expands the given elements into their transitive ancestors, derives the new
// member (url, version) tuples, rejects duplicates against the existing members,
// and returns a new Hierarchies combining old and new datasets.
// NOTE(review): identical to another withHierarchyElements in this SOURCE —
// likely a snippet-extraction duplicate; confirm.
/** * Returns a new hierarchies instance with the transitive ancestors computed from the given * dataset of {@link HierarchicalElement}. * * @param hierarchyUri the URI of the hierarchical system to add * @param hierarchyVersion the version of the hierarchical system to add * @param elements the elements from which to calculate the ancestors * @return an instance of Hierarchies with the ancestors computed from the given elements */ public Hierarchies withHierarchyElements(String hierarchyUri, String hierarchyVersion, Dataset<HierarchicalElement> elements) { Dataset<Ancestor> newAncestors = expandElements(hierarchyUri, hierarchyVersion, elements); Dataset<UrlAndVersion> newMembers = newAncestors.select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER); if (hasDuplicateUriAndVersions(newMembers)) { throw new IllegalArgumentException( "Cannot add elements having duplicate hierarchyUri and hierarchyVersion"); } return new Hierarchies(this.spark, this.members.union(newMembers), this.ancestors.union(newAncestors)); }
.distinct() .as(URL_AND_VERSION_ENCODER);
/**
 * Returns all value sets that are disjoint with value sets stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {

  String valueSetsTable = database + "." + VALUE_SETS_TABLE;

  // The (url, version) pairs already stored in the target database.
  Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetsTable)
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  // A left-anti join keeps only loaded value sets whose (url, version) is
  // absent from the current members, i.e. the disjoint content.
  Dataset<T> disjointValueSets = valueSetDatasetFromDirectory(path)
      .alias("new")
      .join(
          currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(valueSetEncoder);

  return withValueSets(disjointValueSets);
}
// Loads value sets from the directory and keeps, via a left-anti join on
// (url, version), only those not already stored in the given database, then adds
// them to this collection.
// NOTE(review): identical to another withDisjointValueSetsFromDirectory in this
// SOURCE — likely a snippet-extraction duplicate; confirm.
/** * Returns all value sets that are disjoint with value sets stored in the given database and * adds them to our collection. The directory may be anything readable from a Spark path, * including local filesystems, HDFS, S3, or others. * * @param path a path from which disjoint value sets will be loaded * @param database the database to check value sets against * @return an instance of ValueSets that includes content from that directory that is disjoint * with content already contained in the given database. */ public C withDisjointValueSetsFromDirectory(String path, String database) { Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE) .select("url", "version") .distinct() .as(URL_AND_VERSION_ENCODER) .alias("current"); Dataset<T> valueSets = valueSetDatasetFromDirectory(path) .alias("new") .join(currentMembers, col("new.url").equalTo(col("current.url")) .and(col("new.version").equalTo(col("current.version"))), "leftanti") .as(valueSetEncoder); return withValueSets(valueSets); }
// Selects the destination column of every edge row (filtered to the configured
// edge groups) and exposes it under the vertex column name, mirroring the
// `sources` dataset built earlier in the enclosing method.
// NOTE(review): edgeGroups is concatenated directly into the SQL text — it is
// presumably a pre-formatted, trusted "(...)" list from the schema, not user
// input; confirm against the caller.
final Dataset<Row> destinations = sparkSession.sql("select " + SchemaToStructTypeConverter.DST_COL_NAME + " as " + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where " + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);
// The vertex set is the de-duplicated union of edge sources and destinations.
final Dataset<Row> vertices = sources.union(destinations).distinct();