@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  Dataset<Row> compare;
  Dataset<Row> with;

  if (!dependencies.containsKey(compareDataset)) {
    throw new RuntimeException(
        "Designated comparison target dataset is not a dependency: " + compareDataset);
  } else {
    compare = dependencies.get(compareDataset);
  }

  if (!dependencies.containsKey(withDataset)) {
    throw new RuntimeException(
        "Designated comparison reference dataset is not a dependency: " + withDataset);
  } else {
    with = dependencies.get(withDataset);
  }

  // Left-anti join: keep only the rows of "compare" that have no match in "with"
  // on the configured join fields.
  return compare.join(with, JavaConversions.asScalaBuffer(fields).toList(), "leftanti");
}
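// A minimal, self-contained sketch of the "leftanti" semantics used above, with
// hypothetical data: the result keeps only the rows of the left side that have
// no match on the right.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class LeftAntiSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("leftanti sketch")
        .master("local")
        .getOrCreate();

    Dataset<Row> compare = spark.range(1, 4).toDF("id"); // ids 1, 2, 3
    Dataset<Row> with = spark.range(2, 3).toDF("id");    // id 2 only

    Dataset<Row> diff = compare.join(with,
        compare.col("id").equalTo(with.col("id")), "leftanti");

    diff.show(); // prints the rows with ids 1 and 3
  }
}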
// The following overrides wrap every Dataset.join overload with the same pattern:
// record whether the call was user-triggered, delegate to the parent join,
// wrap the result, and restore the user-triggered flag.

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs) {
  final boolean userTriggered = initializeFunction(right, joinExprs);
  final Dataset<Row> result = from(super.join(right, joinExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right) {
  final boolean userTriggered = initializeFunction(right);
  final Dataset<Row> result = from(super.join(right));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs,
    final String joinType) {
  final boolean userTriggered = initializeFunction(right, joinExprs, joinType);
  final Dataset<Row> result = from(super.join(right, joinExprs, joinType));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right,
    final scala.collection.Seq<String> usingColumns, final String joinType) {
  final boolean userTriggered = initializeFunction(right, usingColumns, joinType);
  final Dataset<Row> result = from(super.join(right, usingColumns, joinType));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final String usingColumn) {
  final boolean userTriggered = initializeFunction(right, usingColumn);
  final Dataset<Row> result = from(super.join(right, usingColumn));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right,
    final scala.collection.Seq<String> usingColumns) {
  final boolean userTriggered = initializeFunction(right, usingColumns);
  final Dataset<Row> result = from(super.join(right, usingColumns));
  this.setIsUserTriggered(userTriggered);
  return result;
}
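// Calling the Seq-based overloads above from Java requires converting a Java list
// to a Scala Seq. A minimal sketch, assuming the same Scala 2.11-era JavaConversions
// helper used elsewhere on this page; "left" and "right" are hypothetical datasets
// that both carry "url" and "version" columns.
import java.util.Arrays;

import scala.collection.JavaConversions;
import scala.collection.Seq;

Seq<String> usingColumns =
    JavaConversions.asScalaBuffer(Arrays.asList("url", "version")).toList();

Dataset<Row> joined = left.join(right, usingColumns, "inner");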
/**
 * Returns all concept maps that are disjoint with concept maps stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint concept maps will be loaded
 * @param database the database to check concept maps against
 * @return an instance of ConceptMaps that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointMapsFromDirectory(String path, String database) {

  Dataset<UrlAndVersion> currentMembers = this.spark
      .sql("SELECT url, version FROM " + database + "." + CONCEPT_MAP_TABLE)
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  Dataset<T> maps = conceptMapsDatasetFromDirectory(path)
      .alias("new")
      .join(currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(conceptMapEncoder);

  return withConceptMaps(maps);
}
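// Hedged usage sketch: load only the concept maps not already present in an
// "ontologies" database. The getDefault factory, the spark session variable, and
// the S3 path are assumptions for illustration, not taken from the snippet above.
ConceptMaps conceptMaps = ConceptMaps.getDefault(spark)
    .withDisjointMapsFromDirectory("s3://bucket/fhir/conceptmaps", "ontologies");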
/**
 * Returns all value sets that are disjoint with value sets stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {

  Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE)
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  Dataset<T> valueSets = valueSetDatasetFromDirectory(path)
      .alias("new")
      .join(currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(valueSetEncoder);

  return withValueSets(valueSets);
}
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books")
      .master("local")
      .getOrCreate();

  String filename = "data/authors.csv";
  // @formatter:off
  Dataset<Row> authorsDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on

  filename = "data/books.csv";
  // @formatter:off
  Dataset<Row> booksDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on

  // A left-anti join keeps only the authors that have no matching book.
  Dataset<Row> libraryDf = authorsDf.join(
      booksDf,
      authorsDf.col("id").equalTo(booksDf.col("authorId")),
      "left_anti");

  libraryDf.show();
  libraryDf.printSchema();
}
}
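// Note: inferSchema forces Spark to scan each CSV an extra time. A hedged
// alternative sketch that pins an explicit schema instead; the "name" column is
// an assumption, since only "id" and "authorId" appear in the joins above.
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

StructType authorsSchema = new StructType()
    .add("id", DataTypes.IntegerType)
    .add("name", DataTypes.StringType); // assumed column

Dataset<Row> authorsDf = spark.read()
    .format("csv")
    .option("header", "true")
    .schema(authorsSchema)
    .load("data/authors.csv");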
Dataset<Row> libraryDf = authorsDf.join(
    booksDf,
    authorsDf.col("id").equalTo(booksDf.col("authorId")),
    "full_outer");
libraryDf.show();
.join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
.withColumn("bookId", booksDf.col("id"))
.join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
.withColumn("bookId", booksDf.col("id"))
.drop(booksDf.col("id"));
.join(
    booksDf,
    authorsDf.col("id").equalTo(booksDf.col("authorId")),
    "full_outer"); // join type was cut off; completed to match the surrounding examples
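// Design note: after the join both inputs contribute a column named "id", so the
// snippets above copy the book's id into "bookId" and then drop it through the
// Column reference booksDf.col("id"). Dropping by name with drop("id") could not
// distinguish between the two identically named columns.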