@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  Dataset<Row> compare;
  Dataset<Row> with;

  if (!dependencies.containsKey(compareDataset)) {
    throw new RuntimeException(
        "Designated comparison target dataset is not a dependency: " + compareDataset);
  } else {
    compare = dependencies.get(compareDataset);
  }

  if (!dependencies.containsKey(withDataset)) {
    throw new RuntimeException(
        "Designated comparison reference dataset is not a dependency: " + withDataset);
  } else {
    with = dependencies.get(withDataset);
  }

  // Left-anti join: keep only the rows of "compare" that have no match in "with"
  // on the configured join fields.
  return compare.join(with, JavaConversions.asScalaBuffer(fields).toList(), "leftanti");
}
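// A minimal, self-contained sketch of the "leftanti" semantics used above, with
// hypothetical data: the result keeps only the rows of the left side that have
// no match on the right.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class LeftAntiSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("leftanti sketch")
        .master("local")
        .getOrCreate();

    Dataset<Row> compare = spark.range(1, 4).toDF("id"); // ids 1, 2, 3
    Dataset<Row> with = spark.range(2, 3).toDF("id");    // id 2 only

    Dataset<Row> diff = compare.join(with,
        compare.col("id").equalTo(with.col("id")), "leftanti");

    diff.show(); // prints the rows with ids 1 and 3
  }
}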
// The following overrides wrap every Dataset.join overload with the same pattern:
// record whether the call was user-triggered, delegate to the parent join,
// wrap the result, and restore the user-triggered flag.

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs) {
  final boolean userTriggered = initializeFunction(right, joinExprs);
  final Dataset<Row> result = from(super.join(right, joinExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right) {
  final boolean userTriggered = initializeFunction(right);
  final Dataset<Row> result = from(super.join(right));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs,
    final String joinType) {
  final boolean userTriggered = initializeFunction(right, joinExprs, joinType);
  final Dataset<Row> result = from(super.join(right, joinExprs, joinType));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right,
    final scala.collection.Seq<String> usingColumns, final String joinType) {
  final boolean userTriggered = initializeFunction(right, usingColumns, joinType);
  final Dataset<Row> result = from(super.join(right, usingColumns, joinType));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final String usingColumn) {
  final boolean userTriggered = initializeFunction(right, usingColumn);
  final Dataset<Row> result = from(super.join(right, usingColumn));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right,
    final scala.collection.Seq<String> usingColumns) {
  final boolean userTriggered = initializeFunction(right, usingColumns);
  final Dataset<Row> result = from(super.join(right, usingColumns));
  this.setIsUserTriggered(userTriggered);
  return result;
}
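// Calling the Seq-based overloads above from Java requires converting a Java list
// to a Scala Seq. A minimal sketch, assuming the same Scala 2.11-era JavaConversions
// helper used elsewhere on this page; "left" and "right" are hypothetical datasets
// that both carry "url" and "version" columns.
import java.util.Arrays;

import scala.collection.JavaConversions;
import scala.collection.Seq;

Seq<String> usingColumns =
    JavaConversions.asScalaBuffer(Arrays.asList("url", "version")).toList();

Dataset<Row> joined = left.join(right, usingColumns, "inner");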
/**
 * Returns all concept maps that are disjoint with concept maps stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint concept maps will be loaded
 * @param database the database to check concept maps against
 * @return an instance of ConceptMaps that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointMapsFromDirectory(String path, String database) {

  Dataset<UrlAndVersion> currentMembers = this.spark
      .sql("SELECT url, version FROM " + database + "." + CONCEPT_MAP_TABLE)
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  Dataset<T> maps = conceptMapsDatasetFromDirectory(path)
      .alias("new")
      .join(currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(conceptMapEncoder);

  return withConceptMaps(maps);
}
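// Hedged usage sketch: load only the concept maps not already present in an
// "ontologies" database. The getDefault factory, the spark session variable, and
// the S3 path are assumptions for illustration, not taken from the snippet above.
ConceptMaps conceptMaps = ConceptMaps.getDefault(spark)
    .withDisjointMapsFromDirectory("s3://bucket/fhir/conceptmaps", "ontologies");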
/**
 * Returns all value sets that are disjoint with value sets stored in the given database and
 * adds them to our collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database.
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {

  Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE)
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER)
      .alias("current");

  Dataset<T> valueSets = valueSetDatasetFromDirectory(path)
      .alias("new")
      .join(currentMembers,
          col("new.url").equalTo(col("current.url"))
              .and(col("new.version").equalTo(col("current.version"))),
          "leftanti")
      .as(valueSetEncoder);

  return withValueSets(valueSets);
}
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books")
      .master("local")
      .getOrCreate();

  String filename = "data/authors.csv";
  // @formatter:off
  Dataset<Row> authorsDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on

  filename = "data/books.csv";
  // @formatter:off
  Dataset<Row> booksDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on

  // A left-anti join keeps only the authors that have no matching book.
  Dataset<Row> libraryDf = authorsDf.join(
      booksDf,
      authorsDf.col("id").equalTo(booksDf.col("authorId")),
      "left_anti");

  libraryDf.show();
  libraryDf.printSchema();
}
}
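// Note: inferSchema forces Spark to scan each CSV an extra time. A hedged
// alternative sketch that pins an explicit schema instead; the "name" column is
// an assumption, since only "id" and "authorId" appear in the joins above.
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

StructType authorsSchema = new StructType()
    .add("id", DataTypes.IntegerType)
    .add("name", DataTypes.StringType); // assumed column

Dataset<Row> authorsDf = spark.read()
    .format("csv")
    .option("header", "true")
    .schema(authorsSchema)
    .load("data/authors.csv");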
Dataset<Row> libraryDf = authorsDf.join(
    booksDf,
    authorsDf.col("id").equalTo(booksDf.col("authorId")),
    "full_outer");
libraryDf.show();
.join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
.withColumn("bookId", booksDf.col("id"))
.join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
.withColumn("bookId", booksDf.col("id"))
.drop(booksDf.col("id"));
.join(
    booksDf,
    authorsDf.col("id").equalTo(booksDf.col("authorId")),
    "full_outer"); // join type was cut off; completed to match the surrounding examples
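// Design note: after the join both inputs contribute a column named "id", so the
// snippets above copy the book's id into "bookId" and then drop it through the
// Column reference booksDf.col("id"). Dropping by name with drop("id") could not
// distinguish between the two identically named columns.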