/**
 * Reads a Hive table through Spark SQL and converts every row into a String array.
 *
 * <p>Each cell is rendered with {@code toString()}; null cells stay null in the
 * resulting array. Intended for Hive storage formats that cannot be read directly.
 *
 * @param sc the active JavaSparkContext whose configuration seeds the SparkSession
 * @param hiveTable the (optionally database-qualified) Hive table name
 * @return an RDD of one String array per row, in the table's column order
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
        .config(sc.getConf())
        .enableHiveSupport()
        .getOrCreate();
    // Fix: use the parameterized Dataset<Row> instead of a raw Dataset (unchecked warning).
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    // Anonymous class (not a lambda) kept deliberately: Spark serializes this Function
    // to the executors, and the file may predate lambda-friendly Spark APIs.
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                // Preserve nulls; stringify everything else.
                result[i] = (o == null) ? null : o.toString();
            }
            return result;
        }
    });
}
/**
 * Looks up {@code tableName} via the superclass and wraps the result in this
 * project's {@code Dataset} type.
 *
 * <p>NOTE(review): {@code initializeFunction} appears to return the previous
 * user-triggered state, which is restored after the wrapped lookup — presumably
 * so the internal {@code super.table} call is not attributed to the user.
 * Confirm against {@code initializeFunction}/{@code setIsUserTriggered}.
 *
 * @param tableName the table to resolve
 * @return the superclass's dataset wrapped via {@code Dataset.from}
 */
@Override
public Dataset<Row> table(final String tableName) {
    // Capture the flag before delegating; semantics of initializeFunction not visible here.
    final boolean userTriggered = initializeFunction(tableName);
    final Dataset<Row> result = Dataset.from(super.table(tableName));
    // Restore the captured flag after the delegated lookup.
    this.setIsUserTriggered(userTriggered);
    return result;
}
/**
 * Projects {@code input} onto the schema of the target table {@code tableName}:
 * columns are emitted in the table's order, and any table column missing from the
 * input is filled with a null literal aliased to that column name.
 *
 * <p>Column-name matching honors Spark's {@code spark.sql.caseSensitive} setting
 * (default false, i.e. case-insensitive).
 *
 * @param input the dataset whose columns should be aligned to the target table
 * @return a dataset with exactly the target table's columns, in table order
 */
public Dataset<Row> alignColumns(Dataset<Row> input) {
    // Fix: fetch the session once (was looked up twice) and use a primitive boolean
    // instead of a boxed Boolean.
    SparkSession session = Contexts.getSparkSession();
    boolean caseSensitive = session.sparkContext().getConf()
        .getBoolean(SPARK_SQL_CASE_SENSITIVE_CONFIG, false);
    // NOTE(review): toLowerCase() uses the default locale; Spark itself folds with
    // Locale.ROOT semantics — confirm this matches the deployment locale.
    Set<String> inputCols = new HashSet<String>();
    // Fix: iterate the array directly; the Arrays.asList wrapper was unnecessary.
    for (String col : input.schema().fieldNames()) {
        inputCols.add(caseSensitive ? col : col.toLowerCase());
    }
    List<String> tableCols = new ArrayList<String>();
    for (String col : session.table(tableName).schema().fieldNames()) {
        tableCols.add(caseSensitive ? col : col.toLowerCase());
    }
    // Build the projection: existing columns pass through, missing ones become nulls.
    List<Column> alignedCols = new ArrayList<Column>();
    for (String column : tableCols) {
        alignedCols.add(inputCols.contains(column)
            ? functions.col(column)
            : functions.lit(null).alias(column));
    }
    return input.select(alignedCols.toArray(new Column[alignedCols.size()]));
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Returns a {@link Schema} for the given table with fresh field ids.
 *
 * <p>Looks up the table's schema with Spark and converts that {@link StructType}
 * into this project's schema representation. Spark/Hive partition columns are
 * part of the looked-up schema and therefore included.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
    final StructType schema = spark.table(name).schema();
    // Convert the Spark schema; the visitor needs the root type for context.
    final Type converted = visit(schema, new SparkTypeToType(schema));
    return new Schema(converted.asNestedType().asStructType().fields());
}
/**
 * Reads a Hive table through Spark SQL and converts every row into a String array.
 *
 * <p>Each cell is rendered with {@code toString()}; null cells stay null in the
 * resulting array. Intended for Hive storage formats that cannot be read directly.
 *
 * @param sc the active JavaSparkContext whose configuration seeds the SparkSession
 * @param hiveTable the (optionally database-qualified) Hive table name
 * @return an RDD of one String array per row, in the table's column order
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
        .config(sc.getConf())
        .enableHiveSupport()
        .getOrCreate();
    // Fix: use the parameterized Dataset<Row> instead of a raw Dataset (unchecked warning).
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    // Anonymous class (not a lambda) kept deliberately: Spark serializes this Function
    // to the executors, and the file may predate lambda-friendly Spark APIs.
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                // Preserve nulls; stringify everything else.
                result[i] = (o == null) ? null : o.toString();
            }
            return result;
        }
    });
}
Dataset<UrlAndVersion> currentMembers = this.spark.table(ancestorsTable) .select(col("uri").alias("url"), col("version")) .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(ancestorsTable) .select(col("uri").alias("url"), col("version")) .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable) .select("url", "version") .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable) .select("url", "version") .distinct()
/**
 * Loads value sets from a directory and keeps only those that do not already
 * exist (by url and version) in the given database, adding them to our
 * collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {
    String valueSetsTable = database + "." + VALUE_SETS_TABLE;
    // Distinct (url, version) pairs already present in the database.
    Dataset<UrlAndVersion> existing = this.spark.table(valueSetsTable)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER)
        .alias("current");
    // Anti-join drops any loaded value set whose url+version already exists.
    Dataset<T> disjoint = valueSetDatasetFromDirectory(path)
        .alias("new")
        .join(existing,
            col("new.url").equalTo(col("current.url"))
                .and(col("new.version").equalTo(col("current.version"))),
            "leftanti")
        .as(valueSetEncoder);
    return withValueSets(disjoint);
}
/**
 * Loads value sets from a directory and keeps only those that do not already
 * exist (by url and version) in the given database, adding them to our
 * collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {
    String valueSetsTable = database + "." + VALUE_SETS_TABLE;
    // Distinct (url, version) pairs already present in the database.
    Dataset<UrlAndVersion> existing = this.spark.table(valueSetsTable)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER)
        .alias("current");
    // Anti-join drops any loaded value set whose url+version already exists.
    Dataset<T> disjoint = valueSetDatasetFromDirectory(path)
        .alias("new")
        .join(existing,
            col("new.url").equalTo(col("current.url"))
                .and(col("new.version").equalTo(col("current.version"))),
            "leftanti")
        .as(valueSetEncoder);
    return withValueSets(disjoint);
}