/**
 * Projects the input onto the columns of the table named by {@code tableName}.
 *
 * <p>The result has one column per table column, in table order: the input column of the
 * same name when the input has one, otherwise a null literal aliased to the table column
 * name. Input columns that the table does not have are not selected.
 *
 * <p>Unless {@code SPARK_SQL_CASE_SENSITIVE_CONFIG} is set to true in the Spark configuration
 * (it is read with a default of false), column names are lower-cased before being matched.
 *
 * @param input the dataset whose columns should be aligned with the table
 * @return the input restricted to, and padded out to, the table's columns
 */
public Dataset<Row> alignColumns(Dataset<Row> input) {
  boolean caseSensitive = Contexts.getSparkSession().sparkContext().getConf()
      .getBoolean(SPARK_SQL_CASE_SENSITIVE_CONFIG, false);

  Set<String> inputCols = new HashSet<String>();
  for (String col : input.schema().fieldNames()) {
    // Locale.ROOT keeps the folding independent of the JVM default locale; with e.g. a
    // Turkish default, "ID".toLowerCase() yields a dotless i rather than "id".
    inputCols.add(caseSensitive ? col : col.toLowerCase(java.util.Locale.ROOT));
  }

  List<Column> alignedCols = new ArrayList<Column>();
  for (String col : Contexts.getSparkSession().table(tableName).schema().fieldNames()) {
    String column = caseSensitive ? col : col.toLowerCase(java.util.Locale.ROOT);

    // Columns missing from the input are filled with nulls under the table's column name.
    alignedCols.add(inputCols.contains(column)
        ? functions.col(column)
        : functions.lit(null).alias(column));
  }

  return input.select(alignedCols.toArray(new Column[0]));
}
.select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER);
/**
 * Loads the ancestors table of the given database and wraps it in a Hierarchies instance.
 *
 * <p>The members of the returned instance are the distinct (uri, version) pairs, with uri
 * exposed as {@code url}, of those ancestor rows whose URI starts with
 * {@code HIERARCHY_URI_PREFIX}.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  String query = "SELECT * FROM " + database + "." + ANCESTORS_TABLE;

  Dataset<Ancestor> ancestors = spark.sql(query).as(ANCESTOR_ENCODER);

  // Only rows whose URI carries the hierarchy prefix contribute to the member list.
  FilterFunction<Ancestor> isHierarchyRow =
      row -> row.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(isHierarchyRow)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
/**
 * Reads every row of the ancestors table in the given database and builds a Hierarchies
 * instance from it.
 *
 * <p>Member entries are derived from the ancestor rows whose URI begins with
 * {@code HIERARCHY_URI_PREFIX}: their uri (renamed to {@code url}) and version, de-duplicated.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  String selectAll = "SELECT * FROM " + database + "." + ANCESTORS_TABLE;

  Dataset<Ancestor> ancestors = spark.sql(selectAll).as(ANCESTOR_ENCODER);

  // Keep only the rows that belong to a hierarchy, judged by their URI prefix.
  FilterFunction<Ancestor> hasHierarchyPrefix =
      candidate -> candidate.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(hasHierarchyPrefix)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
.select(col("uri").alias("url"), col("version")) .distinct() .as(URI_AND_VERSION_ENCODER);
/**
 * Builds a new hierarchies instance that additionally contains the ancestors expanded from
 * the given dataset of {@link HierarchicalElement}.
 *
 * <p>This instance's members and ancestors are unioned with the newly computed ones; the
 * new member entries are the distinct (uri as {@code url}, version) pairs of the expanded
 * ancestors.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 * @throws IllegalArgumentException if the duplicate check on the new members reports a
 *     duplicate URI and version
 */
public Hierarchies withHierarchyElements(String hierarchyUri,
    String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  Dataset<Ancestor> expanded = expandElements(hierarchyUri, hierarchyVersion, elements);

  Dataset<UrlAndVersion> addedMembers = expanded
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  if (!hasDuplicateUriAndVersions(addedMembers)) {

    Dataset<UrlAndVersion> allMembers = this.members.union(addedMembers);
    Dataset<Ancestor> allAncestors = this.ancestors.union(expanded);

    return new Hierarchies(this.spark, allMembers, allAncestors);
  }

  throw new IllegalArgumentException(
      "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
}
/**
 * Returns a copy of these hierarchies extended with the ancestors expanded from the given
 * dataset of {@link HierarchicalElement}.
 *
 * <p>The expanded ancestors yield the new member entries (distinct uri, exposed as
 * {@code url}, and version); both are appended to this instance's existing datasets by union.
 *
 * @param hierarchyUri the URI of the hierarchical system to add
 * @param hierarchyVersion the version of the hierarchical system to add
 * @param elements the elements from which to calculate the ancestors
 * @return an instance of Hierarchies with the ancestors computed from the given elements
 * @throws IllegalArgumentException if the duplicate check on the new members reports a
 *     duplicate URI and version
 */
public Hierarchies withHierarchyElements(String hierarchyUri,
    String hierarchyVersion,
    Dataset<HierarchicalElement> elements) {

  Dataset<Ancestor> computedAncestors =
      expandElements(hierarchyUri, hierarchyVersion, elements);

  Dataset<UrlAndVersion> computedMembers = computedAncestors
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  boolean duplicated = hasDuplicateUriAndVersions(computedMembers);

  if (!duplicated) {

    Dataset<UrlAndVersion> mergedMembers = this.members.union(computedMembers);
    Dataset<Ancestor> mergedAncestors = this.ancestors.union(computedAncestors);

    return new Hierarchies(this.spark, mergedMembers, mergedAncestors);
  }

  throw new IllegalArgumentException(
      "Cannot add elements having duplicate hierarchyUri and hierarchyVersion");
}