/**
 * Reads a Hive table through Spark SQL and converts every row into a String array.
 *
 * <p>Each cell is rendered with {@code toString()}; null cells stay null in the
 * resulting array. Intended for Hive storage formats that cannot be read directly.
 *
 * @param sc the active JavaSparkContext whose configuration seeds the SparkSession
 * @param hiveTable the (optionally database-qualified) Hive table name
 * @return an RDD of one String array per row, in the table's column order
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
        .config(sc.getConf())
        .enableHiveSupport()
        .getOrCreate();
    // Fix: use the parameterized Dataset<Row> instead of a raw Dataset (unchecked warning).
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    // Anonymous class (not a lambda) kept deliberately: Spark serializes this Function
    // to the executors, and the file may predate lambda-friendly Spark APIs.
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                // Preserve nulls; stringify everything else.
                result[i] = (o == null) ? null : o.toString();
            }
            return result;
        }
    });
}
/**
 * Looks up {@code tableName} via the superclass and wraps the result in this
 * project's {@code Dataset} type.
 *
 * <p>NOTE(review): {@code initializeFunction} appears to return the previous
 * user-triggered state, which is restored after the wrapped lookup — presumably
 * so the internal {@code super.table} call is not attributed to the user.
 * Confirm against {@code initializeFunction}/{@code setIsUserTriggered}.
 *
 * @param tableName the table to resolve
 * @return the superclass's dataset wrapped via {@code Dataset.from}
 */
@Override
public Dataset<Row> table(final String tableName) {
    // Capture the flag before delegating; semantics of initializeFunction not visible here.
    final boolean userTriggered = initializeFunction(tableName);
    final Dataset<Row> result = Dataset.from(super.table(tableName));
    // Restore the captured flag after the delegated lookup.
    this.setIsUserTriggered(userTriggered);
    return result;
}
/**
 * Projects {@code input} onto the schema of the target table {@code tableName}:
 * columns are emitted in the table's order, and any table column missing from the
 * input is filled with a null literal aliased to that column name.
 *
 * <p>Column-name matching honors Spark's {@code spark.sql.caseSensitive} setting
 * (default false, i.e. case-insensitive).
 *
 * @param input the dataset whose columns should be aligned to the target table
 * @return a dataset with exactly the target table's columns, in table order
 */
public Dataset<Row> alignColumns(Dataset<Row> input) {
    // Fix: fetch the session once (was looked up twice) and use a primitive boolean
    // instead of a boxed Boolean.
    SparkSession session = Contexts.getSparkSession();
    boolean caseSensitive = session.sparkContext().getConf()
        .getBoolean(SPARK_SQL_CASE_SENSITIVE_CONFIG, false);
    // NOTE(review): toLowerCase() uses the default locale; Spark itself folds with
    // Locale.ROOT semantics — confirm this matches the deployment locale.
    Set<String> inputCols = new HashSet<String>();
    // Fix: iterate the array directly; the Arrays.asList wrapper was unnecessary.
    for (String col : input.schema().fieldNames()) {
        inputCols.add(caseSensitive ? col : col.toLowerCase());
    }
    List<String> tableCols = new ArrayList<String>();
    for (String col : session.table(tableName).schema().fieldNames()) {
        tableCols.add(caseSensitive ? col : col.toLowerCase());
    }
    // Build the projection: existing columns pass through, missing ones become nulls.
    List<Column> alignedCols = new ArrayList<Column>();
    for (String column : tableCols) {
        alignedCols.add(inputCols.contains(column)
            ? functions.col(column)
            : functions.lit(null).alias(column));
    }
    return input.select(alignedCols.toArray(new Column[alignedCols.size()]));
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Builds a {@link ValueSets} collection from the values and value-sets tables
 * stored in the given database.
 *
 * @param spark the spark session
 * @param databaseName name of the database containing the value sets and values tables
 * @return a ValueSets instance backed by those tables
 */
public static ValueSets getFromDatabase(SparkSession spark, String databaseName) {
    // Load the two backing tables with their typed encoders.
    Dataset<ValueSet> valueSets =
        spark.table(databaseName + "." + VALUE_SETS_TABLE).as(VALUE_SET_ENCODER);
    Dataset<Value> values =
        spark.table(databaseName + "." + VALUES_TABLE).as(getValueEncoder());
    // Membership is identified by the (url, version) pairs of the loaded value sets.
    Dataset<UrlAndVersion> members =
        valueSets.select("url", "version").as(URL_AND_VERSION_ENCODER);
    return new ValueSets(spark, members, valueSets, values);
}
/**
 * Returns a {@link Schema} for the given table with fresh field ids.
 *
 * <p>Looks up the table's schema with Spark and converts that {@link StructType}
 * into this project's schema representation. Spark/Hive partition columns are
 * part of the looked-up schema and therefore included.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
    final StructType schema = spark.table(name).schema();
    // Convert the Spark schema; the visitor needs the root type for context.
    final Type converted = visit(schema, new SparkTypeToType(schema));
    return new Schema(converted.asNestedType().asStructType().fields());
}
/**
 * Reads a Hive table through Spark SQL and converts every row into a String array.
 *
 * <p>Each cell is rendered with {@code toString()}; null cells stay null in the
 * resulting array. Intended for Hive storage formats that cannot be read directly.
 *
 * @param sc the active JavaSparkContext whose configuration seeds the SparkSession
 * @param hiveTable the (optionally database-qualified) Hive table name
 * @return an RDD of one String array per row, in the table's column order
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
        .config(sc.getConf())
        .enableHiveSupport()
        .getOrCreate();
    // Fix: use the parameterized Dataset<Row> instead of a raw Dataset (unchecked warning).
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    // Anonymous class (not a lambda) kept deliberately: Spark serializes this Function
    // to the executors, and the file may predate lambda-friendly Spark APIs.
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                // Preserve nulls; stringify everything else.
                result[i] = (o == null) ? null : o.toString();
            }
            return result;
        }
    });
}
Dataset<UrlAndVersion> currentMembers = this.spark.table(ancestorsTable) .select(col("uri").alias("url"), col("version")) .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(ancestorsTable) .select(col("uri").alias("url"), col("version")) .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable) .select("url", "version") .distinct()
Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable) .select("url", "version") .distinct()
/**
 * Loads value sets from a directory and keeps only those that do not already
 * exist (by url and version) in the given database, adding them to our
 * collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {
    String valueSetsTable = database + "." + VALUE_SETS_TABLE;
    // Distinct (url, version) pairs already present in the database.
    Dataset<UrlAndVersion> existing = this.spark.table(valueSetsTable)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER)
        .alias("current");
    // Anti-join drops any loaded value set whose url+version already exists.
    Dataset<T> disjoint = valueSetDatasetFromDirectory(path)
        .alias("new")
        .join(existing,
            col("new.url").equalTo(col("current.url"))
                .and(col("new.version").equalTo(col("current.version"))),
            "leftanti")
        .as(valueSetEncoder);
    return withValueSets(disjoint);
}
/**
 * Loads value sets from a directory and keeps only those that do not already
 * exist (by url and version) in the given database, adding them to our
 * collection. The directory may be anything readable from a Spark path,
 * including local filesystems, HDFS, S3, or others.
 *
 * @param path a path from which disjoint value sets will be loaded
 * @param database the database to check value sets against
 * @return an instance of ValueSets that includes content from that directory that is disjoint
 *     with content already contained in the given database
 */
public C withDisjointValueSetsFromDirectory(String path, String database) {
    String valueSetsTable = database + "." + VALUE_SETS_TABLE;
    // Distinct (url, version) pairs already present in the database.
    Dataset<UrlAndVersion> existing = this.spark.table(valueSetsTable)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER)
        .alias("current");
    // Anti-join drops any loaded value set whose url+version already exists.
    Dataset<T> disjoint = valueSetDatasetFromDirectory(path)
        .alias("new")
        .join(existing,
            col("new.url").equalTo(col("current.url"))
                .and(col("new.version").equalTo(col("current.version"))),
            "leftanti")
        .as(valueSetEncoder);
    return withValueSets(disjoint);
}