/**
 * Saves an RDD of bundles as a database, where each table has the resource
 * name. This offers a simple way to load and query bundles in a system,
 * although users with more sophisticated ETL operations may want to
 * explicitly write different entities.
 *
 * <p>
 * Note this will access the given RDD of bundles once per resource name,
 * so consumers with enough memory should consider calling
 * {@link JavaRDD#cache()} so that RDD is not recomputed for each.
 * </p>
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param database the name of the database to write to
 * @param resourceNames names of resources to be extracted from the bundle and written
 * @throws IllegalArgumentException if the database name is not a simple identifier
 */
public void saveAsDatabase(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    String database,
    String... resourceNames) {

  java.util.Objects.requireNonNull(spark, "spark");
  java.util.Objects.requireNonNull(bundles, "bundles");
  java.util.Objects.requireNonNull(database, "database");

  // The database name is concatenated directly into a SQL statement below,
  // so restrict it to a simple identifier to rule out SQL injection and
  // names that are invalid in the metastore.
  if (!database.matches("[A-Za-z0-9_]+")) {
    throw new IllegalArgumentException("Invalid database name: " + database);
  }

  spark.sql("create database if not exists " + database);

  for (String resourceName : resourceNames) {

    // Extract the resources of this type from the bundles into their own
    // dataset; note this triggers a pass over the RDD per resource name.
    Dataset<?> ds = extractEntry(spark, bundles, resourceName);

    // Locale-independent lowercase so the table name is stable regardless of
    // the JVM default locale (e.g. the Turkish dotless-i problem).
    ds.write().saveAsTable(
        database + "." + resourceName.toLowerCase(java.util.Locale.ROOT));
  }
}
/**
 * Saves an RDD of bundles as a database, where each table has the resource
 * name. This offers a simple way to load and query bundles in a system,
 * although users with more sophisticated ETL operations may want to
 * explicitly write different entities.
 *
 * <p>
 * Note this will access the given RDD of bundles once per resource name,
 * so consumers with enough memory should consider calling
 * {@link JavaRDD#cache()} so that RDD is not recomputed for each.
 * </p>
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param database the name of the database to write to
 * @param resourceNames names of resources to be extracted from the bundle and written
 * @throws IllegalArgumentException if the database name is not a simple identifier
 */
public void saveAsDatabase(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    String database,
    String... resourceNames) {

  // Guard against SQL injection: the database name is interpolated directly
  // into the DDL statement that follows, so only a plain identifier is safe.
  if (database == null || !database.matches("\\w+")) {
    throw new IllegalArgumentException(
        "database must be a simple identifier, got: " + database);
  }

  spark.sql("create database if not exists " + database);

  for (String resourceName : resourceNames) {

    // One pass over the bundle RDD per resource type; callers can cache()
    // the RDD to avoid recomputation.
    Dataset<?> entries = extractEntry(spark, bundles, resourceName);

    // Use Locale.ROOT so table naming does not vary with the default locale.
    entries.write().saveAsTable(
        database + "." + resourceName.toLowerCase(java.util.Locale.ROOT));
  }
}
/** Verifies that a dataset saved as a managed table can be read back via SQL. */
@Test
public void saveTableAndQueryIt() {
  final Map<String, String> writeOptions = new HashMap<>();

  // Persist the dataframe as a JSON-backed managed table.
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(writeOptions)
      .saveAsTable("javaSavedTable");

  // Selecting everything from the table should yield exactly the original rows.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
}
/** Saving to a managed table and querying it should round-trip every row. */
@Test
public void saveTableAndQueryIt() {
  Map<String, String> emptyOptions = new HashMap<>();

  // Write the dataframe out as a JSON table; the builder setters below are
  // independent, so their ordering is immaterial.
  df.write()
      .options(emptyOptions)
      .mode(SaveMode.Append)
      .format("org.apache.spark.sql.json")
      .saveAsTable("javaSavedTable");

  // A full SELECT must return the same rows the dataframe holds.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
}
.format("parquet") .partitionBy("timestamp") .saveAsTable(conceptMapTable);
/** Saves to a path-backed table and verifies both direct load and SQL access. */
@Test
public void saveExternalTableAndQueryIt() {
  final Map<String, String> tableOptions = new HashMap<>();
  tableOptions.put("path", path.toString());

  // Write the dataframe as a JSON table stored at an explicit path.
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(tableOptions)
      .saveAsTable("javaSavedTable");

  // The saved table must contain exactly the original rows.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());

  // Registering the same path as an external table should expose the same
  // data, both through the returned dataset and through SQL.
  Dataset<Row> externalDF =
      sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", tableOptions);

  checkAnswer(externalDF, df.collectAsList());
  checkAnswer(
      sqlContext.sql("SELECT * FROM externalTable"),
      df.collectAsList());
}
/** An external table created over the saved path must mirror the saved data. */
@Test
public void saveExternalTableAndQueryIt() {
  Map<String, String> pathOptions = new HashMap<>();
  pathOptions.put("path", path.toString());

  // Append the dataframe to a JSON table rooted at the given path. The
  // builder calls are order-independent setters.
  df.write()
      .options(pathOptions)
      .mode(SaveMode.Append)
      .format("org.apache.spark.sql.json")
      .saveAsTable("javaSavedTable");

  // Round-trip check on the saved table itself.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());

  // Point an external table at the same files and confirm the contents match
  // via the dataset handle as well as via SQL.
  Dataset<Row> mirrored =
      sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", pathOptions);

  checkAnswer(mirrored, df.collectAsList());
  checkAnswer(
      sqlContext.sql("SELECT * FROM externalTable"),
      df.collectAsList());
}
.format("parquet") .partitionBy("timestamp") .saveAsTable(conceptMapTable);
/**
 * Creating an external table with an explicit (narrower) schema should project
 * the stored data down to just the declared columns.
 */
@Test
public void saveExternalTableWithSchemaAndQueryIt() {
  final Map<String, String> tableOptions = new HashMap<>();
  tableOptions.put("path", path.toString());

  // Persist the dataframe as a path-backed JSON table.
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(tableOptions)
      .saveAsTable("javaSavedTable");

  // Sanity check: the saved table round-trips the original rows.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());

  // Declare a schema containing only column "b".
  List<StructField> schemaFields = new ArrayList<>();
  schemaFields.add(DataTypes.createStructField("b", DataTypes.StringType, true));
  StructType projectedSchema = DataTypes.createStructType(schemaFields);

  // The external table should therefore expose only that column's values.
  Dataset<Row> externalDF = sqlContext.createExternalTable(
      "externalTable", "org.apache.spark.sql.json", projectedSchema, tableOptions);

  checkAnswer(
      externalDF,
      sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList());
  checkAnswer(
      sqlContext.sql("SELECT * FROM externalTable"),
      sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList());
}
.format("parquet") .partitionBy("timestamp") .saveAsTable(valueSetTable);
.format("parquet") .partitionBy("timestamp") .saveAsTable(valueSetTable);
/**
 * An external table declared with a single-column schema over the saved data
 * must behave like a projection of the underlying table.
 */
@Test
public void saveExternalTableWithSchemaAndQueryIt() {
  Map<String, String> pathOptions = new HashMap<>();
  pathOptions.put("path", path.toString());

  // Write out the dataframe as JSON at the given path; setter order on the
  // writer builder is immaterial.
  df.write()
      .options(pathOptions)
      .mode(SaveMode.Append)
      .format("org.apache.spark.sql.json")
      .saveAsTable("javaSavedTable");

  // The managed view of the data must match the source rows.
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());

  // Build a one-column schema selecting only "b".
  List<StructField> singleColumn = new ArrayList<>();
  singleColumn.add(DataTypes.createStructField("b", DataTypes.StringType, true));
  StructType narrowSchema = DataTypes.createStructType(singleColumn);

  Dataset<Row> projected = sqlContext.createExternalTable(
      "externalTable", "org.apache.spark.sql.json", narrowSchema, pathOptions);

  // Both the dataset handle and SQL over the external table should equal a
  // SELECT of column "b" from the saved table.
  checkAnswer(
      projected,
      sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList());
  checkAnswer(
      sqlContext.sql("SELECT * FROM externalTable"),
      sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList());
}