private Dataset<Row> readParquet(String path) {
  LOG.debug("Reading Parquet: {}", path);

  return Contexts.getSparkSession().read().parquet(path);
}
@SuppressWarnings({"rawtypes", "unchecked"})
private Dataset<Row> readInputFormat(String path) throws Exception {
  LOG.debug("Reading InputFormat[{}]: {}", inputType, path);

  Class<? extends InputFormat> typeClazz = Class.forName(inputType).asSubclass(InputFormat.class);
  Class<?> keyClazz = Class.forName(keyType);
  Class<?> valueClazz = Class.forName(valueType);

  @SuppressWarnings("resource")
  JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());
  JavaPairRDD<?, ?> rdd = context.newAPIHadoopFile(path, typeClazz, keyClazz, valueClazz, new Configuration());

  TranslateFunction translateFunction = new TranslateFunction(translatorConfig);

  return Contexts.getSparkSession().createDataFrame(rdd.flatMap(translateFunction), translateFunction.getSchema());
}
private Dataset<Row> readJSON(String path) {
  LOG.debug("Reading JSON: {}", path);

  if (null != schema) {
    return Contexts.getSparkSession().read().schema(schema).json(path);
  } else {
    return Contexts.getSparkSession().read().json(path);
  }
}
@Override
public Dataset<Row> read() throws Exception {
  String connection = config.getString(CONNECTION_CONFIG_NAME);
  String tableName = config.getString(TABLE_NAME_CONFIG_NAME);

  Dataset<Row> tableDF = Contexts.getSparkSession().read()
      .format("org.apache.kudu.spark.kudu")
      .option("kudu.master", connection)
      .option("kudu.table", tableName)
      .load();

  return tableDF;
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(stepName)) {
    throw new RuntimeException("Step not found in the dependencies list");
  }

  Dataset<Row> sourceStep = dependencies.get(stepName);

  // Apply the morphline to each Row of the dependency step
  JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
      MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

  // Convert the resulting Rows into a new DataFrame
  return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  String query = config.getString("query.literal");

  Dataset<Row> derived = Contexts.getSparkSession().sql(query);

  return derived;
}
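// Illustrative sketch (not from the original source): the deriver above appears to rely on
// each dependency step already being registered as a Spark temporary view, so that
// query.literal can reference it by name. The view name "numbers_step" is hypothetical.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SqlDeriverSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[1]")
        .appName("sql-deriver-sketch")
        .getOrCreate();

    // Register a dependency DataFrame under the step name the query will refer to
    spark.range(5).toDF("id").createOrReplaceTempView("numbers_step");

    // Equivalent of configuring query.literal = "SELECT count(*) AS c FROM numbers_step"
    Dataset<Row> derived = spark.sql("SELECT count(*) AS c FROM numbers_step");
    derived.show();

    spark.stop();
  }
}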
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  boolean schemasMatch = schemasMatch(requiredSchema, dataset.schema(), exactMatch);

  List<Row> datasetRows = Lists.newArrayList((Row) new RowWithSchema(SCHEMA, name, schemasMatch));

  return Contexts.getSparkSession().createDataFrame(datasetRows, SCHEMA);
}
private Dataset<Row> readCSV(String path) {
  LOG.debug("Reading CSV: {}", path);

  if (null != schema) {
    return Contexts.getSparkSession().read().schema(schema).options(options).csv(path);
  } else {
    return Contexts.getSparkSession().read().options(options).csv(path);
  }
}
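// Hedged sketch (an assumption, not part of the original source): one way the schema field
// consumed by readJSON and readCSV above could be constructed, using Spark's public
// StructType API. The field names and types are purely illustrative.
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaSketch {
  static StructType exampleSchema() {
    return DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("id", DataTypes.LongType, false),
        DataTypes.createStructField("name", DataTypes.StringType, true),
        DataTypes.createStructField("amount", DataTypes.createDecimalType(10, 2), true)
    });
  }
}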
private static void initializeStreamingJob() {
  int batchMilliseconds = INSTANCE.config.getInt(BATCH_MILLISECONDS_PROPERTY);
  final Duration batchDuration = Durations.milliseconds(batchMilliseconds);

  JavaStreamingContext jsc = new JavaStreamingContext(
      new JavaSparkContext(getSparkSession().sparkContext()), batchDuration);

  INSTANCE.jsc = jsc;
}
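// Hedged, self-contained sketch (an assumption, not the project's code) of how a
// JavaStreamingContext built with a millisecond batch duration, as above, is typically
// driven: attach at least one output operation, start, and await termination.
import java.util.Collections;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class StreamingContextSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("streaming-sketch");
    JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.milliseconds(1000));

    // A queue-backed stream keeps the sketch runnable without an external source
    Queue<JavaRDD<String>> queue = new LinkedList<>();
    queue.add(jsc.sparkContext().parallelize(Collections.singletonList("hello")));
    jsc.queueStream(queue).print();

    jsc.start();
    jsc.awaitTerminationOrTimeout(5000);
    jsc.stop();
  }
}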
@Test
public void testHiveEnabledByDefault() {
  Contexts.initialize(ConfigFactory.empty(), Contexts.ExecutionMode.BATCH);

  Contexts.getSparkSession().sql("CREATE TABLE testHiveEnabled(d int)");
  Contexts.getSparkSession().sql("SELECT count(*) from testHiveEnabled");
  Contexts.getSparkSession().sql("DROP TABLE testHiveEnabled");
}
@Test
public void testDriverMemoryClusterMode() {
  Properties props = new Properties();
  props.setProperty(Contexts.APPLICATION_SECTION_PREFIX + "." + Contexts.SPARK_CONF_PROPERTY_PREFIX + "." +
      Contexts.SPARK_DEPLOY_MODE_PROPERTY, Contexts.SPARK_DEPLOY_MODE_CLUSTER);
  props.setProperty(Contexts.APPLICATION_SECTION_PREFIX + "." + Contexts.DRIVER_MEMORY_PROPERTY, "2G");
  Config config = ConfigFactory.parseProperties(props);

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("2G", sparkConf.get(Contexts.SPARK_DRIVER_MEMORY_PROPERTY));
}
@Test
public void testApplicationNameProvided() {
  Properties props = new Properties();
  props.setProperty("application.name", "test");
  Config config = ConfigFactory.parseProperties(props);

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("test", sparkConf.get("spark.app.name"));
}
private Dataset<Row> readText(String path) throws Exception {
  Dataset<Row> lines = Contexts.getSparkSession().read().text(path);

  if (translatorConfig != null) {
    Dataset<Tuple2<String, String>> keyedLines = lines.map(
        new PrepareLineForTranslationFunction(), Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

    TranslateFunction<String, String> translateFunction = getTranslateFunction(translatorConfig);

    return keyedLines.flatMap(translateFunction, RowEncoder.apply(translateFunction.getSchema()));
  } else {
    return lines;
  }
}
@Test
public void testSparkPassthroughGood() {
  Config config = ConfigUtils.configFromPath(
      this.getClass().getResource(RESOURCES_PATH + "/spark-passthrough-good.conf").getPath());

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertTrue(sparkConf.contains("spark.driver.allowMultipleContexts"));
  assertEquals("true", sparkConf.get("spark.driver.allowMultipleContexts"));
  assertTrue(sparkConf.contains("spark.master"));
  assertEquals("local[1]", sparkConf.get("spark.master"));
}
@Test
public void testApplicationNameNotProvided() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("", sparkConf.get("spark.app.name"));
}
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
  return Contexts.getSparkSession().range(numPartitions).javaRDD()
      .map(new LongToRowFunction())
      .keyBy(new ItselfFunction<Row>())
      .repartition(numPartitions);
}
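// Hedged sketch (assumption): plausible shapes for the two helper functions referenced by
// getDummyRDD above. The real LongToRowFunction and ItselfFunction may differ; these
// "Sketch" variants only show how the map/keyBy calls type-check.
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

@SuppressWarnings("serial")
class LongToRowFunctionSketch implements Function<Long, Row> {
  @Override
  public Row call(Long value) {
    // Wrap the generated long in a single-column Row
    return RowFactory.create(value);
  }
}

@SuppressWarnings("serial")
class ItselfFunctionSketch<T> implements Function<T, T> {
  @Override
  public T call(T value) {
    // Identity function, used by keyBy to key each Row by itself
    return value;
  }
}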
@BeforeClass
public static void before() throws IOException {
  securityConfig = ConfigUtils.getOrElse(config,
      APPLICATION_SECTION_PREFIX + "." + SECURITY_PREFIX, ConfigFactory.empty());

  List<Path> files = SecurityUtils.getExistingTokenStoreFiles(securityConfig, hadoopConf, true);
  SecurityUtils.deleteTokenStoreFiles(files, 0, hadoopConf);

  Contexts.closeSparkSession(true);
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  Contexts.getSparkSession();
}
@Test
public void testDefaultUnitTestConfiguration() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("in-memory", sparkConf.get("spark.sql.catalogImplementation"));
  assertEquals("1", sparkConf.get("spark.sql.shuffle.partitions"));
}
@Test
public void testDefaultStreamingConfiguration() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.STREAMING);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertTrue(sparkConf.contains("spark.dynamicAllocation.enabled"));
  assertTrue(sparkConf.contains("spark.sql.shuffle.partitions"));
  assertEquals("hive", sparkConf.get("spark.sql.catalogImplementation"));
}