private Dataset<Row> readParquet(String path) {
  LOG.debug("Reading Parquet: {}", path);

  return Contexts.getSparkSession().read().parquet(path);
}
@SuppressWarnings({"rawtypes", "unchecked"})
private Dataset<Row> readInputFormat(String path) throws Exception {
  LOG.debug("Reading InputFormat[{}]: {}", inputType, path);

  Class<? extends InputFormat> typeClazz = Class.forName(inputType).asSubclass(InputFormat.class);
  Class<?> keyClazz = Class.forName(keyType);
  Class<?> valueClazz = Class.forName(valueType);

  @SuppressWarnings("resource")
  JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());
  JavaPairRDD<?, ?> rdd = context.newAPIHadoopFile(path, typeClazz, keyClazz, valueClazz, new Configuration());

  TranslateFunction translateFunction = new TranslateFunction(translatorConfig);

  return Contexts.getSparkSession().createDataFrame(rdd.flatMap(translateFunction), translateFunction.getSchema());
}
private Dataset<Row> readJSON(String path) {
  LOG.debug("Reading JSON: {}", path);

  if (null != schema) {
    return Contexts.getSparkSession().read().schema(schema).json(path);
  } else {
    return Contexts.getSparkSession().read().json(path);
  }
}
@Override
public Dataset<Row> read() throws Exception {
  String connection = config.getString(CONNECTION_CONFIG_NAME);
  String tableName = config.getString(TABLE_NAME_CONFIG_NAME);

  Dataset<Row> tableDF = Contexts.getSparkSession().read()
      .format("org.apache.kudu.spark.kudu")
      .option("kudu.master", connection)
      .option("kudu.table", tableName)
      .load();

  return tableDF;
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(stepName)) {
    throw new RuntimeException("Step not found in the dependencies list");
  }

  Dataset<Row> sourceStep = dependencies.get(stepName);

  // Apply the morphline to each Row of the dependency step
  JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
      MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

  // Convert the resulting Rows into a new DataFrame
  return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  String query = config.getString("query.literal");

  Dataset<Row> derived = Contexts.getSparkSession().sql(query);

  return derived;
}
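// Illustrative sketch (not from the original source): the deriver above appears to rely on
// each dependency step already being registered as a Spark temporary view, so that
// query.literal can reference it by name. The view name "numbers_step" is hypothetical.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SqlDeriverSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[1]")
        .appName("sql-deriver-sketch")
        .getOrCreate();

    // Register a dependency DataFrame under the step name the query will refer to
    spark.range(5).toDF("id").createOrReplaceTempView("numbers_step");

    // Equivalent of configuring query.literal = "SELECT count(*) AS c FROM numbers_step"
    Dataset<Row> derived = spark.sql("SELECT count(*) AS c FROM numbers_step");
    derived.show();

    spark.stop();
  }
}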
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  boolean schemasMatch = schemasMatch(requiredSchema, dataset.schema(), exactMatch);

  List<Row> datasetRows = Lists.newArrayList((Row) new RowWithSchema(SCHEMA, name, schemasMatch));

  return Contexts.getSparkSession().createDataFrame(datasetRows, SCHEMA);
}
private Dataset<Row> readCSV(String path) {
  LOG.debug("Reading CSV: {}", path);

  if (null != schema) {
    return Contexts.getSparkSession().read().schema(schema).options(options).csv(path);
  } else {
    return Contexts.getSparkSession().read().options(options).csv(path);
  }
}
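// Hedged sketch (an assumption, not part of the original source): one way the schema field
// consumed by readJSON and readCSV above could be constructed, using Spark's public
// StructType API. The field names and types are purely illustrative.
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaSketch {
  static StructType exampleSchema() {
    return DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("id", DataTypes.LongType, false),
        DataTypes.createStructField("name", DataTypes.StringType, true),
        DataTypes.createStructField("amount", DataTypes.createDecimalType(10, 2), true)
    });
  }
}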
private static void initializeStreamingJob() {
  int batchMilliseconds = INSTANCE.config.getInt(BATCH_MILLISECONDS_PROPERTY);
  final Duration batchDuration = Durations.milliseconds(batchMilliseconds);

  JavaStreamingContext jsc = new JavaStreamingContext(
      new JavaSparkContext(getSparkSession().sparkContext()), batchDuration);

  INSTANCE.jsc = jsc;
}
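// Hedged, self-contained sketch (an assumption, not the project's code) of how a
// JavaStreamingContext built with a millisecond batch duration, as above, is typically
// driven: attach at least one output operation, start, and await termination.
import java.util.Collections;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class StreamingContextSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("streaming-sketch");
    JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.milliseconds(1000));

    // A queue-backed stream keeps the sketch runnable without an external source
    Queue<JavaRDD<String>> queue = new LinkedList<>();
    queue.add(jsc.sparkContext().parallelize(Collections.singletonList("hello")));
    jsc.queueStream(queue).print();

    jsc.start();
    jsc.awaitTerminationOrTimeout(5000);
    jsc.stop();
  }
}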
@Test
public void testHiveEnabledByDefault() {
  Contexts.initialize(ConfigFactory.empty(), Contexts.ExecutionMode.BATCH);

  Contexts.getSparkSession().sql("CREATE TABLE testHiveEnabled(d int)");
  Contexts.getSparkSession().sql("SELECT count(*) from testHiveEnabled");
  Contexts.getSparkSession().sql("DROP TABLE testHiveEnabled");
}
@Test
public void testDriverMemoryClusterMode() {
  Properties props = new Properties();
  props.setProperty(Contexts.APPLICATION_SECTION_PREFIX + "." + Contexts.SPARK_CONF_PROPERTY_PREFIX + "." +
      Contexts.SPARK_DEPLOY_MODE_PROPERTY, Contexts.SPARK_DEPLOY_MODE_CLUSTER);
  props.setProperty(Contexts.APPLICATION_SECTION_PREFIX + "." + Contexts.DRIVER_MEMORY_PROPERTY, "2G");
  Config config = ConfigFactory.parseProperties(props);

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("2G", sparkConf.get(Contexts.SPARK_DRIVER_MEMORY_PROPERTY));
}
@Test
public void testApplicationNameProvided() {
  Properties props = new Properties();
  props.setProperty("application.name", "test");
  Config config = ConfigFactory.parseProperties(props);

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("test", sparkConf.get("spark.app.name"));
}
private Dataset<Row> readText(String path) throws Exception {
  Dataset<Row> lines = Contexts.getSparkSession().read().text(path);

  if (translatorConfig != null) {
    Dataset<Tuple2<String, String>> keyedLines = lines.map(
        new PrepareLineForTranslationFunction(), Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

    TranslateFunction<String, String> translateFunction = getTranslateFunction(translatorConfig);

    return keyedLines.flatMap(translateFunction, RowEncoder.apply(translateFunction.getSchema()));
  } else {
    return lines;
  }
}
@Test
public void testSparkPassthroughGood() {
  Config config = ConfigUtils.configFromPath(
      this.getClass().getResource(RESOURCES_PATH + "/spark-passthrough-good.conf").getPath());

  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertTrue(sparkConf.contains("spark.driver.allowMultipleContexts"));
  assertEquals("true", sparkConf.get("spark.driver.allowMultipleContexts"));
  assertTrue(sparkConf.contains("spark.master"));
  assertEquals("local[1]", sparkConf.get("spark.master"));
}
@Test
public void testApplicationNameNotProvided() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("", sparkConf.get("spark.app.name"));
}
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
  return Contexts.getSparkSession().range(numPartitions).javaRDD()
      .map(new LongToRowFunction())
      .keyBy(new ItselfFunction<Row>())
      .repartition(numPartitions);
}
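// Hedged sketch (assumption): plausible shapes for the two helper functions referenced by
// getDummyRDD above. The real LongToRowFunction and ItselfFunction may differ; these
// "Sketch" variants only show how the map/keyBy calls type-check.
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

@SuppressWarnings("serial")
class LongToRowFunctionSketch implements Function<Long, Row> {
  @Override
  public Row call(Long value) {
    // Wrap the generated long in a single-column Row
    return RowFactory.create(value);
  }
}

@SuppressWarnings("serial")
class ItselfFunctionSketch<T> implements Function<T, T> {
  @Override
  public T call(T value) {
    // Identity function, used by keyBy to key each Row by itself
    return value;
  }
}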
@BeforeClass
public static void before() throws IOException {
  securityConfig = ConfigUtils.getOrElse(config,
      APPLICATION_SECTION_PREFIX + "." + SECURITY_PREFIX, ConfigFactory.empty());

  List<Path> files = SecurityUtils.getExistingTokenStoreFiles(securityConfig, hadoopConf, true);
  SecurityUtils.deleteTokenStoreFiles(files, 0, hadoopConf);

  Contexts.closeSparkSession(true);
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  Contexts.getSparkSession();
}
@Test
public void testDefaultUnitTestConfiguration() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.UNIT_TEST);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertEquals("in-memory", sparkConf.get("spark.sql.catalogImplementation"));
  assertEquals("1", sparkConf.get("spark.sql.shuffle.partitions"));
}
@Test
public void testDefaultStreamingConfiguration() {
  Config config = ConfigFactory.empty();
  Contexts.initialize(config, Contexts.ExecutionMode.STREAMING);
  SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

  assertTrue(sparkConf.contains("spark.dynamicAllocation.enabled"));
  assertTrue(sparkConf.contains("spark.sql.shuffle.partitions"));
  assertEquals("hive", sparkConf.get("spark.sql.catalogImplementation"));
}