public static void modifySparkHadoopConfiguration(SparkContext sc) throws Exception {
    sc.hadoopConfiguration().set("dfs.replication", "2"); // cuboid intermediate files, replication=2
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
            "org.apache.hadoop.io.compress.DefaultCodec"); // or org.apache.hadoop.io.compress.SnappyCodec
}
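A minimal usage sketch, assuming an existing JavaSparkContext jsc and a writable output path (both are assumptions, not part of the snippet above): once the Hadoop configuration of the context has been modified this way, Hadoop-format output written through the same context picks up the replication and compression settings.

// Sketch only: `jsc` and the output path are illustrative.
modifySparkHadoopConfiguration(jsc.sc());
JavaPairRDD<Text, Text> pairs = jsc.parallelize(Arrays.asList("a", "b"))
        .mapToPair(s -> new Tuple2<>(new Text(s), new Text(s)));
pairs.saveAsNewAPIHadoopFile("/tmp/compressed-output", Text.class, Text.class,
        SequenceFileOutputFormat.class, jsc.hadoopConfiguration());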
@Test
public void scalaSparkContext() {
    List<String> jars = List$.MODULE$.empty();
    Map<String, String> environment = Map$.MODULE$.empty();

    new SparkContext(new SparkConf().setMaster("local").setAppName("name")).stop();
    new SparkContext("local", "name", new SparkConf()).stop();
    new SparkContext("local", "name").stop();
    new SparkContext("local", "name", "sparkHome").stop();
    new SparkContext("local", "name", "sparkHome", jars).stop();
    new SparkContext("local", "name", "sparkHome", jars, environment).stop();
}
// Fragment: kryoClassArray collects the classes to register with Kryo; only the last entry
// survives in this excerpt.
Class[] kryoClassArray = new Class[] { /* ... other registered classes ... */
        Class.forName("scala.collection.mutable.WrappedArray$ofRef") };

SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    // ... (job body continues)
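With spark.kryo.registrationRequired=true, Kryo refuses to serialize any class that has not been registered, which is why the registrator and the explicit class array both matter. A minimal registrator sketch, assuming illustrative class names (this is not Kylin's actual KylinKryoRegistrator):

import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;

// Hypothetical registrator; the registered types below are illustrative only.
public class DemoKryoRegistrator implements KryoRegistrator {
    @Override
    public void registerClasses(Kryo kryo) {
        kryo.register(java.util.ArrayList.class);        // every type that crosses a shuffle
        kryo.register(org.apache.hadoop.io.Text.class);  // must be registered explicitly
    }
}

It would be wired in the same way as above, e.g. conf.set("spark.kryo.registrator", DemoKryoRegistrator.class.getName()).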
public static void main(String[] args) throws Exception {
    assertNotEquals(0, args.length);
    assertEquals(args[0], "hello");
    new SparkContext().stop();

    synchronized (LOCK) {
        LOCK.notifyAll();
    }
}
spark.sparkContext().conf().set("spark.files.overwrite", "true");
spark.sparkContext().conf()
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
spark.sparkContext().conf().registerKryoClasses(SparkUtils.zksparkClasses());
sc = new JavaSparkContext(spark.sparkContext());
final Configuration config = new Configuration(numExecutors, numCores,
        // ... (remaining constructor arguments truncated in the source)
private void start() {
    SparkConf conf = new SparkConf().setAppName("Concurrency Lab 001")
            .setMaster(Config.MASTER);
    JavaSparkContext sc = new JavaSparkContext(conf);
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

    conf = spark.sparkContext().conf();
    System.out.println(conf.get("hello"));

    Dataset<Row> df = spark.sql("SELECT * from myView");
    df.show();
}
SparkConf conf = new SparkConf();
String serverAddress = null;
int serverPort = -1;
for (Tuple2<String, String> e : conf.getAll()) {
    mapConf.put(e._1(), e._2());
    LOG.debug("Remote Spark Driver configured with: " + e._1() + "=" + e._2());
}

JavaSparkContext sc = new JavaSparkContext(conf);
sc.sc().addSparkListener(new ClientListener());
synchronized (jcLock) {
    jc = new JobContextImpl(sc, localTmpDir);
    // ... (fragment continues)
private LocalHiveSparkClient(SparkConf sparkConf, HiveConf hiveConf)
        throws FileNotFoundException, MalformedURLException {
    String regJar = null;
    // the registrator jar should already be in CP when not in test mode
    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_IN_TEST)) {
        String kryoReg = sparkConf.get("spark.kryo.registrator", "");
        if (SparkClientUtilities.HIVE_KRYO_REG_NAME.equals(kryoReg)) {
            regJar = SparkClientUtilities.findKryoRegistratorJar(hiveConf);
            SparkClientUtilities.addJarToContextLoader(new File(regJar));
        }
    }
    sc = new JavaSparkContext(sparkConf);
    if (regJar != null) {
        sc.addJar(regJar);
    }
    jobMetricsListener = new JobMetricsListener();
    sc.sc().addSparkListener(jobMetricsListener);
}
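Several of these snippets attach a listener via sc.sc().addSparkListener(...). For reference, a minimal listener can be written by extending org.apache.spark.scheduler.SparkListener and overriding only the callbacks of interest; the class below is an illustrative sketch, not Hive's actual JobMetricsListener:

import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerJobEnd;
import org.apache.spark.scheduler.SparkListenerJobStart;

// Illustrative listener: logs job lifecycle events to stdout.
public class DemoJobListener extends SparkListener {
    @Override
    public void onJobStart(SparkListenerJobStart jobStart) {
        System.out.println("Job " + jobStart.jobId() + " started with "
                + jobStart.stageIds().size() + " stage(s)");
    }

    @Override
    public void onJobEnd(SparkListenerJobEnd jobEnd) {
        System.out.println("Job " + jobEnd.jobId() + " finished: " + jobEnd.jobResult());
    }
}

It would be attached the same way: sc.sc().addSparkListener(new DemoJobListener());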
@Test
public void foreachPartition() {
    LongAccumulator accum = sc.sc().longAccumulator();
    JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello", "World"));
    rdd.foreachPartition(iter -> {
        while (iter.hasNext()) {
            iter.next();
            accum.add(1);
        }
    });
    assertEquals(2, accum.value().intValue());
}
private LinkedBlockingQueue<Kryo> initialize(final Configuration configuration) {
    // DCL is safe in this case due to volatility
    if (!INITIALIZED) {
        synchronized (UnshadedKryoShimService.class) {
            if (!INITIALIZED) {
                // so we don't get a WARN that a new configuration is being created within an active context
                final SparkConf sparkConf = null == Spark.getContext()
                        ? new SparkConf()
                        : Spark.getContext().getConf().clone();
                configuration.getKeys().forEachRemaining(
                        key -> sparkConf.set(key, configuration.getProperty(key).toString()));
                final KryoSerializer serializer = new KryoSerializer(sparkConf);
                // Setup a pool backed by our spark.serializer instance
                // Reuse Gryo poolsize for Kryo poolsize (no need to copy this to SparkConf)
                KRYOS.clear();
                final int poolSize = configuration.getInt(
                        GryoPool.CONFIG_IO_GRYO_POOL_SIZE, GryoPool.CONFIG_IO_GRYO_POOL_SIZE_DEFAULT);
                for (int i = 0; i < poolSize; i++) {
                    KRYOS.add(serializer.newKryo());
                }
                INITIALIZED = true;
            }
        }
    }
    return KRYOS;
}
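A borrow/return usage sketch for the pool returned above; configuration and the serialized value are placeholders, and real code would handle InterruptedException from take()/put():

// Illustrative only: borrow a Kryo instance, serialize a value, and return the instance.
LinkedBlockingQueue<Kryo> pool = initialize(configuration);
Kryo kryo = pool.take();                       // blocks until an instance is free
try {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    Output output = new Output(bytes);
    kryo.writeClassAndObject(output, "some value");
    output.flush();
} finally {
    pool.put(kryo);                            // always hand the instance back to the pool
}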
/**
 * Creates the JavaSparkContext if it hasn't been created yet, or returns the existing instance.
 * {@link #addSchema(Schema)} and {@link #addSchemas(Collection)} must not be called once the
 * JavaSparkContext has been created.
 *
 * @return the JavaSparkContext that will be used to execute the JobDags
 */
public JavaSparkContext getOrCreateSparkContext() {
    if (!this.sparkContext.isPresent()) {
        this.sparkContext = Optional.of(new JavaSparkContext(
                SparkUtil.getSparkConf(
                        this.appName,
                        Optional.of(this.schemas),
                        this.serializationClasses,
                        this.conf)));
        this.sparkContext.get().sc().addSparkListener(new SparkEventListener());
        // Adding hadoop configuration to default
        this.sparkContext.get().sc().hadoopConfiguration().addResource(
                new HadoopConfiguration(conf).getHadoopConf());
        this.appId = this.sparkContext.get().sc().applicationId();
    }
    return this.sparkContext.get();
}
@Before
public void runBefore() {
    initialized = (_sc != null);
    if (!initialized) {
        _sc = new SparkContext(conf());
        _jsc = new JavaSparkContext(_sc);
        beforeAllTestCasesHook();
    }
}
@Override
public void initialize(Map<String, Object> puProperties) {
    reader = new SparkEntityReader(kunderaMetadata);
    setExternalProperties(puProperties);
    initializePropertyReader();
    PersistenceUnitMetadata pum = kunderaMetadata.getApplicationMetadata()
            .getPersistenceUnitMetadata(getPersistenceUnit());
    sparkconf = new SparkConf(true);
    configureClientProperties(pum);
    sparkContext = new SparkContext(sparkconf);
    sqlContext = new HiveContext(sparkContext);
}
public static synchronized SnappySharedState create(SparkContext sparkContext) throws SparkException {
    // force in-memory catalog to avoid initializing hive for SnappyData
    final String catalogImpl = sparkContext.conf().get(CATALOG_IMPLEMENTATION, null);
    // there is a small thread-safety issue: if multiple threads are concurrently initializing
    // a normal SparkSession vs a SnappySession, the former can end up with the in-memory catalog too
    sparkContext.conf().set(CATALOG_IMPLEMENTATION, "in-memory");

    createListenerAndUI(sparkContext);

    final SnappySharedState sharedState = new SnappySharedState(sparkContext);

    // reset the catalog implementation to original
    if (catalogImpl != null) {
        sparkContext.conf().set(CATALOG_IMPLEMENTATION, catalogImpl);
    } else {
        sparkContext.conf().remove(CATALOG_IMPLEMENTATION);
    }
    return sharedState;
}
@Override
public int getDefaultParallelism() throws Exception {
    return sc.sc().defaultParallelism();
}
@Override
public int getExecutorCount() {
    // note: getExecutorMemoryStatus() also includes the driver's block manager in the returned map
    return sc.sc().getExecutorMemoryStatus().size();
}
@Override
public void cleanup() {
    jobMetricsListener.cleanup(jobId);
    if (cachedRDDIds != null) {
        for (Integer cachedRDDId : cachedRDDIds) {
            sparkContext.sc().unpersistRDD(cachedRDDId, false);
        }
    }
}
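For context, the ids being unpersisted here are plain RDD ids. A hedged sketch of how such a list could be populated at caching time (inputRdd and the call site are assumptions, not part of the snippet above):

// Illustrative only: remember the id of every RDD we cache so cleanup() can unpersist it later.
JavaRDD<String> cached = inputRdd.persist(StorageLevel.MEMORY_AND_DISK());
cachedRDDIds.add(cached.id());   // JavaRDD.id() exposes the underlying RDD's id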
@Test
public void testForeach() {
    LongAccumulator accum = jsc.sc().longAccumulator();
    List<String> data = Arrays.asList("a", "b", "c");
    Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
    ds.foreach((ForeachFunction<String>) s -> accum.add(1));
    Assert.assertEquals(3, accum.value().intValue());
}
@Test
public void testDefaultStreamingConfiguration() {
    Config config = ConfigFactory.empty();
    Contexts.initialize(config, Contexts.ExecutionMode.STREAMING);
    SparkConf sparkConf = Contexts.getSparkSession().sparkContext().getConf();

    assertTrue(sparkConf.contains("spark.dynamicAllocation.enabled"));
    assertTrue(sparkConf.contains("spark.sql.shuffle.partitions"));
    assertEquals(sparkConf.get("spark.sql.catalogImplementation"), "hive");
}