System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); Tuple2<String, String>[] sparkConfPairs = sparkConf.getAll(); System.out.println("--- sparkConf ---"); for (int i = 0; i < sparkConfPairs.length; i++) { JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); JavaPairRDD<String, ActionData> pairs = dataSet.mapToPair(new PairFunction<String, String, ActionData>() { }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { final String currentClient = client; JavaPairRDD<String, ActionData> filtered_by_client = pairs.filter(new Function<Tuple2<String, ActionData>, Boolean>() {
Class.forName("scala.collection.mutable.WrappedArray$ofRef") }; SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { KylinSparkJobListener jobListener = new KylinSparkJobListener(); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath)); final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1); JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName, metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf)); colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
private SparkConf createSparkConf(List<SparkConfiguration.Configuration> configurations, SparkConf old) {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set(SPARK_EXTRA_LISTENERS, old.get(SPARK_EXTRA_LISTENERS));
    sparkConf.set(BEAKERX_ID, old.get(BEAKERX_ID));
    if (old.contains(SPARK_APP_NAME)) {
        sparkConf.set(SPARK_APP_NAME, old.get(SPARK_APP_NAME));
    }
    configurations.forEach(x -> {
        if (x.getName() != null) {
            sparkConf.set(x.getName(), (x.getValue() != null) ? x.getValue() : "");
        }
    });
    return sparkConf;
}
public static JavaSparkContext createConf() {
    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("animalClass");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    return sc;
}
@Before
public void setUp() {
    SparkConf conf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("test")
            .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
    ssc = new JavaStreamingContext(conf, new Duration(1000));
    ssc.checkpoint("checkpoint");
}
SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", AvgRegistrator.class.getName());
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
    @Override
    public AvgCount call(AvgCount a, Integer x) {
        // implementation omitted in this excerpt
    }
};

AvgCount result = rdd.aggregate(initial, addAndCount, combine);
System.out.println(result.avg());
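// A minimal, self-contained sketch of the same aggregate() average pattern, assuming only the
// Spark Java API. AvgCount here is a local helper class, and the class/app names are illustrative,
// not taken from the snippet above.
import java.io.Serializable;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class BasicAvgSketch {
    public static class AvgCount implements Serializable {
        public int total;
        public int num;
        public AvgCount(int total, int num) { this.total = total; this.num = num; }
        public double avg() { return total / (double) num; }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("basicAvgSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));

        AvgCount initial = new AvgCount(0, 0);
        // Fold one element into the running (total, count) pair within a partition.
        Function2<AvgCount, Integer, AvgCount> addAndCount =
                (a, x) -> new AvgCount(a.total + x, a.num + 1);
        // Merge two partial results coming from different partitions.
        Function2<AvgCount, AvgCount, AvgCount> combine =
                (a, b) -> new AvgCount(a.total + b.total, a.num + b.num);

        AvgCount result = rdd.aggregate(initial, addAndCount, combine);
        System.out.println(result.avg()); // prints 2.5

        sc.stop();
    }
}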
SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); JavaSparkContext sc = new JavaSparkContext(conf); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf)); totalCount = encodedBaseRDD.count(); allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(filename);
JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));

List<String> inputDataCollected = stringData.collect();
System.out.println("\n\n---- Original Data ----");
for (String s : inputDataCollected) System.out.println("'" + s + "'");
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Join Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<List<Writable>> customerInfo = sc.textFile(customerInfoPath).map(new StringToWritablesFunction(rr));
JavaRDD<List<Writable>> purchaseInfo = sc.textFile(purchaseInfoPath).map(new StringToWritablesFunction(rr));

List<List<Writable>> customerInfoList = customerInfo.collect();
List<List<Writable>> purchaseInfoList = purchaseInfo.collect();
Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") }; SparkConf conf = new SparkConf().setAppName("Fact distinct columns for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); final LongAccumulator bytesWritten = sc.sc().longAccumulator(); JavaPairRDD<SelfDefineSortableKey, Text> flatOutputRDD = recordRDD.mapPartitionsToPair( new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten)); JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey( new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum())); .mapPartitionsToPair(new MultiOutputFunction(cubeName, metaUrl, sConf, samplingPercent)); long recordCount = recordRDD.count(); logger.info("Map input records={}", recordCount); logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());
SparkConf conf = new SparkConf().setAppName("Merge segments for cube:" + cubeName + ", segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress KylinSparkJobListener jobListener = new KylinSparkJobListener(); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); CubeSegment sourceSegment = findSourceSegment(path, cubeInstance); JavaPairRDD<Text, Object[]> newEcoddedRdd = segRdd.mapToPair(new ReEncodeCuboidFunction(cubeName, sourceSegment.getUuid(), cubeSegment.getUuid(), metaUrl, sConf)); mergingSegs.add(newEcoddedRdd); sc.union(mergingSegs.toArray(new JavaPairRDD[mergingSegs.size()])) .reduceByKey(reduceFunction, SparkUtil.estimateTotalPartitionNum(cubeStatsReader, envConfig)) .mapToPair(convertTextFunction).saveAsNewAPIHadoopDataset(job.getConfiguration());
@Test
public void javaSparkContext() {
    String[] jars = new String[] {};
    java.util.Map<String, String> environment = new java.util.HashMap<>();

    new JavaSparkContext(new SparkConf().setMaster("local").setAppName("name")).stop();
    new JavaSparkContext("local", "name", new SparkConf()).stop();
    new JavaSparkContext("local", "name").stop();
    new JavaSparkContext("local", "name", "sparkHome", "jarFile").stop();
    new JavaSparkContext("local", "name", "sparkHome", jars).stop();
    new JavaSparkContext("local", "name", "sparkHome", jars, environment).stop();
}
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<String> spam = sc.textFile("files/spam.txt");
    JavaRDD<String> ham = sc.textFile("files/ham.txt");

    // Label spam messages as 1 and ham messages as 0, using term-frequency features.
    JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() {
        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
        }
    });
    JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() {
        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
        }
    });

    JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
    trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm.
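// A self-contained sketch of how such an excerpt typically continues: featurize with HashingTF,
// train a logistic regression model, and score a new message. The feature size, file paths, class
// name, and test sentence are illustrative assumptions, not taken from the snippet above.
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;

public class SpamClassifierSketch {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("SpamClassifierSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // Map each message to a 10,000-dimensional term-frequency vector.
        final HashingTF tf = new HashingTF(10000);
        JavaRDD<String> spam = sc.textFile("files/spam.txt");
        JavaRDD<String> ham = sc.textFile("files/ham.txt");

        JavaRDD<LabeledPoint> positive = spam.map(
                email -> new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" ")))));
        JavaRDD<LabeledPoint> negative = ham.map(
                email -> new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" ")))));

        JavaRDD<LabeledPoint> trainingData = positive.union(negative);
        trainingData.cache(); // logistic regression is iterative, so keep the data in memory

        LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
                .setNumClasses(2)
                .run(trainingData.rdd());

        // Score an unseen message with the same hashing featurizer.
        Vector testFeatures = tf.transform(Arrays.asList("O M G GET cheap stuff".split(" ")));
        System.out.println("Prediction for test example: " + model.predict(testFeatures));

        sc.stop();
    }
}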
String mongodbUri = MONGODB_HOST + args[3];

SparkConf conf = new SparkConf().setAppName("SparkRecommender");
JavaSparkContext sc = new JavaSparkContext(conf);
Logger log = sc.sc().log();

predictionsConfig.set("mongo.output.uri", mongodbUri);

// Read the ratings collection as BSON documents.
JavaPairRDD<Object, BSONObject> bsonRatingsData = sc.newAPIHadoopFile(
        ratingsUri, BSONFileInputFormat.class, Object.class, BSONObject.class, bsonDataConfig);

// Convert each BSON document into an MLlib Rating.
JavaRDD<Rating> ratingsData = bsonRatingsData.map(
        new Function<Tuple2<Object, BSONObject>, Rating>() {
            public Rating call(Tuple2<Object, BSONObject> doc) throws Exception {
                // conversion from BSONObject to Rating omitted in this excerpt
            }
        });

log.warn("ratings = " + ratingsData.count());
log.warn("users = " + userData.count());
log.warn("movies = " + movieData.count());

JavaRDD<Rating> predictions = model.predict(usersMovies.rdd()).toJavaRDD();

// Tail of the call that writes predictions back to MongoDB; the receiver and leading arguments
// are omitted from this excerpt.
        Object.class, Object.class, MongoOutputFormat.class, predictionsConfig);
sc.sc().log().info("predictionsOutput.splits() = " + predictionsOutput.splits().size());
Class[] kryoClassArray = new Class[] { KeyValue.class, RowKeyWritable.class };

SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    sc.sc().addSparkListener(jobListener);

    final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
    if (!fs.exists(partitionFilePath)) {
        throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
    }

    final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
    if (quickPath) {
        hfilerdd = inputRDDs.mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
            @Override
            public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2) throws Exception {
                // conversion omitted in this excerpt
            }
        });
    } else {
        hfilerdd = inputRDDs.flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
            @Override
            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(Tuple2<Text, Text> textTextTuple2) throws Exception {
                // conversion omitted in this excerpt
            }
        });
    }

    // Sort within HFile partitions, then re-key for the HBase bulk-load output format.
    hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys), RowKeyWritable.RowKeyComparator.INSTANCE)
            .mapToPair(new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("basic log query"); JavaSparkContext sc = new JavaSparkContext(sparkConf); logs = sc.textFile(args[0]); logs = sc.parallelize(EXAMPLE_LOGS); logs.mapToPair((String logRecord) -> { String[] tokens = logRecord.split(","); Tuple3<String, String, String> key = Util.createKey(tokens); extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> { Tuple3<String, String, String> t3 = s._1; return (t3._1() != null); // exclude Tuple3(null,null,null) filtered.reduceByKey((LogStatistics stats, LogStatistics stats2) -> stats.merge(stats2)); List<Tuple2<Tuple3<String, String, String>, LogStatistics>> output = counts.collect(); for (Tuple2<?,?> t : output) { System.out.println(t._1() + "\t" + t._2());
        .build(); // end of a builder chain whose start is omitted from this excerpt

SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(directory);
JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));
private SparkConf initializeSparkConf(String pluginNames) {
    return new SparkConf()
            .setMaster("local")
            .setAppName("test")
            .set(EXECUTOR_PLUGIN_CONF_NAME, pluginNames);
}
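// A minimal sketch of how a test might exercise initializeSparkConf. The plugin class name and the
// asserted values are illustrative assumptions, and EXECUTOR_PLUGIN_CONF_NAME is assumed to hold
// the Spark property key for executor plugins.
@Test
public void initializeSparkConfSetsPluginNames() {
    SparkConf conf = initializeSparkConf("com.example.TestExecutorPlugin");

    // Master and app name come from the fixed test settings above.
    assertEquals("local", conf.get("spark.master"));
    assertEquals("test", conf.get("spark.app.name"));

    // The comma-separated plugin list is passed straight through to the plugin property.
    assertEquals("com.example.TestExecutorPlugin", conf.get(EXECUTOR_PLUGIN_CONF_NAME));
}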