public SparkPipelineRuntime(JavaSparkExecutionContext sec, long batchTime) {
    super(sec.getNamespace(), sec.getApplicationSpecification().getName(), batchTime,
          new BasicArguments(sec), sec.getMetrics(), sec.getPluginContext(),
          sec.getServiceDiscoverer(), sec.getSecureStore());
  }
}
@Nullable
  private String getProgramArgs(JavaSparkExecutionContext sec, String stageName) {
    // get program args from plugin properties
    PluginProperties pluginProperties = sec.getPluginContext().getPluginProperties(stageName);
    String programArgs = pluginProperties == null
      ? null : pluginProperties.getProperties().get(ExternalSparkProgram.PROGRAM_ARGS);

    // can be overridden by runtime args
    String programArgsKey = stageName + "." + ExternalSparkProgram.PROGRAM_ARGS;
    if (sec.getRuntimeArguments().containsKey(programArgsKey)) {
      programArgs = sec.getRuntimeArguments().get(programArgsKey);
    }
    return programArgs;
  }
}
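// Illustration only (not part of the original class): how the override order above plays out with
// plain maps. The literal "program.args" key and the stage name "sparkprogram" are hypothetical
// stand-ins for ExternalSparkProgram.PROGRAM_ARGS and a real stage name.
import java.util.HashMap;
import java.util.Map;

public class ProgramArgsResolutionExample {
  public static void main(String[] args) {
    Map<String, String> pluginProperties = new HashMap<>();
    pluginProperties.put("program.args", "--mode batch");              // value set at deploy time

    Map<String, String> runtimeArguments = new HashMap<>();
    runtimeArguments.put("sparkprogram.program.args", "--mode adhoc"); // runtime override

    String programArgs = pluginProperties.get("program.args");
    String overrideKey = "sparkprogram" + "." + "program.args";
    if (runtimeArguments.containsKey(overrideKey)) {
      programArgs = runtimeArguments.get(overrideKey);
    }
    System.out.println(programArgs); // prints "--mode adhoc"
  }
}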
@Override
public void execute(TxRunnable runnable) throws TransactionFailureException {
  sec.execute(runnable);
}
public DefaultStreamingContext(StageSpec stageSpec, JavaSparkExecutionContext sec, JavaStreamingContext jsc) {
  super(new PipelineRuntime(sec.getNamespace(), sec.getApplicationSpecification().getName(),
                            sec.getLogicalStartTime(), new BasicArguments(sec), sec.getMetrics(),
                            sec.getPluginContext(), sec.getServiceDiscoverer(), sec, sec, sec), stageSpec);
  this.sec = sec;
  this.jsc = jsc;
  this.admin = sec.getAdmin();
}
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
  BatchPhaseSpec batchPhaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID),
                                                BatchPhaseSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                       batchPhaseSpec.isStageLoggingEnabled(),
                                                                       batchPhaseSpec.isProcessTimingEnabled());
  Class<?> mainClass = pluginContext.loadPluginClass(stageName);
  if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
    // the plugin implements JavaSparkMain: instantiate it with macro evaluation and delegate to its run()
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec),
                                                              sec.getLogicalStartTime(), sec.getSecureStore(),
                                                              sec.getNamespace());
    JavaSparkMain javaSparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
    javaSparkMain.run(sec);
  } else {
    // otherwise assume the plugin exposes a plain main(String[]) entry point and invoke it reflectively
    String programArgs = getProgramArgs(sec, stageName);
    String[] args = programArgs == null
      ? RuntimeArguments.toPosixArray(sec.getRuntimeArguments()) : programArgs.split(" ");
    final Method mainMethod = mainClass.getMethod("main", String[].class);
    final Object[] methodArgs = new Object[1];
    methodArgs[0] = args;
    // ...
  }
}
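// Sketch (not from the original file): how a String[] is passed to a static main(String[]) through
// reflection, which is what the methodArgs array above is set up for. The Greeter class is a
// made-up stand-in for the user's plugin main class.
import java.lang.reflect.Method;

public class ReflectiveMainExample {
  public static class Greeter {
    public static void main(String[] args) {
      System.out.println("hello " + args[0]);
    }
  }

  public static void main(String[] args) throws Exception {
    Method mainMethod = Greeter.class.getMethod("main", String[].class);
    Object[] methodArgs = new Object[1];
    methodArgs[0] = new String[] {"spark"};   // the whole String[] is a single reflective argument
    mainMethod.invoke(null, methodArgs);      // null receiver because main is static
  }
}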
MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(sec), logicalStartTime,
                                                     sec.getSecureStore(), sec.getNamespace());
final PluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                   stageSpec.isStageLoggingEnabled(),
                                                                   stageSpec.isProcessTimingEnabled());
// logicalStartTime, stageSpec, data and sec come from the enclosing per-batch function
final String stageName = stageSpec.getName();
final SparkExecutionPluginContext sparkPluginContext
  = new SparkStreamingExecutionContext(sec, JavaSparkContext.fromSparkContext(data.rdd().context()),
                                       logicalStartTime, stageSpec);

boolean isPrepared = false;
boolean isDone = false;
try {
  // prepare the stage plugin in its own transaction
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext datasetContext) throws Exception {
      // ...
    }
  });
  isPrepared = true;

  // count incoming records, then run the plugin over the batch in a transaction
  final JavaRDD<T> countedRDD =
    data.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      // ...
    }
  });
  isDone = true;

  // report successful completion in a final transaction
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext datasetContext) throws Exception {
      // ...
    }
  });
} finally {
  if (isPrepared && !isDone) {
    // the batch failed after prepare: report the failure in its own transaction
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        // ...
      }
    });
  }
}
@Override
public Void call(JavaRDD<Alert> data, Time batchTime) throws Exception {
  MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(sec), batchTime.milliseconds(),
                                                       sec.getSecureStore(), sec.getNamespace());
  PluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                               stageSpec.isStageLoggingEnabled(),
                                                               stageSpec.isProcessTimingEnabled());
  String stageName = stageSpec.getName();
  AlertPublisher alertPublisher = pluginContext.newPluginInstance(stageName, evaluator);

  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec, batchTime.milliseconds());
  AlertPublisherContext alertPublisherContext =
    new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, sec.getMessagingContext(), sec.getAdmin());
  alertPublisher.initialize(alertPublisherContext);

  StageMetrics stageMetrics = new DefaultStageMetrics(sec.getMetrics(), stageName);
  TrackedIterator<Alert> trackedAlerts =
    new TrackedIterator<>(data.collect().iterator(), stageMetrics, Constants.Metrics.RECORDS_IN);
  alertPublisher.publish(trackedAlerts);
  alertPublisher.destroy();
  return null;
}
}
@Override
public void run(DatasetContext context) throws Exception {
  BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID),
                                           BatchPhaseSpec.class);
  Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
  try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
    String object = reader.readLine();
    // ... (deserialize the pipeline configuration from this line)

    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                    phaseSpec.isStageLoggingEnabled(),
                                                                    phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator =
      new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
    // stagePartitions and collectors are defined elsewhere in the enclosing class
    runPipeline(phaseSpec.getPhase(), BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors);
  } finally {
    updateWorkflowToken(sec.getWorkflowToken(), collectors);
  }
}
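// Sketch (assumption, not shown in the original): the single JSON line read from the localized
// config file is typically handed to Gson for deserialization, roughly like this. The
// PipelineConfig class is a made-up placeholder for whatever spec object the driver expects.
import com.google.gson.Gson;

public class ConfigLineExample {
  static class PipelineConfig {
    int numPartitions;
    String phaseName;
  }

  public static void main(String[] args) {
    String line = "{\"numPartitions\":4,\"phaseName\":\"phase-1\"}";
    PipelineConfig config = new Gson().fromJson(line, PipelineConfig.class);
    System.out.println(config.phaseName + " -> " + config.numPartitions);
  }
}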
final Metrics metrics = sec.getMetrics();
FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET,
                                                        fileSetArgs);
sec.execute(new TxRunnable() {
  @Override
  public void run(DatasetContext context) throws Exception {
    // ...
  }
});
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> runtimeArguments = sec.getRuntimeArguments();
  String inputDatasetNS = Strings.isNullOrEmpty(runtimeArguments.get(INPUT_DATASET_NAMESPACE))
    ? sec.getNamespace() : runtimeArguments.get(INPUT_DATASET_NAMESPACE);
  String inputDatasetName = Strings.isNullOrEmpty(runtimeArguments.get(INPUT_DATASET_NAME))
    ? "inputDataset" : runtimeArguments.get(INPUT_DATASET_NAME);
  String outputDatasetNS = Strings.isNullOrEmpty(runtimeArguments.get(OUTPUT_DATASET_NAMESPACE))
    ? sec.getNamespace() : runtimeArguments.get(OUTPUT_DATASET_NAMESPACE);
  String outputDatasetName = Strings.isNullOrEmpty(runtimeArguments.get(OUTPUT_DATASET_NAME))
    ? "outputDataset" : runtimeArguments.get(OUTPUT_DATASET_NAME);

  JavaPairRDD<byte[], byte[]> rdd = sec.fromDataset(inputDatasetNS, inputDatasetName);
  sec.saveAsDataset(rdd, outputDatasetNS, outputDatasetName);
}
}
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  String input = sec.getRuntimeArguments().get("input");
  String output = sec.getRuntimeArguments().get("output");

  // read the dataset
  JavaPairRDD<Long, String> inputData = sec.fromDataset(input);
  JavaPairRDD<String, Integer> stringLengths = transformRDD(inputData);

  // write the character count to dataset
  sec.saveAsDataset(stringLengths, output);

  String inputPartitionTime = sec.getRuntimeArguments().get("inputKey");
  String outputPartitionTime = sec.getRuntimeArguments().get("outputKey");

  // read and write datasets with dataset arguments
  if (inputPartitionTime != null && outputPartitionTime != null) {
    Map<String, String> inputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, Long.parseLong(inputPartitionTime) - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, Long.parseLong(inputPartitionTime) + 100);

    // read the dataset with user custom dataset args
    JavaPairRDD<Long, String> customPartitionData = sec.fromDataset(input, inputArgs);

    // create a new RDD with the same key but with a new value which is the length of the string
    JavaPairRDD<String, Integer> customPartitionStringLengths = transformRDD(customPartitionData);

    // write the character count to dataset with user custom dataset args
    Map<String, String> outputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, Long.parseLong(outputPartitionTime));
    sec.saveAsDataset(customPartitionStringLengths, output, outputArgs);
  }
}
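// Illustration only: a runtime-arguments map that would exercise both branches above. The dataset
// names and the 1467000000000 partition timestamp are made-up example values.
import java.util.HashMap;
import java.util.Map;

public class CharCountRuntimeArgsExample {
  public static void main(String[] args) {
    Map<String, String> runtimeArgs = new HashMap<>();
    runtimeArgs.put("input", "lines");                // dataset to read
    runtimeArgs.put("output", "lengths");             // dataset to write
    runtimeArgs.put("inputKey", "1467000000000");     // partition time for the TimePartitionedFileSet read
    runtimeArgs.put("outputKey", "1467000000000");    // partition time for the TimePartitionedFileSet write
    // pass runtimeArgs when starting the Spark program, e.g. through the CDAP UI, CLI, or REST call
    System.out.println(runtimeArgs);
  }
}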
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  JavaPairRDD<LongWritable, Text> rdd = sec.fromDataset(SPARK_INPUT);
  final Object plugin = sec.getPluginContext().newPluginInstance("plugin");
  JavaPairRDD<byte[], Put> resultRDD = rdd.values()
    .map(text -> text + " " + plugin.toString())
    .mapToPair(str -> new Tuple2<>(str.getBytes(Charsets.UTF_8), new Put(str, str, str)));
  sec.saveAsDataset(resultRDD, SPARK_TABLE);
}
}
JavaSparkContext jsc = new JavaSparkContext();
Map<String, String> runtimeArguments = sec.getRuntimeArguments();
String inputFileSet = runtimeArguments.get("input");
final String outputTable = runtimeArguments.get("output");

JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
sec.execute(new TxRunnable() {
  @Override
  public void run(DatasetContext context) throws Exception {
    // ...
  }
});
"Due to spark limitations, macro evaluation is not allowed in streaming sources when checkpointing " + "is enabled."); PluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled()); DataTracer dataTracer = sec.getDataTracer(stageSpec.getName()); StreamingContext sourceContext = new DefaultStreamingContext(stageSpec, sec, streamingContext); JavaDStream<Object> javaDStream = source.getStream(sourceContext); .transform(new CountingTransformFunction<>(stageSpec.getName(), sec.getMetrics(), "records.out", dataTracer)) .map(new WrapOutputTransformFunction<>(stageSpec.getName())); return new DStreamCollection<>(sec, outputDStream);
@Override
public JavaRDD<U> call(JavaRDD<T> data, Time batchTime) throws Exception {
  SparkExecutionPluginContext sparkPluginContext =
    new SparkStreamingExecutionContext(sec, JavaSparkContext.fromSparkContext(data.context()),
                                       batchTime.milliseconds(), stageSpec);
  String stageName = stageSpec.getName();
  data = data.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null));
  return compute.transform(sparkPluginContext, data)
    .map(new CountingFunction<U>(stageName, sec.getMetrics(), "records.out", sec.getDataTracer(stageName)));
}
}
JavaSparkContext jsc = new JavaSparkContext();
Map<String, String> args = sec.getRuntimeArguments();
Preconditions.checkArgument(args.containsKey(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG),
                            "Runtime argument %s must be set.", SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG);
final String localFilePath = URI.create(args.get(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG)).getPath();
JavaRDD<String> fileContents = jsc.textFile(localFilePath, 1);
final TaskLocalizationContext taskLocalizationContext = sec.getLocalizationContext();

JavaPairRDD<byte[], byte[]> rows = fileContents.mapToPair(new PairFunction<String, byte[], byte[]>() {
  @Override
  public Tuple2<byte[], byte[]> call(String line) throws Exception {
    // ... (elided: convert the line into a key/value byte pair)
  }
});
sec.saveAsDataset(rows, OUTPUT_DATASET_NAME);
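// Illustration: why getPath() is applied to the runtime argument above. If the argument carries a
// file: URI, getPath() strips the scheme so a plain local path is handed to textFile(). The URI
// below is a made-up example value.
import java.net.URI;

public class LocalFileUriExample {
  public static void main(String[] args) {
    String arg = "file:/tmp/data/input.txt";
    String localPath = URI.create(arg).getPath();
    System.out.println(localPath); // prints /tmp/data/input.txt
  }
}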
@Override
public JavaStreamingContext call() throws Exception {
  JavaStreamingContext jssc = new JavaStreamingContext(
    new JavaSparkContext(), Durations.milliseconds(pipelineSpec.getBatchIntervalMillis()));
  SparkStreamingPipelineRunner runner = new SparkStreamingPipelineRunner(sec, jssc, pipelineSpec,
                                                                         pipelineSpec.isCheckpointsDisabled());
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                  pipelineSpec.isStageLoggingEnabled(),
                                                                  pipelineSpec.isProcessTimingEnabled());

  // TODO: figure out how to get partitions to use for aggregators and joiners.
  // Seems like they should be set at configure time instead of runtime? but that requires an API change.
  try {
    runner.runPipeline(pipelinePhase, StreamingSource.PLUGIN_TYPE, sec, new HashMap<String, Integer>(),
                       pluginContext, new HashMap<String, StageStatisticsCollector>());
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  if (checkpointDir != null) {
    jssc.checkpoint(checkpointDir);
  }
  return jssc;
}
};
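// Sketch (not from the original file): how a factory like the anonymous class above is normally
// consumed. When the checkpoint directory already holds state, Spark restores the streaming context
// from it and the factory is not called; otherwise the factory builds a fresh context. The
// "/tmp/pipeline-checkpoint" path, app name, and batch interval are made-up example values.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class GetOrCreateExample {
  public static void main(String[] args) {
    String checkpointDir = "/tmp/pipeline-checkpoint";
    Function0<JavaStreamingContext> factory = () -> {
      SparkConf conf = new SparkConf().setAppName("checkpoint-example").setMaster("local[2]");
      JavaStreamingContext created = new JavaStreamingContext(conf, Durations.seconds(10));
      created.checkpoint(checkpointDir);
      return created;
    };
    JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir, factory);
    // jssc.start(); jssc.awaitTermination();  // would start the streaming job
  }
}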
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  JavaPairRDD<byte[], byte[]> input = sec.fromDataset("lines");

  // split each line into words
  JavaRDD<String> words = input.values().flatMap(new FlatMapFunction<byte[], String>() {
    public Iterable<String> call(byte[] line) {
      return Arrays.asList(Bytes.toString(line).split(" "));
    }
  });

  // count occurrences of each word
  JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) {
      return new Tuple2<>(s, 1);
    }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) {
      return a + b;
    }
  });

  // convert keys and values to byte[] for the output table
  JavaPairRDD<byte[], byte[]> result = counts.mapToPair(
    new PairFunction<Tuple2<String, Integer>, byte[], byte[]>() {
      @Override
      public Tuple2<byte[], byte[]> call(Tuple2<String, Integer> input) throws Exception {
        return new Tuple2<>(Bytes.toBytes(input._1()), Bytes.toBytes(input._2()));
      }
    });

  sec.getAdmin().truncateDataset("counts");
  sec.saveAsDataset(result, "counts");
}
}
@Override
public void publishAlerts(StageSpec stageSpec, StageStatisticsCollector collector) throws Exception {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  AlertPublisher alertPublisher = pluginFunctionContext.createPlugin();

  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  AlertPublisherContext alertPublisherContext =
    new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, sec.getMessagingContext(), sec.getAdmin());
  alertPublisher.initialize(alertPublisherContext);

  StageMetrics stageMetrics = new DefaultStageMetrics(sec.getMetrics(), stageSpec.getName());
  TrackedIterator<Alert> trackedAlerts =
    new TrackedIterator<>(((JavaRDD<Alert>) rdd).collect().iterator(), stageMetrics, Constants.Metrics.RECORDS_IN);
  alertPublisher.publish(trackedAlerts);
  alertPublisher.destroy();
}
@Override
public <K, V> void saveAsDataset(JavaPairRDD<K, V> rdd, String datasetName) {
  sec.saveAsDataset(rdd, datasetName);
}