@Override public String toString() { return "DataStreamsPipelineSpec{" + "batchIntervalMillis=" + batchIntervalMillis + ", extraJavaOpts='" + extraJavaOpts + '\'' + ", stopGracefully=" + stopGracefully + ", checkpointsDisabled=" + checkpointsDisabled + ", isUnitTest=" + isUnitTest + ", checkpointDirectory='" + checkpointDirectory + '\'' + "} " + super.toString(); }
public String getStageName() {
  return stageSpec.getName();
}
@Override
public int hashCode() {
  return Objects.hash(super.hashCode(), batchIntervalMillis, extraJavaOpts, stopGracefully,
                      checkpointsDisabled, isUnitTest, checkpointDirectory);
}
public <T> T createPlugin() throws Exception {
  if (Constants.Connector.PLUGIN_TYPE.equals(stageSpec.getPluginType())) {
    String connectorType = stageSpec.getPlugin().getProperties().get(Constants.Connector.TYPE);
    // ok to pass in null to constructors here since we are only going to use the transform method
    if (connectorType.equals(Constants.Connector.SOURCE_TYPE)) {
      return (T) new SingleConnectorSource(null, null);
    } else {
      return (T) new SingleConnectorSink(null, null);
    }
  }
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(arguments, logicalStartTime, secureStore, namespace);
  return getPluginContext().newPluginInstance(stageSpec.getName(), macroEvaluator);
}
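// Hedged usage sketch (not taken from this codebase): executor-side Spark functions are expected
// to create their plugin lazily through createPlugin() instead of serializing plugin instances
// from the driver. The Transform/StructuredRecord types and the ensurePlugin helper below are
// illustrative assumptions; only createPlugin() itself comes from the method above.
private Transform<StructuredRecord, StructuredRecord> transform;

private void ensurePlugin(PluginFunctionContext pluginFunctionContext) throws Exception {
  if (transform == null) {
    // created once per function instance, on the executor where it is actually used
    transform = pluginFunctionContext.createPlugin();
  }
}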
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize(WorkflowContext context) throws Exception {
  super.initialize(context);
  postActions = new LinkedHashMap<>();
  BatchPipelineSpec batchPipelineSpec =
    GSON.fromJson(context.getWorkflowSpecification().getProperty("pipeline.spec"), BatchPipelineSpec.class);
  MacroEvaluator macroEvaluator =
    new DefaultMacroEvaluator(new BasicArguments(context.getToken(), context.getRuntimeArguments()),
                              context.getLogicalStartTime(), context, context.getNamespace());
  postActionSpecs = new HashMap<>();
  for (ActionSpec actionSpec : batchPipelineSpec.getEndingActions()) {
    String name = actionSpec.getName();
    postActions.put(name, (PostAction) context.newPluginInstance(name, macroEvaluator));
    postActionSpecs.put(name, StageSpec.builder(name, actionSpec.getPluginSpec())
      .setProcessTimingEnabled(batchPipelineSpec.isProcessTimingEnabled())
      .setStageLoggingEnabled(batchPipelineSpec.isStageLoggingEnabled())
      .build());
  }
}
private PipelinePluginContext getPluginContext() {
  if (pipelinePluginContext == null) {
    pipelinePluginContext = new SparkPipelinePluginContext(pluginContext, metrics,
                                                           stageSpec.isStageLoggingEnabled(),
                                                           stageSpec.isProcessTimingEnabled());
  }
  return pipelinePluginContext;
}
}
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (o == null || getClass() != o.getClass()) {
    return false;
  }
  if (!super.equals(o)) {
    return false;
  }

  DataStreamsPipelineSpec that = (DataStreamsPipelineSpec) o;

  return batchIntervalMillis == that.batchIntervalMillis &&
    Objects.equals(extraJavaOpts, that.extraJavaOpts) &&
    stopGracefully == that.stopGracefully &&
    checkpointsDisabled == that.checkpointsDisabled &&
    isUnitTest == that.isUnitTest &&
    Objects.equals(checkpointDirectory, that.checkpointDirectory);
}
public StageMetrics createStageMetrics() {
  return new DefaultStageMetrics(metrics, stageSpec.getName());
}
@Override
public void run() {
  JavaPairRDD<Object, Object> sinkRDD = rdd.flatMapToPair(sinkFunction);
  sinkFactory.writeFromRDD(sinkRDD, sec, stageSpec.getName(), Object.class, Object.class);
}
};
private boolean shouldCache(PipelinePhase pipelinePhase, StageSpec stageSpec) {
  // cache this RDD if it has multiple outputs,
  // otherwise computation of each output may trigger recomputing this stage
  Set<String> outputs = pipelinePhase.getStageOutputs(stageSpec.getName());
  if (outputs.size() > 1) {
    return true;
  }

  // cache this stage if it is an input to a stage that has multiple inputs,
  // otherwise the union computation may trigger recomputing this stage
  for (String outputStageName : outputs) {
    if (pipelinePhase.getStageInputs(outputStageName).size() > 1) {
      return true;
    }
  }

  return false;
}
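// Illustrative sketch, under stated assumptions: how a driver loop might act on the
// shouldCache(...) decision when materializing a stage's output. computeStageOutput is a
// hypothetical helper; JavaRDD.persist and StorageLevel.MEMORY_AND_DISK are standard Spark APIs.
JavaRDD<Object> stageOutput = computeStageOutput(stageSpec);
if (shouldCache(pipelinePhase, stageSpec)) {
  // persist once so branching outputs or downstream unions do not recompute this stage
  stageOutput = stageOutput.persist(StorageLevel.MEMORY_AND_DISK());
}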
public SparkBatchSinkContext(SparkBatchSinkFactory sinkFactory, JavaSparkExecutionContext sec,
                             DatasetContext datasetContext, PipelineRuntime pipelineRuntime, StageSpec stageSpec) {
  super(pipelineRuntime, stageSpec, datasetContext, sec.getAdmin());
  this.sinkFactory = sinkFactory;
  this.isPreviewEnabled = sec.getDataTracer(stageSpec.getName()).isEnabled();
}
public SparkBatchSourceContext(SparkBatchSourceFactory sourceFactory, SparkClientContext sparkContext,
                               PipelineRuntime pipelineRuntime, DatasetContext datasetContext, StageSpec stageSpec) {
  super(pipelineRuntime, stageSpec, datasetContext, sparkContext.getAdmin());
  this.sparkContext = sparkContext;
  this.sourceFactory = sourceFactory;
  this.isPreviewEnabled = sparkContext.getDataTracer(stageSpec.getName()).isEnabled();
}
public SparkBatchSinkContext(SparkBatchSinkFactory sinkFactory, SparkClientContext sparkContext,
                             PipelineRuntime pipelineRuntime, DatasetContext datasetContext, StageSpec stageSpec) {
  super(pipelineRuntime, stageSpec, datasetContext, sparkContext.getAdmin());
  this.sinkFactory = sinkFactory;
  this.isPreviewEnabled = sparkContext.getDataTracer(stageSpec.getName()).isEnabled();
}
/**
 * Instantiates and initializes the plugin for the stage.
 *
 * @param stageInfo the stage info
 * @return the initialized Transformation
 * @throws InstantiationException if the plugin for the stage could not be instantiated
 * @throws Exception if there was a problem initializing the plugin
 */
private <T extends Transformation & StageLifecycle<BatchRuntimeContext>> Transformation
  getInitializedTransformation(StageSpec stageInfo) throws Exception {
  BatchRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
  T plugin = pluginInstantiator.newPluginInstance(stageInfo.getName(), macroEvaluator);
  plugin.initialize(runtimeContext);
  return plugin;
}
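// Hedged usage sketch: a caller would typically drive the initialized plugin record-by-record
// through Transformation.transform. The record and emitter values are hypothetical stand-ins
// supplied by the surrounding executor code; only getInitializedTransformation comes from above.
Transformation transformation = getInitializedTransformation(stageInfo);
transformation.transform(record, emitter);  // emits zero or more output records to the emitter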
@Override
public void run() {
  String stageName = stageSpec.getName();
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  SparkExecutionPluginContext sparkPluginContext =
    new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
  JavaRDD<T> countedRDD = rdd.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();
  try {
    sink.run(sparkPluginContext, countedRDD);
  } catch (Exception e) {
    // rethrow as an unchecked exception
    throw Throwables.propagate(e);
  }
}
};
@Override
protected SparkCollection<RecordInfo<Object>> getSource(StageSpec stageSpec, StageStatisticsCollector collector) {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  return new RDDCollection<>(
    sec, jsc, datasetContext, sinkFactory,
    sourceFactory.createRDD(sec, jsc, stageSpec.getName(), Object.class, Object.class)
      .flatMap(Compat.convert(new BatchSourceFunction(pluginFunctionContext, numOfRecordsPreview))));
}
public PluginFunctionContext(StageSpec stageSpec, JavaSparkExecutionContext sec, Map<String, String> arguments,
                             long logicalStartTime, StageStatisticsCollector collector) {
  this.namespace = sec.getNamespace();
  this.pipelineName = sec.getApplicationSpecification().getName();
  this.stageSpec = stageSpec;
  this.logicalStartTime = logicalStartTime;
  // use the arguments passed to this constructor instead of re-reading them from the execution context
  this.arguments = new BasicArguments(sec.getWorkflowToken(), arguments);
  this.pluginContext = sec.getPluginContext();
  this.serviceDiscoverer = sec.getServiceDiscoverer();
  this.metrics = sec.getMetrics();
  this.secureStore = sec.getSecureStore();
  this.dataTracer = sec.getDataTracer(stageSpec.getName());
  this.pipelinePluginContext = getPluginContext();
  this.collector = collector;
}
@Override
public SparkCollection<T> window(StageSpec stageSpec, Windower windower) {
  String stageName = stageSpec.getName();
  return wrap(stream.transform(new CountingTransformFunction<T>(stageName, sec.getMetrics(), "records.in", null))
    .window(Durations.seconds(windower.getWidth()), Durations.seconds(windower.getSlideInterval()))
    .transform(new CountingTransformFunction<T>(stageName, sec.getMetrics(), "records.out",
                                                sec.getDataTracer(stageName))));
}
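// Worked example of the mapping above, assuming a hypothetical Windower with width = 60 and
// slideInterval = 30 (both in seconds): the underlying Spark Streaming call becomes
//   stream.window(Durations.seconds(60), Durations.seconds(30))
// so every 30 seconds a new window covering the most recent 60 seconds of records is emitted,
// with "records.in" counted before windowing and "records.out" counted after.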
@Override
public JavaRDD<U> call(JavaRDD<T> data, Time batchTime) throws Exception {
  SparkExecutionPluginContext sparkPluginContext =
    new SparkStreamingExecutionContext(sec, JavaSparkContext.fromSparkContext(data.context()),
                                       batchTime.milliseconds(), stageSpec);
  String stageName = stageSpec.getName();
  data = data.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null));
  return compute.transform(sparkPluginContext, data)
    .map(new CountingFunction<U>(stageName, sec.getMetrics(), "records.out", sec.getDataTracer(stageName)));
}
}
@Override
public <U> SparkCollection<U> compute(StageSpec stageSpec, SparkCompute<T, U> compute) throws Exception {
  String stageName = stageSpec.getName();
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  SparkExecutionPluginContext sparkPluginContext =
    new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
  compute.initialize(sparkPluginContext);

  JavaRDD<T> countedInput =
    rdd.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();

  return wrap(compute.transform(sparkPluginContext, countedInput)
    .map(new CountingFunction<U>(stageName, sec.getMetrics(), "records.out", sec.getDataTracer(stageName))));
}