/**
 * Instantiates and initializes the plugin for the stage.
 *
 * @param stageInfo the stage info
 * @return the initialized Transformation
 * @throws InstantiationException if the plugin for the stage could not be instantiated
 * @throws Exception if there was a problem initializing the plugin
 */
private <T extends Transformation & StageLifecycle<BatchRuntimeContext>> Transformation
  getInitializedTransformation(StageSpec stageInfo) throws Exception {
  BatchRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
  T plugin = pluginInstantiator.newPluginInstance(stageInfo.getName(), macroEvaluator);
  plugin.initialize(runtimeContext);
  return plugin;
}
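// A minimal sketch, assuming hypothetical Doer/Initializable interfaces (not part of
// CDAP), of the intersection bound used above: declaring T extends A & B lets the
// method call through both interfaces on one variable, with no cast, while still
// returning it as plain A.
interface Doer {
  void doIt(Object input);
}

interface Initializable<C> {
  void initialize(C context);
}

class IntersectionBoundSketch {
  static <T extends Doer & Initializable<String>> Doer prepare(T plugin) {
    plugin.initialize("context"); // available through Initializable<String>
    return plugin;                // available through Doer
  }
}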
/**
 * Create a transform executor for the specified pipeline. Will instantiate and initialize all sources,
 * transforms, and sinks in the pipeline.
 *
 * @param pipeline the pipeline to create a transform executor for
 * @param outputWriter writes output records to the mapreduce context
 * @return executor for the pipeline
 * @throws InstantiationException if there was an error instantiating a plugin
 * @throws Exception if there was an error initializing a plugin
 */
public <KEY_OUT, VAL_OUT> PipeTransformExecutor<T> create(PipelinePhase pipeline,
                                                          OutputWriter<KEY_OUT, VAL_OUT> outputWriter)
  throws Exception {
  // populate the pipe stages in reverse topological order to ensure that an output is always created before its
  // input. this will allow us to set up all outputs for a stage when we get to it.
  List<String> traversalOrder = pipeline.getDag().getTopologicalOrder();
  Collections.reverse(traversalOrder);

  Map<String, PipeStage> pipeStages = new HashMap<>();
  for (String stageName : traversalOrder) {
    pipeStages.put(stageName, getPipeStage(pipeline, stageName, pipeStages, outputWriter));
  }

  // sourceStageName will be null in reducers, so need to handle that case
  Set<String> startingPoints = (sourceStageName == null) ? pipeline.getSources() : Sets.newHashSet(sourceStageName);
  return new PipeTransformExecutor<>(pipeStages, startingPoints);
}
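// Hypothetical, self-contained illustration (the stage names are made up) of why
// create() reverses the topological order: for source -> transform -> sink, building
// in [sink, transform, source] order means every stage's downstream PipeStage already
// exists in the map when the stage itself is constructed.
import java.util.*;

class ReverseTopoSketch {
  public static void main(String[] args) {
    List<String> topo = new ArrayList<>(Arrays.asList("source", "transform", "sink"));
    Collections.reverse(topo); // [sink, transform, source]

    Map<String, String> pipeStages = new LinkedHashMap<>();
    for (String stage : topo) {
      // any downstream stage this one writes to is already present in pipeStages
      pipeStages.put(stage, "PipeStage(" + stage + ")");
    }
    System.out.println(pipeStages.keySet()); // [sink, transform, source]
  }
}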
if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
  BatchAggregator<?, ?, ?> batchAggregator = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
  BatchRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
  batchAggregator.initialize(runtimeContext);
  if (isMapPhase) {
    return getTrackedEmitKeyStep(
      new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
      stageMetrics, taskContext.getDataTracer(stageName), collector);
  } else {
    return getTrackedAggregateStep(
      new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
      stageMetrics, taskContext.getDataTracer(stageName), collector);
  }
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
  BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
  BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
  batchJoiner.initialize(runtimeContext);
  if (isMapPhase) {
    return getTrackedEmitKeyStep(
      new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName),
      stageMetrics, taskContext.getDataTracer(stageName), collector);
  } else {
    return getTrackedMergeStep(
      new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName,
                                      runtimeContext.getInputSchemas().size()),
      stageMetrics, taskContext.getDataTracer(stageName), collector);
  }
}

Transformation transformation = getInitializedTransformation(stageSpec);
boolean isLimitingSource =
  taskContext.getDataTracer(stageName).isEnabled() && BatchSource.PLUGIN_TYPE.equals(pluginType) && isMapPhase;
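// A rough sketch with hypothetical types (not the CDAP BatchAggregator API) of the
// map/reduce split above: the map phase only assigns a group key to each record
// (getTrackedEmitKeyStep), and the reduce phase sees all records for one key and
// collapses them (getTrackedAggregateStep).
import java.util.Iterator;

class AggregatorPhasesSketch {
  // map phase: record -> group key
  static String emitKey(String csvRecord) {
    return csvRecord.split(",")[0];
  }

  // reduce phase: group key plus all grouped records -> one aggregated value
  static String aggregate(String key, Iterator<String> group) {
    int count = 0;
    while (group.hasNext()) {
      group.next();
      count++;
    }
    return key + ":" + count;
  }
}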
MapReduceTransformExecutorFactory<Object> transformExecutorFactory =
  new MapReduceTransformExecutorFactory<>(context, pluginInstantiator, metrics,
                                          new BasicArguments(context.getWorkflowToken(), runtimeArgs),
                                          sourceStage, phaseSpec.getNumOfRecordsPreview(),
                                          phaseSpec.pipelineContainsCondition());
this.transformExecutor = transformExecutorFactory.create(phase, outputWriter);
if (pipeline.getSinks().contains(stageName)) {
  if (Constants.Connector.PLUGIN_TYPE.equals(pluginType) || BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
    // connectors and joiners at the end of a pipeline consume the RecordInfo directly
    Transformation<RecordInfo<Object>, Object> sink = getTransformation(stageSpec);
    return new DirectOutputPipeStage<>(stageName, sink, new SinkEmitter<>(stageName, outputWriter));
  } else {
    // other sinks only see the value inside the RecordInfo
    return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec),
                                 new SinkEmitter<>(stageName, outputWriter));
  }
}

// non-sink stages emit to their downstream PipeStages through a PipeEmitter
// (its construction is elided in this excerpt)
if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
  return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(stageSpec), pipeEmitter);
} else {
  return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), pipeEmitter);
}
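// Hypothetical sketch (RecordInfoSketch is not the real RecordInfo class) of the
// distinction above: DirectOutputPipeStage hands the wrapper itself to connector and
// joiner sinks, which can also inspect the originating stage, while UnwrapPipeStage
// strips the wrapper so ordinary sinks only see the payload.
class RecordInfoSketch<T> {
  private final T value;
  private final String fromStage;

  RecordInfoSketch(T value, String fromStage) {
    this.value = value;
    this.fromStage = fromStage;
  }

  // what an UnwrapPipeStage-style consumer receives
  T unwrap() {
    return value;
  }

  // what a DirectOutputPipeStage-style consumer can additionally inspect
  String fromStage() {
    return fromStage;
  }
}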
private <IN, ERROR> TrackedMultiOutputTransform<IN, ERROR> getMultiOutputTransform(StageSpec stageSpec)
  throws Exception {
  String stageName = stageSpec.getName();
  DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(arguments, taskContext.getLogicalStartTime(),
                                                                   taskContext, taskContext.getNamespace());
  SplitterTransform<IN, ERROR> splitterTransform = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
  TransformContext transformContext = createRuntimeContext(stageSpec);
  splitterTransform.initialize(transformContext);

  StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
  TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
  StageStatisticsCollector collector = isPipelineContainsCondition
    ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext)
    : new NoopStageStatisticsCollector();
  return new TrackedMultiOutputTransform<>(splitterTransform, stageMetrics,
                                           taskContext.getDataTracer(stageName), collector);
}
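// A minimal sketch with made-up port names of what a splitter-style transform does:
// each input record goes to exactly one named output port, which is why it is wrapped
// in TrackedMultiOutputTransform rather than a single-output Transformation.
import java.util.function.BiConsumer;

class SplitterSketch {
  static void split(int record, BiConsumer<String, Integer> portEmitter) {
    // route by a simple predicate; a real splitter plugin chooses ports per its own logic
    portEmitter.accept(record % 2 == 0 ? "evens" : "odds", record);
  }
}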