/**
 * Builds a {@link CoGbkResultSchema} with {@code size} integer-typed tuple tags named
 * {@code "tag0"} through {@code "tag{size-1}"}.
 */
private CoGbkResultSchema createSchema(int size) {
  List<TupleTag<?>> tagList = new ArrayList<>(size);
  int index = 0;
  while (index < size) {
    tagList.add(new TupleTag<Integer>("tag" + index));
    index++;
  }
  return new CoGbkResultSchema(TupleTagList.of(tagList));
}
@Override
public PCollectionTuple expand(PCollection<? extends KeyedWorkItem<K, KV<K, InputT>>> input) {
  // Creates the runner-primitive output tuple directly: main output tag first, then any
  // additional output tags, inheriting the input's windowing strategy and boundedness.
  PCollectionTuple outputs =
      PCollectionTuple.ofPrimitiveOutputsInternal(
          input.getPipeline(),
          TupleTagList.of(getMainOutputTag()).and(getAdditionalOutputTags().getAll()),
          // TODO
          // NOTE(review): no output coders are registered here (empty map), unlike the
          // similar createPrimitiveOutputFor helper elsewhere — confirm this is intentional.
          Collections.emptyMap(),
          input.getWindowingStrategy(),
          input.isBounded());
  return outputs;
}
}
/**
 * Returns the additional (non-main) output tags of the ParDo-like transform inside {@code
 * application}.
 *
 * <p>Takes a fast path when the transform is a {@link ParDo.MultiOutput}; otherwise it
 * round-trips the transform through its portable proto form and derives the additional tags
 * as "all declared outputs minus the main output".
 *
 * @throws IOException if the proto payload cannot be parsed
 */
public static TupleTagList getAdditionalOutputTags(AppliedPTransform<?, ?, ?> application)
    throws IOException {
  PTransform<?, ?> transform = application.getTransform();
  // Fast path: the Java SDK transform already knows its additional output tags.
  if (transform instanceof ParDo.MultiOutput) {
    return ((ParDo.MultiOutput<?, ?>) transform).getAdditionalOutputTags();
  }
  // Slow path: translate to the portable proto and inspect the ParDo payload.
  RunnerApi.PTransform protoTransform =
      PTransformTranslation.toProto(
          application, SdkComponents.create(application.getPipeline().getOptions()));
  ParDoPayload payload = ParDoPayload.parseFrom(protoTransform.getSpec().getPayload());
  TupleTag<?> mainOutputTag = getMainOutputTag(payload);
  // Additional outputs are every declared output except the main one.
  Set<String> outputTags =
      Sets.difference(
          protoTransform.getOutputsMap().keySet(),
          Collections.singleton(mainOutputTag.getId()));
  ArrayList<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (String outputTag : outputTags) {
    additionalOutputTags.add(new TupleTag<>(outputTag));
  }
  return TupleTagList.of(additionalOutputTags);
}
@Override
public PCollectionTuple expand(PCollection<FailsafeElement<T, String>> failsafeElements) {
  // Converts each element's JSON payload to a TableRow. Successful conversions are emitted
  // on successTag(); failures carry the original element plus diagnostics on failureTag().
  return failsafeElements.apply(
      "JsonToTableRow",
      ParDo.of(
              new DoFn<FailsafeElement<T, String>, TableRow>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  FailsafeElement<T, String> element = context.element();
                  String json = element.getPayload();
                  try {
                    TableRow row = convertJsonToTableRow(json);
                    context.output(row);
                  } catch (Exception e) {
                    // Route the failing element to the dead-letter output, annotated
                    // with the error message and full stack trace for debugging.
                    context.output(
                        failureTag(),
                        FailsafeElement.of(element)
                            .setErrorMessage(e.getMessage())
                            .setStacktrace(Throwables.getStackTraceAsString(e)));
                  }
                }
              })
          .withOutputTags(successTag(), TupleTagList.of(failureTag())));
}
}
/**
 * Creates the primitive {@link PCollectionTuple} output for a ParDo-style primitive.
 *
 * <p>The main output tag is listed first, followed by the additional tags. Boundedness is
 * the combination of the input's boundedness and the DoFn's per-element boundedness from
 * its signature.
 *
 * @param input the input PCollection (source of pipeline, boundedness)
 * @param fn the DoFn whose output type descriptor is applied to the main output
 * @param mainOutputTag tag of the main output collection
 * @param additionalOutputTags tags of any additional output collections
 * @param outputTagsToCoders coders to register for each output tag
 * @param windowingStrategy windowing strategy applied to all outputs
 * @return the assembled output tuple
 */
public static <OutputT> PCollectionTuple createPrimitiveOutputFor(
    PCollection<?> input,
    DoFn<?, OutputT> fn,
    TupleTag<OutputT> mainOutputTag,
    TupleTagList additionalOutputTags,
    Map<TupleTag<?>, Coder<?>> outputTagsToCoders,
    WindowingStrategy<?, ?> windowingStrategy) {
  DoFnSignature signature = DoFnSignatures.getSignature(fn.getClass());
  PCollectionTuple outputs =
      PCollectionTuple.ofPrimitiveOutputsInternal(
          input.getPipeline(),
          TupleTagList.of(mainOutputTag).and(additionalOutputTags.getAll()),
          outputTagsToCoders,
          windowingStrategy,
          input.isBounded().and(signature.isBoundedPerElement()));
  // Set output type descriptor similarly to how ParDo.MultiOutput does it.
  outputs.get(mainOutputTag).setTypeDescriptor(fn.getOutputTypeDescriptor());
  return outputs;
}
/** Builds the message-parsing ParDo, wiring up a DLQ output tag only when one is configured. */
private ParDo.MultiOutput<PubsubMessage, Row> createParserParDo() {
  TupleTagList additionalTags;
  if (useDlq()) {
    additionalTags = TupleTagList.of(DLQ_TAG);
  } else {
    additionalTags = TupleTagList.empty();
  }
  return ParDo.of(
          PubsubMessageToRow.builder()
              .messageSchema(getSchema())
              .useDlq(getDeadLetterQueue() != null)
              .build())
      .withOutputTags(MAIN_TAG, additionalTags);
}
/** Applies an identity DoFn over a tiny fixed input, declaring both given output tags. */
private PCollectionTuple buildPCollectionTupleWithTags(
    TupleTag<Integer> mainOutputTag, TupleTag<Integer> additionalOutputTag) {
  return p.apply(Create.of(1, 2, 3))
      .apply(
          ParDo.of(new IdentityDoFn())
              .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
}
/** Verifies an SDF emits "main:N" on the main tag and "additional:N" on the extra tag. */
private void testAdditionalOutput(IsBounded bounded) {
  TupleTag<String> main = new TupleTag<String>("main") {};
  TupleTag<String> additional = new TupleTag<String>("additional") {};
  PCollectionTuple outputs =
      p.apply("input", Create.of(0, 1, 2))
          .apply(
              ParDo.of(sdfWithAdditionalOutput(bounded, additional))
                  .withOutputTags(main, TupleTagList.of(additional)));
  PAssert.that(outputs.get(main))
      .containsInAnyOrder(Arrays.asList("main:0", "main:1", "main:2"));
  PAssert.that(outputs.get(additional))
      .containsInAnyOrder(Arrays.asList("additional:0", "additional:1", "additional:2"));
  p.run();
}
/**
 * Builds an {@link AppliedPTransform} for a multi-input (side input + main input),
 * multi-output ParDo, for use as a translation-test fixture.
 */
private static AppliedPTransform<?, ?, ?> multiMultiParDo(Pipeline pipeline) {
  // Side input consumed by the DoFn.
  PCollectionView<String> view = pipeline.apply(Create.of("foo")).apply(View.asSingleton());
  PCollection<Long> input = pipeline.apply(GenerateSequence.from(0));
  ParDo.MultiOutput<Long, KV<Long, String>> parDo =
      ParDo.of(new TestDoFn())
          .withSideInputs(view)
          .withOutputTags(
              new TupleTag<KV<Long, String>>() {},
              TupleTagList.of(new TupleTag<KV<String, Long>>() {}));
  PCollectionTuple output = input.apply(parDo);
  // The applied transform's inputs are the main input expansion plus the side-input
  // additional inputs.
  Map<TupleTag<?>, PValue> inputs = new HashMap<>();
  inputs.putAll(parDo.getAdditionalInputs());
  inputs.putAll(input.expand());
  return AppliedPTransform
      .<PCollection<Long>, PCollectionTuple, ParDo.MultiOutput<Long, KV<Long, String>>>of(
          "MultiParDoInAndOut", inputs, output.expand(), parDo, pipeline);
}
}
@Test
@Category(NeedsRunner.class)
public void testTaggedOutputUnknownCoder() throws Exception {
  // The tagged output's element type has no registered coder, so running the pipeline
  // must fail coder inference with an IllegalStateException.
  final TupleTag<Integer> mainOutputTag = new TupleTag<>("main");
  final TupleTag<TestDummy> additionalOutputTag = new TupleTag<>("unknownSide");
  PCollection<Integer> numbers = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));
  numbers.apply(
      ParDo.of(new TaggedOutputDummyFn(mainOutputTag, additionalOutputTag))
          .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("Unable to return a default Coder");
  pipeline.run();
}
/**
 * @return new PFeatureRows, which has any tagged errors and retries in DoFn added to the errors
 *     and retries gathered so far.
 */
public PFeatureRows applyDoFn(String name, BaseFeatureDoFn doFn) {
  // Tag the DoFn with the step name and declare both the main and errors outputs.
  MultiOutput<FeatureRowExtended, FeatureRowExtended> transform =
      ParDo.of(doFn.withTransformName(name))
          .withOutputTags(MAIN_TAG, TupleTagList.of(ERRORS_TAG));
  PCollectionTuple transformed = Pipeline.applyTransform(name, main, transform);
  PCollection<FeatureRowExtended> outMain =
      transformed.get(MAIN_TAG).setCoder(ProtoCoder.of(FeatureRowExtended.class));
  // Newly tagged errors are flattened together with the errors accumulated so far.
  PCollection<FeatureRowExtended> outErrors =
      PCollectionList.of(
              transformed.get(ERRORS_TAG).setCoder(ProtoCoder.of(FeatureRowExtended.class)))
          .and(errors)
          .apply(name + "/Flatten errors", Flatten.pCollections())
          .setCoder(ProtoCoder.of(FeatureRowExtended.class));
  return new PFeatureRows(outMain, outErrors);
}
@Test
@Category(NeedsRunner.class)
public void testMainOutputUnregisteredExplicitCoder() {
  // The MAIN output type has no registered coder; setting one explicitly on the
  // resulting PCollection must allow the pipeline to run.
  final TupleTag<TestDummy> mainOutputTag = new TupleTag<>("unregisteredMain");
  final TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("additionalOutput") {};
  PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));
  PCollectionTuple outputTuple =
      input.apply(
          ParDo.of(new MainOutputDummyFn(mainOutputTag, additionalOutputTag))
              .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
  outputTuple.get(mainOutputTag).setCoder(new TestDummyCoder());
  pipeline.run();
}
@Test
public void testMultiOutputOverrideNonCrashing() throws Exception {
  // Replacing transforms on a multi-output stateful ParDo must not crash, and the
  // original DoFn must still be discoverable inside the batch-stateful override.
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> primaryTag = new TupleTag<Integer>() {};
  TupleTag<Integer> secondaryTag = new TupleTag<Integer>() {};
  DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
  pipeline
      .apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(statefulFn).withOutputTags(primaryTag, TupleTagList.of(secondaryTag)));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  assertThat(findBatchStatefulDoFn(pipeline), equalTo((DoFn) statefulFn));
}
@Test
@Category(ValidatesRunner.class)
public void testParDoWithOnlyTaggedOutput() {
  List<Integer> inputs = Arrays.asList(3, -42, 666);
  final TupleTag<Void> mainOutputTag = new TupleTag<Void>("main") {};
  final TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("additional") {};
  // This DoFn never emits to the main output: everything goes to the additional tag,
  // so the main output must end up empty.
  DoFn<Integer, Void> tagOnlyFn =
      new DoFn<Integer, Void>() {
        @ProcessElement
        public void processElement(@Element Integer element, MultiOutputReceiver r) {
          r.get(additionalOutputTag).output(element);
        }
      };
  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(inputs))
          .apply(
              ParDo.of(tagOnlyFn)
                  .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
  PAssert.that(outputs.get(mainOutputTag)).empty();
  PAssert.that(outputs.get(additionalOutputTag)).containsInAnyOrder(inputs);
  pipeline.run();
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Pass elements through an identity ParDo with a second, never-populated output.
  // The empty "cleanup signal" output is turned into a side-input view so that the
  // cleanup ParDo below only runs after the main data has been fully processed.
  TupleTag<T> mainOutput = new TupleTag<>();
  TupleTag<Void> cleanupSignal = new TupleTag<>();
  PCollectionTuple outputs =
      input.apply(
          ParDo.of(new IdentityFn<T>())
              .withOutputTags(mainOutput, TupleTagList.of(cleanupSignal)));
  PCollectionView<Iterable<Void>> cleanupSignalView =
      outputs.get(cleanupSignal).setCoder(VoidCoder.of()).apply(View.asIterable());
  // A single-element collection drives exactly one invocation of the cleanup operation;
  // depending on cleanupSignalView sequences it after the main processing.
  input
      .getPipeline()
      .apply("Create(CleanupOperation)", Create.of(cleanupOperation))
      .apply(
          "Cleanup",
          ParDo.of(
                  new DoFn<CleanupOperation, Void>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.element().cleanup(new ContextContainer(c, jobIdSideInput));
                    }
                  })
              .withSideInputs(jobIdSideInput, cleanupSignalView));
  return outputs.get(mainOutput).setCoder(input.getCoder());
}
@Test
@Ignore(
    "TODO: BEAM-2902 Add support for user state in a ParDo.Multi once PTransformMatcher "
        + "exposes a way to know when the replacement is not required by checking that the "
        + "preceding ParDos to a GBK are key preserving.")
public void testFnApiMultiOutputOverrideNonCrashing() throws Exception {
  // Same as the non-FnAPI variant, but with the beam_fn_api experiment enabled.
  DataflowPipelineOptions options = buildPipelineOptions("--experiments=beam_fn_api");
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> primaryTag = new TupleTag<Integer>() {};
  TupleTag<Integer> secondaryTag = new TupleTag<Integer>() {};
  DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
  pipeline
      .apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(statefulFn).withOutputTags(primaryTag, TupleTagList.of(secondaryTag)));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  assertThat(findBatchStatefulDoFn(pipeline), equalTo((DoFn) statefulFn));
}
@Test
@Category(ValidatesRunner.class)
public void testComposePCollectionTuple() {
  pipeline.enableAbandonedNodeEnforcement(true);
  List<Integer> inputs = Arrays.asList(3, -42, 666);
  TupleTag<Integer> mainOutputTag = new TupleTag<Integer>("main") {};
  TupleTag<Integer> emptyOutputTag = new TupleTag<Integer>("empty") {};
  final TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("extra") {};
  PCollection<Integer> mainInput = pipeline.apply(Create.of(inputs));
  // The DoFn only emits on additionalOutputTag, so the declared main output
  // (emptyOutputTag) stays empty.
  PCollectionTuple outputs =
      mainInput.apply(
          ParDo.of(
                  new DoFn<Integer, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      c.output(additionalOutputTag, c.element());
                    }
                  })
              .withOutputTags(emptyOutputTag, TupleTagList.of(additionalOutputTag)));
  assertNotNull("outputs.getPipeline()", outputs.getPipeline());
  // Compose a new tuple that additionally maps mainOutputTag to the raw input collection;
  // this exercises PCollectionTuple.and() with an unrelated PCollection.
  outputs = outputs.and(mainOutputTag, mainInput);
  PAssert.that(outputs.get(mainOutputTag)).containsInAnyOrder(inputs);
  PAssert.that(outputs.get(additionalOutputTag)).containsInAnyOrder(inputs);
  PAssert.that(outputs.get(emptyOutputTag)).empty();
  pipeline.run();
}
@Test
public void testTaggedOutputUnregisteredExplicitCoder() throws Exception {
  pipeline.enableAbandonedNodeEnforcement(false);
  PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));
  final TupleTag<Integer> mainOutputTag = new TupleTag<>("main");
  final TupleTag<TestDummy> additionalOutputTag = new TupleTag<>("unregisteredSide");
  ParDo.MultiOutput<Integer, Integer> pardo =
      ParDo.of(new TaggedOutputDummyFn(mainOutputTag, additionalOutputTag))
          .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag));
  PCollectionTuple outputTuple = input.apply(pardo);
  // The tagged output's type has no registered coder; set one explicitly before use.
  outputTuple.get(additionalOutputTag).setCoder(new TestDummyCoder());
  outputTuple.get(additionalOutputTag).apply(View.asSingleton());
  assertEquals(new TestDummyCoder(), outputTuple.get(additionalOutputTag).getCoder());
  // Finishing specification must neither crash nor overwrite the explicit coder.
  outputTuple
      .get(additionalOutputTag)
      .finishSpecifyingOutput("ParDo", input, pardo); // Check for crashes
  assertEquals(
      new TestDummyCoder(), outputTuple.get(additionalOutputTag).getCoder()); // Check for corruption
}
@Test
@Category(ValidatesRunner.class)
public void testParDoWithEmptyTaggedOutput() {
  // An empty input plus a DoFn that never outputs: every declared output stays empty.
  TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  TupleTag<String> firstAdditionalTag = new TupleTag<String>("additional1") {};
  TupleTag<String> secondAdditionalTag = new TupleTag<String>("additional2") {};
  TupleTagList additionalTags = TupleTagList.of(firstAdditionalTag).and(secondAdditionalTag);
  PCollectionTuple outputs =
      pipeline
          .apply(Create.empty(VarIntCoder.of()))
          .apply(ParDo.of(new TestNoOutputDoFn()).withOutputTags(mainOutputTag, additionalTags));
  PAssert.that(outputs.get(mainOutputTag)).empty();
  PAssert.that(outputs.get(firstAdditionalTag)).empty();
  PAssert.that(outputs.get(secondAdditionalTag)).empty();
  pipeline.run();
}
/**
 * Parameterized-test fixtures: ParDo.MultiOutput instances covering combinations of
 * empty vs. populated additional output tags, side inputs, splittable DoFns, and
 * state/timer DoFns.
 */
@Parameters(name = "{index}: {0}")
public static Iterable<ParDo.MultiOutput<?, ?>> data() {
  return ImmutableList.of(
      // No additional outputs, no side inputs.
      ParDo.of(new DropElementsFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty()),
      // Side inputs only.
      ParDo.of(new DropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty())
          .withSideInputs(singletonSideInput, multimapSideInput),
      // Additional outputs and side inputs.
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {}))
          .withSideInputs(singletonSideInput, multimapSideInput),
      // Additional outputs only.
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {})),
      // Splittable DoFn.
      ParDo.of(new SplittableDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty()),
      // DoFn using state and timers.
      ParDo.of(new StateTimerDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty()));
}