/**
 * Returns a {@link PTransform} like the input {@link PTransform}, but with each output whose
 * {@link PCollectionNode} id is a key in {@code originalToPartial} replaced by an output (with
 * the same local name) to the corresponding partial {@link PCollectionNode}. Outputs not present
 * in the map are left untouched.
 */
private static PTransform updateOutputs(
    PTransform transform, Map<String, PCollectionNode> originalToPartial) {
  PTransform.Builder updatedTransformBuilder = transform.toBuilder();
  // Keys are local output names, values are PCollection ids; only remap the listed ids.
  for (Map.Entry<String, String> output : transform.getOutputsMap().entrySet()) {
    if (originalToPartial.containsKey(output.getValue())) {
      updatedTransformBuilder.putOutputs(
          output.getKey(), originalToPartial.get(output.getValue()).getId());
    }
  }
  return updatedTransformBuilder.build();
}
/**
 * Translates the streaming-impulse transform into a Flink source emitting windowed byte arrays.
 *
 * <p>The emission interval and total message count are read from the transform's JSON payload:
 * {@code interval_ms} (default 100) and {@code message_count} (default 0).
 */
private void translateStreamingImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);

  TypeInformation<WindowedValue<byte[]>> outputTypeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));

  int intervalMillis;
  int messageCount;
  try {
    JsonNode config =
        new ObjectMapper().readTree(pTransform.getSpec().getPayload().toByteArray());
    intervalMillis = config.path("interval_ms").asInt(100);
    messageCount = config.path("message_count").asInt(0);
  } catch (IOException e) {
    throw new RuntimeException("Failed to parse configuration for streaming impulse", e);
  }

  SingleOutputStreamOperator<WindowedValue<byte[]>> source =
      context
          .getExecutionEnvironment()
          .addSource(
              new StreamingImpulseSource(intervalMillis, messageCount),
              StreamingImpulseSource.class.getSimpleName())
          .returns(outputTypeInfo);

  context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
/**
 * Translates the streaming-impulse transform into a Flink source emitting windowed byte arrays.
 *
 * <p>The emission interval and total message count are read from the transform's JSON payload:
 * {@code interval_ms} (default 100) and {@code message_count} (default 0).
 */
private void translateStreamingImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);

  TypeInformation<WindowedValue<byte[]>> typeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));

  ObjectMapper objectMapper = new ObjectMapper();
  final int intervalMillis;
  final int messageCount;
  try {
    JsonNode config = objectMapper.readTree(pTransform.getSpec().getPayload().toByteArray());
    intervalMillis = config.path("interval_ms").asInt(100);
    messageCount = config.path("message_count").asInt(0);
  } catch (IOException e) {
    throw new RuntimeException("Failed to parse configuration for streaming impulse", e);
  }

  SingleOutputStreamOperator<WindowedValue<byte[]>> source =
      context
          .getExecutionEnvironment()
          // Name the source operator so it is identifiable in the Flink UI and logs,
          // matching the other streaming-impulse translator in this codebase.
          .addSource(
              new StreamingImpulseSource(intervalMillis, messageCount),
              StreamingImpulseSource.class.getSimpleName())
          .returns(typeInfo);

  context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
for (PCollectionNode parDoOutputNode : parDoOutput) { assertThat(graph.getProducer(parDoOutputNode), equalTo(parDoNode)); assertThat(parDoNode.getTransform().getOutputsMap(), hasValue(parDoOutputNode.getId()));
Iterables.getOnlyElement(transform.getTransform().getInputsMap().values()); String outputCollectionId = Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()); PCollectionNode collectionNode = PipelineNode.pCollection(
Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()), result);
private <T> void translateAssignWindows( String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) { RunnerApi.Components components = pipeline.getComponents(); RunnerApi.PTransform transform = components.getTransformsOrThrow(id); RunnerApi.WindowIntoPayload payload; try { payload = RunnerApi.WindowIntoPayload.parseFrom(transform.getSpec().getPayload()); } catch (InvalidProtocolBufferException e) { throw new IllegalArgumentException(e); } //TODO: https://issues.apache.org/jira/browse/BEAM-4296 // This only works for well known window fns, we should defer this execution to the SDK // if the WindowFn can't be parsed or just defer it all the time. WindowFn<T, ? extends BoundedWindow> windowFn = (WindowFn<T, ? extends BoundedWindow>) WindowingStrategyTranslation.windowFnFromProto(payload.getWindowFn()); String inputCollectionId = Iterables.getOnlyElement(transform.getInputsMap().values()); String outputCollectionId = Iterables.getOnlyElement(transform.getOutputsMap().values()); Coder<WindowedValue<T>> outputCoder = instantiateCoder(outputCollectionId, components); TypeInformation<WindowedValue<T>> resultTypeInfo = new CoderTypeInformation<>(outputCoder); DataStream<WindowedValue<T>> inputDataStream = context.getDataStreamOrThrow(inputCollectionId); FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction = new FlinkAssignWindows<>(windowFn); DataStream<WindowedValue<T>> resultDataStream = inputDataStream .flatMap(assignWindowsFunction) .name(transform.getUniqueName()) .returns(resultTypeInfo); context.addDataStream(outputCollectionId, resultDataStream); }
private <T> void translateAssignWindows( String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) { RunnerApi.Components components = pipeline.getComponents(); RunnerApi.PTransform transform = components.getTransformsOrThrow(id); RunnerApi.WindowIntoPayload payload; try { payload = RunnerApi.WindowIntoPayload.parseFrom(transform.getSpec().getPayload()); } catch (InvalidProtocolBufferException e) { throw new IllegalArgumentException(e); } //TODO: https://issues.apache.org/jira/browse/BEAM-4296 // This only works for well known window fns, we should defer this execution to the SDK // if the WindowFn can't be parsed or just defer it all the time. WindowFn<T, ? extends BoundedWindow> windowFn = (WindowFn<T, ? extends BoundedWindow>) WindowingStrategyTranslation.windowFnFromProto(payload.getWindowFn()); String inputCollectionId = Iterables.getOnlyElement(transform.getInputsMap().values()); String outputCollectionId = Iterables.getOnlyElement(transform.getOutputsMap().values()); Coder<WindowedValue<T>> outputCoder = instantiateCoder(outputCollectionId, components); TypeInformation<WindowedValue<T>> resultTypeInfo = new CoderTypeInformation<>(outputCoder); DataStream<WindowedValue<T>> inputDataStream = context.getDataStreamOrThrow(inputCollectionId); FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction = new FlinkAssignWindows<>(windowFn); DataStream<WindowedValue<T>> resultDataStream = inputDataStream .flatMap(assignWindowsFunction) .name(transform.getUniqueName()) .returns(resultTypeInfo); context.addDataStream(outputCollectionId, resultDataStream); }
/**
 * Update all composites present in the {@code originalPipeline} with an URN equal to the provided
 * {@code urn} using the provided {@link TransformReplacement}.
 *
 * <p>Each replacement must preserve the replaced composite's outputs so that downstream
 * consumers of those PCollections remain valid; this is enforced below.
 */
public static Pipeline updateTransform(
    String urn, Pipeline originalPipeline, TransformReplacement compositeBuilder) {
  Components.Builder resultComponents = originalPipeline.getComponents().toBuilder();
  for (Map.Entry<String, PTransform> pt :
      originalPipeline.getComponents().getTransformsMap().entrySet()) {
    // NOTE(review): proto3-generated Java getters never return null (they return the default
    // instance), so `getSpec() != null` is always true; `hasSpec()` may be intended — confirm.
    if (pt.getValue().getSpec() != null && urn.equals(pt.getValue().getSpec().getUrn())) {
      MessageWithComponents updated =
          compositeBuilder.getReplacement(pt.getKey(), originalPipeline.getComponents());
      checkArgument(
          updated.getPtransform().getOutputsMap().equals(pt.getValue().getOutputsMap()),
          "A %s must produce all of the outputs of the original %s",
          TransformReplacement.class.getSimpleName(),
          PTransform.class.getSimpleName());
      // Remove the original expansion first, then merge in the replacement's components and
      // re-register the composite under its original id.
      removeSubtransforms(pt.getValue(), resultComponents);
      resultComponents
          .mergeFrom(updated.getComponents())
          .putTransforms(pt.getKey(), updated.getPtransform());
    }
  }
  return originalPipeline.toBuilder().setComponents(resultComponents).build();
}
/**
 * Asserts that the root transforms of {@code fusedProto} appear in topological order: every
 * PCollection a root consumes was produced by an earlier root, and no root's output was already
 * consumed by an earlier root.
 */
private void assertRootsInTopologicalOrder(RunnerApi.Pipeline fusedProto) {
  // Running sets of PCollection ids consumed/produced by roots seen so far.
  Set<String> consumedPCollections = new HashSet<>();
  Set<String> producedPCollections = new HashSet<>();
  for (int i = 0; i < fusedProto.getRootTransformIdsCount(); i++) {
    PTransform rootTransform =
        fusedProto.getComponents().getTransformsOrThrow(fusedProto.getRootTransformIds(i));
    // Every input of this root must already be in the produced set.
    assertThat(
        String.format(
            "All %s consumed by %s must be produced before it",
            PCollection.class.getSimpleName(), fusedProto.getRootTransformIds(i)),
        producedPCollections,
        hasItems(rootTransform.getInputsMap().values().toArray(new String[0])));
    // No output of this root may have been consumed by an earlier root.
    for (String consumed : consumedPCollections) {
      assertThat(
          String.format(
              "%s %s was consumed before all of its producers produced it",
              PCollection.class.getSimpleName(), consumed),
          rootTransform.getOutputsMap().values(),
          not(hasItem(consumed)));
    }
    consumedPCollections.addAll(rootTransform.getInputsMap().values());
    producedPCollections.addAll(rootTransform.getOutputsMap().values());
  }
}
}
/** * Tests that {@link QueryablePipeline#getPerElementConsumers(PCollectionNode)} returns a * transform that consumes the node more than once. */ @Test public void perElementConsumersWithConsumingMultipleTimes() { Pipeline p = Pipeline.create(); PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L))); PCollectionList.of(longs).and(longs).and(longs).apply("flatten", Flatten.pCollections()); Components components = PipelineTranslation.toProto(p).getComponents(); // This breaks if the way that IDs are assigned to PTransforms changes in PipelineTranslation String readOutput = getOnlyElement(components.getTransformsOrThrow("BoundedRead").getOutputsMap().values()); QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components); Set<PTransformNode> consumers = qp.getPerElementConsumers( PipelineNode.pCollection(readOutput, components.getPcollectionsOrThrow(readOutput))); assertThat(consumers.size(), equalTo(1)); assertThat( getOnlyElement(consumers).getTransform().getSpec().getUrn(), equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN)); }
/**
 * Returns the additional (non-main) output tags of a {@link ParDo} application.
 *
 * <p>A Java {@link ParDo.MultiOutput} exposes its tags directly; otherwise the transform is
 * converted to proto form and the tags are recovered from its outputs map, excluding the main
 * output tag from the {@link ParDoPayload}.
 *
 * @throws IOException if the transform cannot be converted to its proto representation
 */
public static TupleTagList getAdditionalOutputTags(AppliedPTransform<?, ?, ?> application)
    throws IOException {
  PTransform<?, ?> transform = application.getTransform();
  if (transform instanceof ParDo.MultiOutput) {
    return ((ParDo.MultiOutput<?, ?>) transform).getAdditionalOutputTags();
  }

  RunnerApi.PTransform protoTransform =
      PTransformTranslation.toProto(
          application, SdkComponents.create(application.getPipeline().getOptions()));
  ParDoPayload payload = ParDoPayload.parseFrom(protoTransform.getSpec().getPayload());
  String mainOutputId = getMainOutputTag(payload).getId();

  ArrayList<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (String outputTag : protoTransform.getOutputsMap().keySet()) {
    if (!outputTag.equals(mainOutputId)) {
      additionalOutputTags.add(new TupleTag<>(outputTag));
    }
  }
  return TupleTagList.of(additionalOutputTags);
}
/**
 * Translates the Impulse primitive for batch execution using {@link ImpulseInputFormat} as the
 * data source, typed as windowed byte arrays in the global window.
 */
private static void translateImpulse(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  TypeInformation<WindowedValue<byte[]>> typeInformation =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));
  DataSource<WindowedValue<byte[]>> dataSource =
      new DataSource<>(
              context.getExecutionEnvironment(),
              new ImpulseInputFormat(),
              typeInformation,
              transform.getTransform().getUniqueName())
          // Name the operator so it is identifiable in the Flink UI and logs, consistent with
          // the other impulse translators in this codebase.
          .name("Impulse");
  context.addDataSet(
      Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()), dataSource);
}
/**
 * Translates the Impulse primitive for batch execution using {@link ImpulseInputFormat} as the
 * data source, typed as windowed byte arrays in the global window.
 */
private static void translateImpulse(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  TypeInformation<WindowedValue<byte[]>> outputTypeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));
  DataSource<WindowedValue<byte[]>> impulseSource =
      new DataSource<>(
              context.getExecutionEnvironment(),
              new ImpulseInputFormat(),
              outputTypeInfo,
              transform.getTransform().getUniqueName())
          .name("Impulse");
  String outputId = Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values());
  context.addDataSet(outputId, impulseSource);
}
/**
 * Translates the Impulse primitive for streaming execution using {@link ImpulseSourceFunction},
 * typed as windowed byte arrays in the global window.
 */
private void translateImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);

  TypeInformation<WindowedValue<byte[]>> outputTypeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));

  // The source stays alive unless the pipeline options request shutdown on the final watermark.
  boolean keepSourceAlive = !context.getPipelineOptions().isShutdownSourcesOnFinalWatermark();
  SingleOutputStreamOperator<WindowedValue<byte[]>> impulse =
      context
          .getExecutionEnvironment()
          .addSource(new ImpulseSourceFunction(keepSourceAlive), "Impulse")
          .returns(outputTypeInfo);

  String outputId = Iterables.getOnlyElement(pTransform.getOutputsMap().values());
  context.addDataStream(outputId, impulse);
}
/**
 * Translates the Impulse primitive for streaming execution using {@link ImpulseSourceFunction},
 * typed as windowed byte arrays in the global window.
 */
private void translateImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);
  TypeInformation<WindowedValue<byte[]>> typeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));
  // The source stays alive unless the pipeline options request shutdown on the final watermark.
  boolean keepSourceAlive = !context.getPipelineOptions().isShutdownSourcesOnFinalWatermark();
  SingleOutputStreamOperator<WindowedValue<byte[]>> source =
      context
          .getExecutionEnvironment()
          // Name the source operator so it is identifiable in the Flink UI and logs,
          // consistent with the other impulse translators in this codebase.
          .addSource(new ImpulseSourceFunction(keepSourceAlive), "Impulse")
          .returns(typeInfo);
  context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
/**
 * Translates Reshuffle for batch execution by forwarding the input dataset through Flink's
 * {@code rebalance()}, redistributing elements across parallel instances.
 */
private static <K, V> void translateReshuffle(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  String inputId =
      Iterables.getOnlyElement(transform.getTransform().getInputsMap().values());
  String outputId =
      Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values());
  DataSet<WindowedValue<KV<K, V>>> input = context.getDataSetOrThrow(inputId);
  context.addDataSet(outputId, input.rebalance());
}
/**
 * Translates Reshuffle for streaming execution by forwarding the input stream through Flink's
 * {@code rebalance()}, redistributing elements across parallel instances.
 */
private <K, V> void translateReshuffle(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
  String inputId = Iterables.getOnlyElement(transform.getInputsMap().values());
  String outputId = Iterables.getOnlyElement(transform.getOutputsMap().values());
  DataStream<WindowedValue<KV<K, V>>> input = context.getDataStreamOrThrow(inputId);
  context.addDataStream(outputId, input.rebalance());
}
/**
 * Translates Reshuffle for batch execution: the input dataset is passed through unchanged except
 * for a {@code rebalance()} to spread elements over the available parallelism.
 */
private static <K, V> void translateReshuffle(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  RunnerApi.PTransform proto = transform.getTransform();
  DataSet<WindowedValue<KV<K, V>>> reshuffled =
      context
          .getDataSetOrThrow(Iterables.getOnlyElement(proto.getInputsMap().values()))
          .rebalance();
  context.addDataSet(Iterables.getOnlyElement(proto.getOutputsMap().values()), reshuffled);
}
/**
 * Translates Reshuffle for streaming execution: the input stream is passed through unchanged
 * except for a {@code rebalance()} to spread elements over the available parallelism.
 */
private <K, V> void translateReshuffle(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform proto = pipeline.getComponents().getTransformsOrThrow(id);
  DataStream<WindowedValue<KV<K, V>>> reshuffled =
      context
          .getDataStreamOrThrow(Iterables.getOnlyElement(proto.getInputsMap().values()))
          .rebalance();
  context.addDataStream(Iterables.getOnlyElement(proto.getOutputsMap().values()), reshuffled);
}