@Override
public List<Readable<Object>> getReadables(final int desiredNumOfSplits) throws Exception {
  final List<Readable<Object>> readables = new ArrayList<>();
  source.split(desiredNumOfSplits, null)
      .forEach(unboundedSource -> readables.add(new UnboundedSourceReadable<>(unboundedSource)));
  return readables;
}
List<? extends Source<T>> split(final PipelineOptions options) throws Exception {
  final List<MicrobatchSource<T, CheckpointMarkT>> result = new ArrayList<>();
  final List<? extends UnboundedSource<T, CheckpointMarkT>> splits =
      source.split(numInitialSplits, options);
  final int numSplits = splits.size();
  final long[] numRecords = splitNumRecords(maxNumRecords, numSplits);
  for (int i = 0; i < numSplits; i++) {
    // splits must be stable, and cannot change during consecutive executions
    // for example: Kafka should not add partitions if more than one topic is read.
    result.add(
        new MicrobatchSource<>(
            splits.get(i), maxReadTime, 1, numRecords[i], i, sourceId, readerCacheInterval));
  }
  return result;
}
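The splitNumRecords helper used above divides the overall record budget across the splits. A minimal sketch of such an even division, assuming the helper simply spreads maxNumRecords uniformly and gives any remainder to the first splits (an illustrative stand-in, not the actual Beam implementation):

// Hypothetical stand-in for a splitNumRecords-style helper: spreads a total
// record budget across numSplits splits, handing the remainder one record at
// a time to the first splits. Illustrative only.
static long[] splitNumRecordsSketch(long maxNumRecords, int numSplits) {
  long[] perSplit = new long[numSplits];
  long base = maxNumRecords / numSplits;
  long remainder = maxNumRecords % numSplits;
  for (int i = 0; i < numSplits; i++) {
    perSplit[i] = base + (i < remainder ? 1 : 0);
  }
  return perSplit;
}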
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {}); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() { }); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@SuppressWarnings("unchecked")
public UnboundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    UnboundedSource<OutputT, CheckpointMarkT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);

  if (source.requiresDeduping()) {
    LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
  }

  Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
  if (checkpointMarkCoder == null) {
    LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
    checkpointCoder = null;
  } else {
    Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
        (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {});
    checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
  }

  // get the splits early. we assume that the generated splits are stable,
  // this is necessary so that the mapping of state to source is correct
  // when restoring
  splitSources = source.split(parallelism, pipelineOptions);

  shutdownOnFinalWatermark =
      pipelineOptions.as(FlinkPipelineOptions.class).isShutdownSourcesOnFinalWatermark();
}
@ProcessElement
public void process(
    @Element Shard<T> shard, OutputReceiver<Shard<T>> out, PipelineOptions options)
    throws Exception {
  int numInitialSplits = numInitialSplits(shard.getMaxNumRecords());
  List<? extends UnboundedSource<T, ?>> splits =
      shard.getSource().split(numInitialSplits, options);
  int numSplits = splits.size();
  long[] numRecords = splitNumRecords(shard.getMaxNumRecords(), numSplits);
  for (int i = 0; i < numSplits; i++) {
    out.output(
        shard
            .toBuilder()
            .setSource(splits.get(i))
            .setMaxNumRecords(numRecords[i])
            .setMaxReadTime(shard.getMaxReadTime())
            .build());
  }
}
}
int desiredNumSplits =
    getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
  encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
}
@Override
public Collection<CommittedBundle<UnboundedSourceShard<T, ?>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  UnboundedSource<T, ?> source = ReadTranslation.unboundedSourceFromTransform(transform);
  List<? extends UnboundedSource<T, ?>> splits = source.split(targetParallelism, options);
  UnboundedReadDeduplicator deduplicator =
      source.requiresDeduping()
          ? UnboundedReadDeduplicator.CachedIdDeduplicator.create()
          : NeverDeduplicator.create();
  ImmutableList.Builder<CommittedBundle<UnboundedSourceShard<T, ?>>> initialShards =
      ImmutableList.builder();
  for (UnboundedSource<T, ?> split : splits) {
    UnboundedSourceShard<T, ?> shard = UnboundedSourceShard.unstarted(split, deduplicator);
    initialShards.add(
        evaluationContext
            .<UnboundedSourceShard<T, ?>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(shard))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE));
  }
  return initialShards.build();
}
}
@Test
@Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class})
public void testUnboundedSourceSplits() throws Exception {
  long numElements = 1000;
  int numSplits = 10;

  UnboundedSource<Long, ?> initial = CountingSource.unbounded();
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
@Test
public void testUnboundedSourceSplits() throws Exception {
  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
mkKafkaReadTransform(numElements, new ValueAsTimestampFn())
    .makeSource()
    .split(1, PipelineOptionsFactory.create())
    .get(0);
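The snippets above all stop at the list of split sources; to actually consume records, a runner turns each split into a reader via createReader. A minimal standalone sketch of that last step, using CountingSource.unbounded() as a stand-in source and reading a handful of elements from the first split (the class name, split count, and record limit are illustrative assumptions; split, createReader, start, advance, and getCurrent are the standard UnboundedSource/UnboundedReader API):

import java.util.List;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class SplitAndReadSketch {
  public static void main(String[] args) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    UnboundedSource<Long, ?> source = CountingSource.unbounded();

    // Ask for 4 splits; a source is free to return fewer or more than requested.
    List<? extends UnboundedSource<Long, ?>> splits = source.split(4, options);

    // Start the first split from scratch (null checkpoint) and read a few elements.
    UnboundedSource.UnboundedReader<Long> reader = splits.get(0).createReader(options, null);
    try {
      boolean available = reader.start();
      for (int i = 0; i < 5 && available; i++) {
        System.out.println(reader.getCurrent() + " @ " + reader.getCurrentTimestamp());
        available = reader.advance();
      }
    } finally {
      reader.close();
    }
  }
}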