@Override
public List<Readable<Object>> getReadables(final int desiredNumOfSplits) throws Exception {
  final List<Readable<Object>> readables = new ArrayList<>();
  source.split(desiredNumOfSplits, null)
      .forEach(unboundedSource -> readables.add(new UnboundedSourceReadable<>(unboundedSource)));
  return readables;
}
List<? extends Source<T>> split(final PipelineOptions options) throws Exception {
  final List<MicrobatchSource<T, CheckpointMarkT>> result = new ArrayList<>();
  final List<? extends UnboundedSource<T, CheckpointMarkT>> splits =
      source.split(numInitialSplits, options);
  final int numSplits = splits.size();
  final long[] numRecords = splitNumRecords(maxNumRecords, numSplits);
  for (int i = 0; i < numSplits; i++) {
    // splits must be stable, and cannot change during consecutive executions
    // for example: Kafka should not add partitions if more than one topic is read.
    result.add(
        new MicrobatchSource<>(
            splits.get(i), maxReadTime, 1, numRecords[i], i, sourceId, readerCacheInterval));
  }
  return result;
}
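The splitNumRecords helper used above divides the overall record budget across the splits. A minimal sketch of such an even division, assuming the helper simply spreads maxNumRecords uniformly and gives any remainder to the first splits (an illustrative stand-in, not the actual Beam implementation):

// Hypothetical stand-in for a splitNumRecords-style helper: spreads a total
// record budget across numSplits splits, handing the remainder one record at
// a time to the first splits. Illustrative only.
static long[] splitNumRecordsSketch(long maxNumRecords, int numSplits) {
  long[] perSplit = new long[numSplits];
  long base = maxNumRecords / numSplits;
  long remainder = maxNumRecords % numSplits;
  for (int i = 0; i < numSplits; i++) {
    perSplit[i] = base + (i < remainder ? 1 : 0);
  }
  return perSplit;
}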
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {}); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() { }); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@SuppressWarnings("unchecked")
public UnboundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    UnboundedSource<OutputT, CheckpointMarkT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);

  if (source.requiresDeduping()) {
    LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
  }

  Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
  if (checkpointMarkCoder == null) {
    LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
    checkpointCoder = null;
  } else {
    Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
        (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {});
    checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
  }

  // get the splits early. we assume that the generated splits are stable,
  // this is necessary so that the mapping of state to source is correct
  // when restoring
  splitSources = source.split(parallelism, pipelineOptions);

  shutdownOnFinalWatermark =
      pipelineOptions.as(FlinkPipelineOptions.class).isShutdownSourcesOnFinalWatermark();
}
@ProcessElement
public void process(
    @Element Shard<T> shard, OutputReceiver<Shard<T>> out, PipelineOptions options)
    throws Exception {
  int numInitialSplits = numInitialSplits(shard.getMaxNumRecords());
  List<? extends UnboundedSource<T, ?>> splits =
      shard.getSource().split(numInitialSplits, options);
  int numSplits = splits.size();
  long[] numRecords = splitNumRecords(shard.getMaxNumRecords(), numSplits);
  for (int i = 0; i < numSplits; i++) {
    out.output(
        shard
            .toBuilder()
            .setSource(splits.get(i))
            .setMaxNumRecords(numRecords[i])
            .setMaxReadTime(shard.getMaxReadTime())
            .build());
  }
}
}
int desiredNumSplits =
    getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
  encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
}
@Override
public Collection<CommittedBundle<UnboundedSourceShard<T, ?>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  UnboundedSource<T, ?> source = ReadTranslation.unboundedSourceFromTransform(transform);
  List<? extends UnboundedSource<T, ?>> splits = source.split(targetParallelism, options);
  UnboundedReadDeduplicator deduplicator =
      source.requiresDeduping()
          ? UnboundedReadDeduplicator.CachedIdDeduplicator.create()
          : NeverDeduplicator.create();
  ImmutableList.Builder<CommittedBundle<UnboundedSourceShard<T, ?>>> initialShards =
      ImmutableList.builder();
  for (UnboundedSource<T, ?> split : splits) {
    UnboundedSourceShard<T, ?> shard = UnboundedSourceShard.unstarted(split, deduplicator);
    initialShards.add(
        evaluationContext
            .<UnboundedSourceShard<T, ?>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(shard))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE));
  }
  return initialShards.build();
}
}
@Test
@Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class})
public void testUnboundedSourceSplits() throws Exception {
  long numElements = 1000;
  int numSplits = 10;

  UnboundedSource<Long, ?> initial = CountingSource.unbounded();
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
@Test
public void testUnboundedSourceSplits() throws Exception {
  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
mkKafkaReadTransform(numElements, new ValueAsTimestampFn())
    .makeSource()
    .split(1, PipelineOptionsFactory.create())
    .get(0);
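The snippets above all stop at the list of split sources; to actually consume records, a runner turns each split into a reader via createReader. A minimal standalone sketch of that last step, using CountingSource.unbounded() as a stand-in source and reading a handful of elements from the first split (the class name, split count, and record limit are illustrative assumptions; split, createReader, start, advance, and getCurrent are the standard UnboundedSource/UnboundedReader API):

import java.util.List;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class SplitAndReadSketch {
  public static void main(String[] args) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    UnboundedSource<Long, ?> source = CountingSource.unbounded();

    // Ask for 4 splits; a source is free to return fewer or more than requested.
    List<? extends UnboundedSource<Long, ?>> splits = source.split(4, options);

    // Start the first split from scratch (null checkpoint) and read a few elements.
    UnboundedSource.UnboundedReader<Long> reader = splits.get(0).createReader(options, null);
    try {
      boolean available = reader.start();
      for (int i = 0; i < 5 && available; i++) {
        System.out.println(reader.getCurrent() + " @ " + reader.getCurrentTimestamp());
        available = reader.advance();
      }
    } finally {
      reader.close();
    }
  }
}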