/** {@inheritDoc} */
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  // Pure pass-through: the delegate owns the size estimation logic.
  final long estimatedBytes = delegate.getEstimatedSizeBytes(options);
  return estimatedBytes;
}
/** Forwards size estimation to the wrapped bounded source. */
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  final long estimatedBytes = boundedSource.getEstimatedSizeBytes(options);
  return estimatedBytes;
}
/**
 * Exposes the wrapped Beam source's own size estimate as Flink input statistics. A generic
 * Beam source cannot report record count or average record width, so those are left unknown.
 *
 * @param baseStatistics cached statistics supplied by Flink; not consulted here
 * @return statistics carrying the estimated total input size, or {@code null} if the
 *     estimate could not be obtained
 */
@Override
public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
  try {
    final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
    return new BaseStatistics() {
      @Override
      public long getTotalInputSize() {
        return estimatedSize;
      }

      @Override
      public long getNumberOfRecords() {
        return BaseStatistics.NUM_RECORDS_UNKNOWN;
      }

      @Override
      public float getAverageRecordWidth() {
        return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
      }
    };
  } catch (Exception e) {
    // Pass the exception as the trailing throwable argument instead of a "{}" placeholder so
    // SLF4J logs the full stack trace rather than just e.toString().
    LOG.warn("Could not read Source statistics.", e);
  }
  return null;
}
/**
 * Exposes the wrapped Beam source's own size estimate as Flink input statistics. A generic
 * Beam source cannot report record count or average record width, so those are left unknown.
 *
 * @param baseStatistics cached statistics supplied by Flink; not consulted here
 * @return statistics carrying the estimated total input size, or {@code null} if the
 *     estimate could not be obtained
 */
@Override
public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
  try {
    final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
    return new BaseStatistics() {
      @Override
      public long getTotalInputSize() {
        return estimatedSize;
      }

      @Override
      public long getNumberOfRecords() {
        return BaseStatistics.NUM_RECORDS_UNKNOWN;
      }

      @Override
      public float getAverageRecordWidth() {
        return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
      }
    };
  } catch (Exception e) {
    // Pass the exception as the trailing throwable argument instead of a "{}" placeholder so
    // SLF4J logs the full stack trace rather than just e.toString().
    LOG.warn("Could not read Source statistics.", e);
  }
  return null;
}
/**
 * Exposes the wrapped Beam source's own size estimate as Flink input statistics. A generic
 * Beam source cannot report record count or average record width, so those are left unknown.
 *
 * @param baseStatistics cached statistics supplied by Flink; not consulted here
 * @return statistics carrying the estimated total input size, or {@code null} if the
 *     estimate could not be obtained
 */
@Override
public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
  try {
    final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
    return new BaseStatistics() {
      @Override
      public long getTotalInputSize() {
        return estimatedSize;
      }

      @Override
      public long getNumberOfRecords() {
        return BaseStatistics.NUM_RECORDS_UNKNOWN;
      }

      @Override
      public float getAverageRecordWidth() {
        return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
      }
    };
  } catch (Exception e) {
    // Pass the exception as the trailing throwable argument instead of a "{}" placeholder so
    // SLF4J logs the full stack trace rather than just e.toString().
    LOG.warn("Could not read Source statistics.", e);
  }
  return null;
}
/** Forwards size estimation to the underlying source. */
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  final long estimatedBytes = underlying.getEstimatedSizeBytes(options);
  return estimatedBytes;
}
/**
 * Splits the bounded source into roughly {@code desiredNumOfSplits} readables, sizing each
 * bundle at (estimated size / desired splits).
 *
 * @param desiredNumOfSplits the number of splits requested by the scheduler
 * @return one readable per split produced by the source
 * @throws Exception if size estimation or splitting fails
 */
@Override
public List<Readable<WindowedValue<O>>> getReadables(final int desiredNumOfSplits)
    throws Exception {
  // Query the (potentially expensive) size estimate once instead of twice; the original
  // called getEstimatedSizeBytes both for logging and again for the split computation.
  final long estimatedSizeBytes = source.getEstimatedSizeBytes(null);
  LOG.info("estimate: {}", estimatedSizeBytes);
  LOG.info("desired: {}", desiredNumOfSplits);
  final List<Readable<WindowedValue<O>>> readables = new ArrayList<>();
  source
      .split(estimatedSizeBytes / desiredNumOfSplits, null)
      .forEach(boundedSource -> readables.add(new BoundedSourceReadable<>(boundedSource)));
  return readables;
}
/**
 * Creates Flink input splits by asking the Beam source to split itself into bundles of
 * roughly equal size.
 *
 * @param numSplits the number of splits requested by Flink
 * @return one {@link SourceInputSplit} per shard produced by the source
 * @throws IOException if the source cannot be split
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    // Clamp to >= 1: a zero (unknown) size estimate, or one smaller than numSplits, would
    // otherwise pass a non-positive desired bundle size to Source#split.
    long desiredSizeBytes =
        Math.max(1L, initialSource.getEstimatedSizeBytes(options) / numSplits);
    List<? extends Source<T>> shards = initialSource.split(desiredSizeBytes, options);
    int numShards = shards.size();
    SourceInputSplit<T>[] sourceInputSplits = new SourceInputSplit[numShards];
    for (int i = 0; i < numShards; i++) {
      sourceInputSplits[i] = new SourceInputSplit<>(shards.get(i), i);
    }
    return sourceInputSplits;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Creates Flink input splits by asking the Beam source to split itself into bundles of
 * roughly equal size.
 *
 * @param numSplits the number of splits requested by Flink
 * @return one {@link SourceInputSplit} per shard produced by the source
 * @throws IOException if the source cannot be split
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    // Clamp to >= 1: a zero (unknown) size estimate, or one smaller than numSplits, would
    // otherwise pass a non-positive desired bundle size to Source#split.
    long desiredSizeBytes =
        Math.max(1L, initialSource.getEstimatedSizeBytes(options) / numSplits);
    List<? extends Source<T>> shards = initialSource.split(desiredSizeBytes, options);
    int numShards = shards.size();
    SourceInputSplit<T>[] sourceInputSplits = new SourceInputSplit[numShards];
    for (int i = 0; i < numShards; i++) {
      sourceInputSplits[i] = new SourceInputSplit<>(shards.get(i), i);
    }
    return sourceInputSplits;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Creates Flink input splits by asking the Beam source to split itself into bundles of
 * roughly equal size.
 *
 * @param numSplits the number of splits requested by Flink
 * @return one {@link SourceInputSplit} per shard produced by the source
 * @throws IOException if the source cannot be split
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    // Clamp to >= 1: a zero (unknown) size estimate, or one smaller than numSplits, would
    // otherwise pass a non-positive desired bundle size to Source#split.
    long desiredSizeBytes =
        Math.max(1L, initialSource.getEstimatedSizeBytes(options) / numSplits);
    List<? extends Source<T>> shards = initialSource.split(desiredSizeBytes, options);
    int numShards = shards.size();
    SourceInputSplit<T>[] sourceInputSplits = new SourceInputSplit[numShards];
    for (int i = 0; i < numShards; i++) {
      sourceInputSplits[i] = new SourceInputSplit<>(shards.get(i), i);
    }
    return sourceInputSplits;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Wraps a bounded Beam source, eagerly splitting it into one bundle per unit of parallelism.
 *
 * @param stepName name of the pipeline step, used for identification
 * @param pipelineOptions options to serialize and to pass to size estimation / splitting
 * @param source the bounded source to wrap
 * @param parallelism the operator parallelism to split for
 * @throws Exception if size estimation or splitting fails
 */
@SuppressWarnings("unchecked")
public BoundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    BoundedSource<OutputT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);
  // Clamp to >= 1: a zero (unknown) size estimate, or one smaller than the parallelism,
  // would otherwise pass a non-positive desired bundle size to BoundedSource#split.
  long desiredBundleSize =
      Math.max(1L, source.getEstimatedSizeBytes(pipelineOptions) / parallelism);
  // get the splits early. we assume that the generated splits are stable,
  // this is necessary so that the mapping of state to source is correct
  // when restoring
  splitSources = source.split(desiredBundleSize, pipelineOptions);
}
/**
 * Kicks off a background task that produces a residual split at the reader's halfway point,
 * but only when the source is large enough to make dynamic splitting worthwhile.
 *
 * @return a future yielding the residual source, or an already-completed future holding
 *     {@code null} when the source is below the minimum dynamic-split size
 */
private Future<BoundedSource<OutputT>> startDynamicSplitThread(
    BoundedSource<OutputT> source, BoundedReader<OutputT> reader) throws Exception {
  final boolean worthSplitting =
      source.getEstimatedSizeBytes(options) > minimumDynamicSplitSize;
  if (!worthSplitting) {
    // Too small to bother: hand back a completed future carrying no split.
    final SettableFuture<BoundedSource<OutputT>> noSplit = SettableFuture.create();
    noSplit.set(null);
    return noSplit;
  }
  return produceSplitExecutor.submit(new GenerateSplitAtHalfwayPoint<>(reader));
}
/**
 * Performs the initial split of the underlying bounded source, wrapping each resulting split
 * in a {@link BoundedToUnboundedSourceAdapter}. Falls back to returning this adapter alone
 * when the source cannot estimate its size or splitting fails.
 */
@Override
public List<BoundedToUnboundedSourceAdapter<T>> split(
    int desiredNumSplits, PipelineOptions options) throws Exception {
  try {
    final long bundleSizeBytes =
        boundedSource.getEstimatedSizeBytes(options) / desiredNumSplits;
    if (bundleSizeBytes <= 0) {
      LOG.warn(
          "BoundedSource {} cannot estimate its size, skips the initial splits.",
          boundedSource);
      return ImmutableList.of(this);
    }
    return boundedSource
        .split(bundleSizeBytes, options)
        .stream()
        .map(BoundedToUnboundedSourceAdapter::new)
        .collect(Collectors.toList());
  } catch (Exception e) {
    LOG.warn("Exception while splitting {}, skips the initial splits.", boundedSource, e);
    return ImmutableList.of(this);
  }
}
/**
 * Computes the Spark partitions for this source by splitting it into bundles. The desired
 * bundle size is the user-configured {@code bundleSize} if positive, otherwise the source's
 * own size estimate divided by {@code numPartitions}, falling back to
 * {@code DEFAULT_BUNDLE_SIZE} when estimation fails.
 *
 * @return one {@link SourcePartition} per split produced by the source
 */
@Override
public Partition[] getPartitions() {
  try {
    long desiredSizeBytes = (bundleSize > 0) ? bundleSize : DEFAULT_BUNDLE_SIZE;
    if (bundleSize == 0) {
      try {
        desiredSizeBytes = source.getEstimatedSizeBytes(options.get()) / numPartitions;
      } catch (Exception e) {
        // Pass the exception to the logger; the original call dropped it entirely, hiding
        // the reason the estimate failed.
        LOG.warn(
            "Failed to get estimated bundle size for source {}, using default bundle "
                + "size of {} bytes.",
            source,
            DEFAULT_BUNDLE_SIZE,
            e);
      }
    }
    List<? extends Source<T>> partitionedSources = source.split(desiredSizeBytes, options.get());
    Partition[] partitions = new SourcePartition[partitionedSources.size()];
    for (int i = 0; i < partitionedSources.size(); i++) {
      partitions[i] = new SourcePartition<>(id(), i, partitionedSources.get(i));
    }
    return partitions;
  } catch (Exception e) {
    throw new RuntimeException(
        "Failed to create partitions for source " + source.getClass().getSimpleName(), e);
  }
}
/**
 * Produces the initial root bundles for a bounded read: splits the source into roughly
 * {@code targetParallelism} shards and commits each as its own root bundle.
 *
 * @param transform the applied bounded-read transform to extract the source from
 * @param targetParallelism the number of shards to aim for
 * @return one committed root bundle per source split
 * @throws Exception if source extraction, size estimation, or splitting fails
 */
@Override
public Collection<CommittedBundle<BoundedSourceShard<T>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  BoundedSource<T> source = ReadTranslation.boundedSourceFromTransform(transform);
  long estimatedBytes = source.getEstimatedSizeBytes(options);
  // Clamp to >= 1: an estimate of 0 (unknown) or below targetParallelism would otherwise
  // request zero-byte bundles from BoundedSource#split.
  long bytesPerBundle = Math.max(1L, estimatedBytes / targetParallelism);
  List<? extends BoundedSource<T>> bundles = source.split(bytesPerBundle, options);
  ImmutableList.Builder<CommittedBundle<BoundedSourceShard<T>>> shards =
      ImmutableList.builder();
  for (BoundedSource<T> bundle : bundles) {
    CommittedBundle<BoundedSourceShard<T>> inputShard =
        evaluationContext
            .<BoundedSourceShard<T>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(BoundedSourceShard.of(bundle)))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE);
    shards.add(inputShard);
  }
  return shards.build();
}
}
@Test public void testSplitEstimatedSize() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0); File file = createFileWithData("tmp.avro", expectedResults); HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from(file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class); long originalSize = source.getEstimatedSizeBytes(options); long splitTotalSize = 0; List<? extends BoundedSource<KV<IntWritable, Text>>> splits = source.splitIntoBundles( SequenceFile.SYNC_INTERVAL, options ); for (BoundedSource<KV<IntWritable, Text>> splitSource : splits) { splitTotalSize += splitSource.getEstimatedSizeBytes(options); } // Assert that the estimated size of the whole is the sum of its parts assertEquals(originalSize, splitTotalSize); }
@Test @Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class}) public void testBoundedSourceSplits() throws Exception { long numElements = 1000; long numSplits = 10; long splitSizeBytes = numElements * 8 / numSplits; // 8 bytes per long element. BoundedSource<Long> initial = CountingSource.upTo(numElements); List<? extends BoundedSource<Long>> splits = initial.split(splitSizeBytes, p.getOptions()); assertEquals("Expected exact splitting", numSplits, splits.size()); // Assemble all the splits into one flattened PCollection, also verify their sizes. PCollectionList<Long> pcollections = PCollectionList.empty(p); for (int i = 0; i < splits.size(); ++i) { BoundedSource<Long> split = splits.get(i); pcollections = pcollections.and(p.apply("split" + i, Read.from(split))); assertEquals( "Expected even splitting", splitSizeBytes, split.getEstimatedSizeBytes(p.getOptions())); } PCollection<Long> input = pcollections.apply(Flatten.pCollections()); addCountingAsserts(input, numElements); p.run(); }
// Exercise initial splitting: request bundles of half the source's own size estimate,
// which should yield roughly two splits. (Fragment — enclosing method not visible here.)
PipelineOptions options = PipelineOptionsFactory.create(); List<? extends BoundedSource<Long>> splits = source.split(source.getEstimatedSizeBytes(options) / 2, options);
// NOTE(review): 118 presumably reflects the byte size of the BigQuery test fixture — verify
// against the fixture's schema/row data if this assertion starts failing.
assertEquals(118, bqSource.getEstimatedSizeBytes(options));
// NOTE(review): 108 presumably reflects the byte size of the BigQuery test fixture — verify
// against the fixture's schema/row data if this assertion starts failing.
assertEquals(108, bqSource.getEstimatedSizeBytes(options));