/**
 * Shuffles the delegate source's splits.
 *
 * <p>The delegate's splits are copied into a new list before shuffling, so the list returned by
 * the delegate is never modified. This matters because the delegate is not required to return a
 * modifiable list, and {@link Collections#shuffle(List)} throws
 * {@link UnsupportedOperationException} when it has to swap elements of a list that does not
 * support {@code set}.
 *
 * @param desiredBundleSizeBytes desired bundle size, passed through to the delegate unchanged
 * @param options pipeline options, passed through to the delegate unchanged
 * @return a new list holding the delegate's splits in random order
 * @throws Exception if the delegate fails to split
 */
@Override
public List<? extends BoundedSource<T>> split(long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  // Fully qualified so that this method does not depend on an ArrayList import being present.
  List<BoundedSource<T>> shuffled =
      new java.util.ArrayList<>(delegate.split(desiredBundleSizeBytes, options));
  Collections.shuffle(shuffled);
  return shuffled;
}
/**
 * Splits {@code source} using the given desired bundle size, reads every resulting split with
 * {@code readFromSource}, and returns all elements in the order the splits were returned.
 *
 * @param source the source to split and read
 * @param desiredBundleSizeBytes desired bundle size handed to {@code source.split}
 * @param options pipeline options used both for splitting and for reading
 * @return the concatenation of the elements read from each split
 * @throws Exception if splitting or reading fails
 */
public static <T> List<T> readFromSplitsOfSource(
    BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  List<? extends BoundedSource<T>> subSources = source.split(desiredBundleSizeBytes, options);
  List<T> elements = Lists.newArrayList();
  for (BoundedSource<T> subSource : subSources) {
    List<T> fromSubSource = readFromSource(subSource, options);
    elements.addAll(fromSubSource);
  }
  return elements;
}
/**
 * Splits the source and wraps every split in a {@code BoundedSourceReadable}.
 *
 * <p>The desired bundle size passed to {@code source.split} is the source's estimated size
 * divided by {@code desiredNumOfSplits} (integer division).
 *
 * @param desiredNumOfSplits the number of splits to aim for; a value of zero makes the division
 *     fail with an {@link ArithmeticException}
 * @return one readable per split returned by the source, in the order returned
 * @throws Exception if size estimation or splitting fails
 */
@Override
public List<Readable<WindowedValue<O>>> getReadables(final int desiredNumOfSplits)
    throws Exception {
  // Ask for the estimate only once: the value that is logged is then guaranteed to be the value
  // used for the computation, and the source is not queried twice.
  final long estimatedSizeBytes = source.getEstimatedSizeBytes(null);
  LOG.info("estimate: {}", estimatedSizeBytes);
  LOG.info("desired: {}", desiredNumOfSplits);

  final List<Readable<WindowedValue<O>>> readables = new ArrayList<>();
  source
      .split(estimatedSizeBytes / desiredNumOfSplits, null)
      .forEach(boundedSource -> readables.add(new BoundedSourceReadable<>(boundedSource)));
  return readables;
}
/**
 * Splits the initial source into shards and wraps each shard, together with its position in the
 * returned list, in a {@code SourceInputSplit}.
 *
 * <p>The desired shard size is the source's estimated size divided by {@code numSplits}.
 *
 * @param numSplits the number of splits used to derive the desired shard size
 * @return one input split per shard, in the order the shards were returned
 * @throws IOException wrapping any exception raised while estimating, splitting or wrapping
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    long bytesPerSplit = initialSource.getEstimatedSizeBytes(options) / numSplits;
    List<? extends Source<T>> shards = initialSource.split(bytesPerSplit, options);

    // Generic array creation is not possible, hence the raw array and the suppressed warning.
    SourceInputSplit<T>[] result = new SourceInputSplit[shards.size()];
    int index = 0;
    for (Source<T> shard : shards) {
      result[index] = new SourceInputSplit<>(shard, index);
      index++;
    }
    return result;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Produces the input splits for this format by splitting the initial source.
 *
 * <p>The source is asked to split into shards of roughly {@code estimatedSize / numSplits}
 * bytes; shard {@code i} of the result becomes {@code SourceInputSplit} number {@code i}.
 *
 * @param numSplits divisor applied to the estimated source size to get the desired shard size
 * @return an array with one entry per shard
 * @throws IOException if anything fails; the original exception is attached as the cause
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    long estimatedSizeBytes = initialSource.getEstimatedSizeBytes(options);
    List<? extends Source<T>> shards =
        initialSource.split(estimatedSizeBytes / numSplits, options);

    int shardCount = shards.size();
    SourceInputSplit<T>[] inputSplits = new SourceInputSplit[shardCount];
    int position = 0;
    while (position < shardCount) {
      inputSplits[position] = new SourceInputSplit<>(shards.get(position), position);
      position++;
    }
    return inputSplits;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Turns the shards of the initial source into an array of {@code SourceInputSplit}s.
 *
 * <p>The desired shard size given to the source is its estimated size divided by
 * {@code numSplits}. Each shard is paired with its index in the list returned by the source.
 *
 * @param numSplits the split count from which the desired shard size is derived
 * @return the wrapped shards, in the order returned by the source
 * @throws IOException wrapping whatever exception occurred during estimation or splitting
 */
@Override
@SuppressWarnings("unchecked")
public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
  try {
    List<? extends Source<T>> shards =
        initialSource.split(initialSource.getEstimatedSizeBytes(options) / numSplits, options);

    SourceInputSplit<T>[] wrapped = new SourceInputSplit[shards.size()];
    for (int shardIndex = 0; shardIndex < wrapped.length; shardIndex++) {
      Source<T> shard = shards.get(shardIndex);
      wrapped[shardIndex] = new SourceInputSplit<>(shard, shardIndex);
    }
    return wrapped;
  } catch (Exception e) {
    throw new IOException("Could not create input splits from Source.", e);
  }
}
/**
 * Splits {@code source} using {@code bundleSize} and the pipeline options of the current context,
 * and outputs every resulting split as its own element.
 *
 * @param ctxt the process context that supplies the pipeline options and receives the outputs
 * @throws Exception if the source fails to split
 */
@ProcessElement
public void splitSource(ProcessContext ctxt) throws Exception {
  source.split(bundleSize, ctxt.getPipelineOptions()).forEach(ctxt::output);
}
}
/**
 * Creates a wrapper around {@code source} and splits it immediately.
 *
 * <p>The desired bundle size is the source's estimated size divided by {@code parallelism}.
 *
 * @param stepName name of the step, stored as-is
 * @param pipelineOptions options used for estimation and splitting; also stored in serializable
 *     form
 * @param source the bounded source to split
 * @param parallelism divisor applied to the estimated size to obtain the desired bundle size
 * @throws Exception if size estimation or splitting fails
 */
@SuppressWarnings("unchecked")
public BoundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    BoundedSource<OutputT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);

  long estimatedSizeBytes = source.getEstimatedSizeBytes(pipelineOptions);
  long bytesPerSplit = estimatedSizeBytes / parallelism;

  // Split eagerly, at construction time. The generated splits are assumed to be stable, which
  // is required for the state-to-source mapping to still be correct when restoring.
  splitSources = source.split(bytesPerSplit, pipelineOptions);
}
/**
 * Splits the wrapped bounded source and adapts every split to an unbounded source.
 *
 * <p>The desired bundle size is the bounded source's estimated size divided by
 * {@code desiredNumSplits}. If that size is not positive, or if any exception is thrown while
 * estimating or splitting, a warning is logged and this adapter is returned as the only split.
 *
 * @param desiredNumSplits the number of splits to aim for
 * @param options pipeline options used for estimation and splitting
 * @return one adapter per split of the bounded source, or a singleton list containing this
 *     adapter when the initial split is skipped
 */
@Override
public List<BoundedToUnboundedSourceAdapter<T>> split(
    int desiredNumSplits, PipelineOptions options) throws Exception {
  try {
    long bytesPerSplit = boundedSource.getEstimatedSizeBytes(options) / desiredNumSplits;
    if (bytesPerSplit > 0) {
      return boundedSource
          .split(bytesPerSplit, options)
          .stream()
          .map(subSource -> new BoundedToUnboundedSourceAdapter<>(subSource))
          .collect(Collectors.toList());
    }
    LOG.warn(
        "BoundedSource {} cannot estimate its size, skips the initial splits.", boundedSource);
    return ImmutableList.of(this);
  } catch (Exception e) {
    LOG.warn("Exception while splitting {}, skips the initial splits.", boundedSource, e);
    return ImmutableList.of(this);
  }
}
/**
 * Splits the source and returns one {@code SourcePartition} per split.
 *
 * <p>The desired split size is chosen as follows:
 *
 * <ul>
 *   <li>{@code bundleSize > 0}: {@code bundleSize} is used;
 *   <li>{@code bundleSize == 0}: the source's estimated size divided by {@code numPartitions} is
 *       used; if computing that fails, {@code DEFAULT_BUNDLE_SIZE} is used instead;
 *   <li>{@code bundleSize < 0}: {@code DEFAULT_BUNDLE_SIZE} is used.
 * </ul>
 *
 * @return the partitions, indexed in the order the splits were returned by the source
 * @throws RuntimeException wrapping any exception raised while splitting the source or building
 *     the partitions
 */
@Override
public Partition[] getPartitions() {
  try {
    long desiredSizeBytes = (bundleSize > 0) ? bundleSize : DEFAULT_BUNDLE_SIZE;
    if (bundleSize == 0) {
      try {
        desiredSizeBytes = source.getEstimatedSizeBytes(options.get()) / numPartitions;
      } catch (Exception e) {
        // Best effort: keep the default size, but pass the exception as the trailing argument
        // so the reason for the fallback (including its stack trace) is not lost.
        LOG.warn(
            "Failed to get estimated bundle size for source {}, using default bundle "
                + "size of {} bytes.",
            source,
            DEFAULT_BUNDLE_SIZE,
            e);
      }
    }

    List<? extends Source<T>> partitionedSources = source.split(desiredSizeBytes, options.get());
    Partition[] partitions = new SourcePartition[partitionedSources.size()];
    for (int i = 0; i < partitionedSources.size(); i++) {
      partitions[i] = new SourcePartition<>(id(), i, partitionedSources.get(i));
    }
    return partitions;
  } catch (Exception e) {
    throw new RuntimeException(
        "Failed to create partitions for source " + source.getClass().getSimpleName(), e);
  }
}
@Override public List<? extends BoundedSource<T>> split( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { // Must have more than checkState( desiredBundleSizeBytes < getEstimatedSizeBytes(options), "Must split into more than one source"); return underlying.split(desiredBundleSizeBytes, options); }
/**
 * Writes 10 random trains to an XML file, splits the resulting source with a desired bundle size
 * of 100 bytes (minimum bundle size 10), and checks that there are more than two splits and that
 * reading all splits yields exactly the trains that were written.
 */
@Test
public void testSplitWithEmptyBundles() throws Exception {
  String fileName = "temp.xml";
  List<Train> trains = generateRandomTrainList(10);
  File file = createRandomTrainXML(fileName, trains);

  BoundedSource<Train> source =
      XmlIO.<Train>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();
  List<? extends BoundedSource<Train>> splits = source.split(100, null);

  assertTrue(splits.size() > 2);

  List<Train> results = new ArrayList<>();
  for (BoundedSource<Train> split : splits) {
    results.addAll(readEverythingFromReader(split.createReader(null)));
  }

  // Actual value first, expected items inside the matcher, so that a failure message reports
  // the trains that were read as "was" and the generated trains as "expected".
  assertThat(trainsToStrings(results), containsInAnyOrder(trainsToStrings(trains).toArray()));
}
@Test public void testXMLWithSplits() throws Exception { String fileName = "temp.xml"; List<Train> trains = generateRandomTrainList(100); File file = createRandomTrainXML(fileName, trains); BoundedSource<Train> source = XmlIO.<Train>read() .from(file.toPath().toString()) .withRootElement("trains") .withRecordElement("train") .withRecordClass(Train.class) .withMinBundleSize(10) .createSource(); List<? extends BoundedSource<Train>> splits = source.split(256, null); // Not a trivial split assertTrue(splits.size() > 2); List<Train> results = new ArrayList<>(); for (BoundedSource<Train> split : splits) { results.addAll(readEverythingFromReader(split.createReader(null))); } assertThat(trainsToStrings(trains), containsInAnyOrder(trainsToStrings(results).toArray())); }
/**
 * Writes {@code tinyXML} to a temporary file, splits the resulting source with a desired bundle
 * size of 50 bytes (minimum bundle size 10), and checks that there are more than two splits and
 * that reading all splits yields exactly the trains Thomas, Henry and James.
 */
@Test
public void testSplitWithEmptyBundleAtEnd() throws Exception {
  File file = tempFolder.newFile("trainXMLTiny");
  Files.write(file.toPath(), tinyXML.getBytes(StandardCharsets.UTF_8));

  BoundedSource<Train> source =
      XmlIO.<Train>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();
  List<? extends BoundedSource<Train>> splits = source.split(50, null);

  assertTrue(splits.size() > 2);

  List<Train> results = new ArrayList<>();
  for (BoundedSource<Train> split : splits) {
    results.addAll(readEverythingFromReader(split.createReader(null)));
  }

  List<Train> expectedResults =
      ImmutableList.of(
          new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("Henry", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("James", Train.TRAIN_NUMBER_UNDEFINED, null, null));

  // Actual value first, expected items inside the matcher, so that a failure message reports
  // the trains that were read as "was" and the expected trains as "expected".
  assertThat(
      trainsToStrings(results),
      containsInAnyOrder(trainsToStrings(expectedResults).toArray()));
}
/**
 * Splits the bounded source of {@code transform} and commits one root bundle per split.
 *
 * <p>The desired bundle size is the source's estimated size divided by
 * {@code targetParallelism}. Every split is wrapped in a {@code BoundedSourceShard}, placed in
 * the global window, and committed with {@code BoundedWindow.TIMESTAMP_MAX_VALUE}.
 *
 * @param transform the applied read transform from which the bounded source is extracted
 * @param targetParallelism divisor applied to the estimated size to get the bytes per bundle
 * @return one committed bundle per split, in the order the splits were returned
 * @throws Exception if size estimation or splitting fails
 */
@Override
public Collection<CommittedBundle<BoundedSourceShard<T>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  BoundedSource<T> source = ReadTranslation.boundedSourceFromTransform(transform);
  long bytesPerBundle = source.getEstimatedSizeBytes(options) / targetParallelism;

  ImmutableList.Builder<CommittedBundle<BoundedSourceShard<T>>> shards = ImmutableList.builder();
  for (BoundedSource<T> bundle : source.split(bytesPerBundle, options)) {
    shards.add(
        evaluationContext
            .<BoundedSourceShard<T>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(BoundedSourceShard.of(bundle)))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE));
  }
  return shards.build();
}
}
/**
 * Checks the source returned by {@code SourceTestUtils.toUnsplittableSource}: splitting returns
 * only the source itself, {@code splitAtFraction} on its reader returns {@code null}, the
 * reported fraction consumed goes from 0 to 1, and the reader produces the same 100 elements as
 * the base source.
 */
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedSource<Long> baseSource = CountingSource.upTo(100);
  BoundedSource<Long> unsplittableSource = SourceTestUtils.toUnsplittableSource(baseSource);

  // Initial splitting must hand back the very same source and nothing else.
  List<?> splits = unsplittableSource.split(1, options);
  assertEquals(1, splits.size());
  assertEquals(unsplittableSource, splits.get(0));

  BoundedReader<Long> unsplittableReader = unsplittableSource.createReader(options);
  assertEquals(0, unsplittableReader.getFractionConsumed(), 1e-15);

  Set<Long> expected = Sets.newHashSet(SourceTestUtils.readFromSource(baseSource, options));

  // Read part of the input, attempt a dynamic split, then drain the rest.
  Set<Long> actual = Sets.newHashSet();
  actual.addAll(SourceTestUtils.readNItemsFromUnstartedReader(unsplittableReader, 40));
  assertNull(unsplittableReader.splitAtFraction(0.5));
  actual.addAll(SourceTestUtils.readRemainingFromReader(unsplittableReader, true /* started */));

  assertEquals(1, unsplittableReader.getFractionConsumed(), 1e-15);
  assertEquals(100, actual.size());
  assertEquals(expected, actual);
}
}
@Test @Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class}) public void testBoundedSourceSplits() throws Exception { long numElements = 1000; long numSplits = 10; long splitSizeBytes = numElements * 8 / numSplits; // 8 bytes per long element. BoundedSource<Long> initial = CountingSource.upTo(numElements); List<? extends BoundedSource<Long>> splits = initial.split(splitSizeBytes, p.getOptions()); assertEquals("Expected exact splitting", numSplits, splits.size()); // Assemble all the splits into one flattened PCollection, also verify their sizes. PCollectionList<Long> pcollections = PCollectionList.empty(p); for (int i = 0; i < splits.size(); ++i) { BoundedSource<Long> split = splits.get(i); pcollections = pcollections.and(p.apply("split" + i, Read.from(split))); assertEquals( "Expected even splitting", splitSizeBytes, split.getEstimatedSizeBytes(p.getOptions())); } PCollection<Long> input = pcollections.apply(Flatten.pCollections()); addCountingAsserts(input, numElements); p.run(); }
PipelineOptions options = PipelineOptionsFactory.create(); List<? extends BoundedSource<Long>> splits = source.split(source.getEstimatedSizeBytes(options) / 2, options);
.createSource(); List<? extends BoundedSource<Train>> splits = fileSource.split(file.length() / 3, null); for (BoundedSource<Train> splitSource : splits) { int numItems = readEverythingFromReader(splitSource.createReader(null)).size();