/**
 * Creates a reader by forwarding the request to the wrapped {@code source}.
 *
 * @param options pipeline options handed through unchanged to the wrapped source
 * @return the reader produced by the wrapped source
 * @throws IOException if the wrapped source fails to create its reader
 */
@Override
protected Source.Reader<T> createReader(PipelineOptions options) throws IOException {
  Source.Reader<T> reader = source.createReader(options);
  return reader;
}
}
/**
 * Builds a wrapper around a bounded source and splits the source eagerly.
 *
 * <p>The desired bundle size is the source's estimated size divided by {@code parallelism}
 * (integer division).
 *
 * @param stepName name of the step, stored as-is
 * @param pipelineOptions options used for size estimation and splitting; also stored in
 *     serializable form
 * @param source the bounded source to split
 * @param parallelism divisor applied to the estimated size to obtain the desired bundle size
 * @throws Exception if size estimation or splitting fails
 */
@SuppressWarnings("unchecked")
public BoundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    BoundedSource<OutputT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);

  long estimatedSizeBytes = source.getEstimatedSizeBytes(pipelineOptions);
  long desiredBundleSize = estimatedSizeBytes / parallelism;

  // Split up front: the generated splits are assumed to be stable, which is required so
  // that the mapping of state to source is still correct when restoring.
  splitSources = source.split(desiredBundleSize, pipelineOptions);
}
/**
 * Registers display data by forwarding the builder to the wrapped {@code boundedSource}.
 *
 * @param builder the display-data builder passed through to the wrapped source
 */
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  boundedSource.populateDisplayData(builder);
}
/**
 * Validates the source and then creates the primitive output collection for this read.
 *
 * <p>The output is created on the input's pipeline with the global default windowing
 * strategy, marked as bounded, and uses the source's output coder.
 *
 * @param input the pipeline start marker supplying the pipeline
 * @return the primitive output {@code PCollection}
 */
@Override
public final PCollection<T> expand(PBegin input) {
  // Validation runs first so an invalid source fails before any output is created.
  source.validate();

  PCollection<T> output =
      PCollection.createPrimitiveOutputInternal(
          input.getPipeline(),
          WindowingStrategy.globalDefault(),
          IsBounded.BOUNDED,
          source.getOutputCoder());
  return output;
}
/**
 * Splits an XML file of 10 random trains with a desired bundle size of 100 (minimum bundle
 * size 10), expects more than two splits, and checks that reading every split yields the
 * original trains in any order.
 */
@Test
public void testSplitWithEmptyBundles() throws Exception {
  List<Train> trains = generateRandomTrainList(10);
  File xmlFile = createRandomTrainXML("temp.xml", trains);

  BoundedSource<Train> source =
      XmlIO.<Train>read()
          .from(xmlFile.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();

  List<? extends BoundedSource<Train>> bundles = source.split(100, null);
  assertTrue(bundles.size() > 2);

  // Read every bundle, including any that turn out to be empty, into one list.
  List<Train> readTrains = new ArrayList<>();
  for (BoundedSource<Train> bundle : bundles) {
    readTrains.addAll(readEverythingFromReader(bundle.createReader(null)));
  }

  assertThat(trainsToStrings(trains), containsInAnyOrder(trainsToStrings(readTrains).toArray()));
}
/**
 * Returns the size estimate reported by the wrapped {@code delegate}.
 *
 * @param options pipeline options passed through to the delegate
 * @return the delegate's estimated size in bytes
 * @throws Exception if the delegate fails to produce an estimate
 */
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  long estimatedSizeBytes = delegate.getEstimatedSizeBytes(options);
  return estimatedSizeBytes;
}
/**
 * Splits the delegate source and returns its splits in random order.
 *
 * <p>The delegate's result is copied into a fresh list before shuffling.
 * {@link Collections#shuffle(List)} reorders the list in place, so shuffling the delegate's
 * list directly would fail with {@link UnsupportedOperationException} when the delegate
 * returns an unmodifiable list, and would otherwise mutate a list the delegate may still own.
 *
 * @param desiredBundleSizeBytes desired bundle size, passed through to the delegate
 * @param options pipeline options, passed through to the delegate
 * @return a new, mutable list containing the delegate's splits in shuffled order
 * @throws Exception if the delegate fails to split
 */
@Override
public List<? extends BoundedSource<T>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  // Fully qualified so this block does not depend on an ArrayList import being present.
  List<BoundedSource<T>> shuffled =
      new java.util.ArrayList<BoundedSource<T>>(
          delegate.split(desiredBundleSizeBytes, options));
  Collections.shuffle(shuffled);
  return shuffled;
}
/** Validates this source by running the wrapped {@code delegate}'s validation. */
@Override
public void validate() {
  delegate.validate();
}
/**
 * Returns the output coder of the wrapped {@code boundedSource}.
 *
 * @return the coder reported by the wrapped source
 */
@Override
public Coder<T> getOutputCoder() {
  Coder<T> coder = boundedSource.getOutputCoder();
  return coder;
}
@Test public void testXMLWithSplits() throws Exception { String fileName = "temp.xml"; List<Train> trains = generateRandomTrainList(100); File file = createRandomTrainXML(fileName, trains); BoundedSource<Train> source = XmlIO.<Train>read() .from(file.toPath().toString()) .withRootElement("trains") .withRecordElement("train") .withRecordClass(Train.class) .withMinBundleSize(10) .createSource(); List<? extends BoundedSource<Train>> splits = source.split(256, null); // Not a trivial split assertTrue(splits.size() > 2); List<Train> results = new ArrayList<>(); for (BoundedSource<Train> split : splits) { results.addAll(readEverythingFromReader(split.createReader(null))); } assertThat(trainsToStrings(trains), containsInAnyOrder(trainsToStrings(results).toArray())); }
/**
 * Returns the size estimate reported by the wrapped {@code boundedSource}.
 *
 * @param options pipeline options passed through to the wrapped source
 * @return the wrapped source's estimated size in bytes
 * @throws Exception if the wrapped source fails to produce an estimate
 */
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  long estimatedSizeBytes = boundedSource.getEstimatedSizeBytes(options);
  return estimatedSizeBytes;
}
/**
 * Splits {@code source} and reads every resulting split, concatenating the records in split
 * order.
 *
 * @param source the source to split and read
 * @param desiredBundleSizeBytes desired bundle size passed to {@code source.split}
 * @param options pipeline options used both for splitting and for reading each split
 * @param <T> element type produced by the source
 * @return all records read from all splits, in the order the splits were returned
 * @throws Exception if splitting or reading fails
 */
public static <T> List<T> readFromSplitsOfSource(
    BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  List<T> collected = Lists.newArrayList();
  List<? extends BoundedSource<T>> bundles = source.split(desiredBundleSizeBytes, options);
  for (BoundedSource<T> bundle : bundles) {
    List<T> bundleRecords = readFromSource(bundle, options);
    collected.addAll(bundleRecords);
  }
  return collected;
}
/** Validates this source by running the wrapped {@code boundedSource}'s validation. */
@Override
public void validate() {
  boundedSource.validate();
}
/**
 * Returns the output coder of the {@code underlying} source.
 *
 * @return the coder reported by the underlying source
 */
@Override
public Coder<T> getOutputCoder() {
  Coder<T> coder = underlying.getOutputCoder();
  return coder;
}
}
/**
 * Creates a reader by forwarding the request to the wrapped {@code delegate}.
 *
 * @param options pipeline options passed through to the delegate
 * @return the reader produced by the delegate
 * @throws IOException if the delegate fails to create its reader
 */
@Override
public BoundedReader<T> createReader(PipelineOptions options) throws IOException {
  BoundedReader<T> reader = delegate.createReader(options);
  return reader;
}
/**
 * Splits the source into bundles and wraps each bundle in a {@code BoundedSourceReadable}.
 *
 * <p>The desired bundle size is the source's estimated size divided by
 * {@code desiredNumOfSplits} (integer division). The estimate is computed once, so the value
 * that is logged is the same value that drives the split.
 *
 * <p>NOTE(review): a {@code desiredNumOfSplits} of 0 makes the division throw
 * {@link ArithmeticException}; callers are presumably expected to pass a positive value.
 *
 * @param desiredNumOfSplits divisor applied to the estimated size to obtain the bundle size
 * @return one readable per split returned by the source
 * @throws Exception if size estimation or splitting fails
 */
@Override
public List<Readable<WindowedValue<O>>> getReadables(final int desiredNumOfSplits)
    throws Exception {
  final List<Readable<WindowedValue<O>>> readables = new ArrayList<>();

  // Ask for the estimate a single time: it may be costly to compute, and reusing it
  // guarantees the logged value and the value used for splitting cannot diverge.
  final long estimatedSizeBytes = source.getEstimatedSizeBytes(null);
  LOG.info("estimate: {}", estimatedSizeBytes);
  LOG.info("desired: {}", desiredNumOfSplits);

  source
      .split(estimatedSizeBytes / desiredNumOfSplits, null)
      .forEach(boundedSource -> readables.add(new BoundedSourceReadable<>(boundedSource)));
  return readables;
}
/**
 * Writes {@code tinyXML} to a temporary file, splits it with a desired bundle size of 50
 * (minimum bundle size 10), expects more than two splits, and checks that reading every
 * split yields exactly the trains Thomas, Henry and James in any order.
 */
@Test
public void testSplitWithEmptyBundleAtEnd() throws Exception {
  File xmlFile = tempFolder.newFile("trainXMLTiny");
  Files.write(xmlFile.toPath(), tinyXML.getBytes(StandardCharsets.UTF_8));

  BoundedSource<Train> source =
      XmlIO.<Train>read()
          .from(xmlFile.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();

  List<? extends BoundedSource<Train>> bundles = source.split(50, null);
  assertTrue(bundles.size() > 2);

  List<Train> readTrains = new ArrayList<>();
  for (BoundedSource<Train> bundle : bundles) {
    readTrains.addAll(readEverythingFromReader(bundle.createReader(null)));
  }

  // Only the name is set on the expected trains; number, color and location stay unset.
  List<Train> expectedTrains =
      ImmutableList.of(
          new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("Henry", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("James", Train.TRAIN_NUMBER_UNDEFINED, null, null));

  assertThat(
      trainsToStrings(expectedTrains),
      containsInAnyOrder(trainsToStrings(readTrains).toArray()));
}
/**
 * Reports statistics for the initial source, based on its estimated size.
 *
 * <p>Only the total input size is provided; the record count and average record width are
 * reported as unknown. If the size estimate cannot be obtained, the failure is logged with
 * its stack trace and {@code null} is returned.
 *
 * @param baseStatistics previously gathered statistics; not used by this implementation
 * @return statistics carrying the estimated total input size, or {@code null} if the
 *     estimate could not be computed
 * @throws IOException declared by the overridden method; not thrown by this body directly
 */
@Override
public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
  try {
    final long estimatedSize = initialSource.getEstimatedSizeBytes(options);

    return new BaseStatistics() {
      @Override
      public long getTotalInputSize() {
        return estimatedSize;
      }

      @Override
      public long getNumberOfRecords() {
        return BaseStatistics.NUM_RECORDS_UNKNOWN;
      }

      @Override
      public float getAverageRecordWidth() {
        return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
      }
    };
  } catch (Exception e) {
    // The exception is passed as the trailing throwable argument, so the message must not
    // contain a "{}" placeholder: it would be printed literally instead of being filled in.
    LOG.warn("Could not read Source statistics", e);
  }

  return null;
}
/**
 * Splits {@code source} using {@code bundleSize} and the context's pipeline options, and
 * emits each resulting split as an output element.
 *
 * @param ctxt the process context supplying pipeline options and receiving the outputs
 * @throws Exception if splitting the source fails
 */
@ProcessElement
public void splitSource(ProcessContext ctxt) throws Exception {
  for (BoundedSource<T> bundle : source.split(bundleSize, ctxt.getPipelineOptions())) {
    ctxt.output(bundle);
  }
}
}
/**
 * Registers display data by forwarding the builder to the wrapped {@code delegate}.
 *
 * @param builder the display-data builder passed through to the delegate
 */
@Override
public void populateDisplayData(Builder builder) {
  delegate.populateDisplayData(builder);
}
}