/**
 * Splits a 100-record synthetic source using the given split-point frequency and verifies
 * that each set of split bundles reads the same data as the unsplit source.
 *
 * @param splitPointFrequency how often split points occur, in records
 */
private void testSplitIntoBundlesP(long splitPointFrequency) throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  testSourceOptions.splitPointFrequencyRecords = splitPointFrequency;
  testSourceOptions.numRecords = 100;
  SyntheticBoundedSource source = new SyntheticBoundedSource(testSourceOptions);
  // Check equivalence at several desired bundle sizes.
  for (long desiredBundleSize : new long[] {10, 40, 100}) {
    SourceTestUtils.assertSourcesEqualReferenceSource(
        source, source.split(desiredBundleSize, options), options);
  }
}
@Test public void testInitialSplitAutoModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeTxt = writeToFile(LARGE, tempFolder, "large.txt", UNCOMPRESSED); // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // At least 2 splits and they are equal to reading the whole file. assertThat(splits, hasSize(greaterThan(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); }
@Test public void testInitialSplitAutoModeGz() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeGz = writeToFile(LARGE, tempFolder, "large.gz", GZIP); // Sanity check: file is at least 2 bundles long. assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeGz.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even in AUTO mode, since it is a gzip file. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); }
@Test public void testInitialSplitGzipModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeTxt = writeToFile(LARGE, tempFolder, "large.txt", UNCOMPRESSED); // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).withCompression(GZIP).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even though splittable text file, since using GZIP mode. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); }
/** An 8-element CreateSource split at 12 bytes yields 3 sub-sources equal to the original. */
@Test
public void testSourceSplit() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  CreateSource<Integer> source =
      CreateSource.fromIterable(
          ImmutableList.of(1, 2, 3, 4, 5, 6, 7, 8), BigEndianIntegerCoder.of());
  List<? extends BoundedSource<Integer>> pieces = source.split(12, options);
  assertThat(pieces, hasSize(3));
  SourceTestUtils.assertSourcesEqualReferenceSource(source, pieces, options);
}
/**
 * Requesting 3 initial inputs from the InputProvider should yield between 3 and 4 bundles,
 * each holding exactly one source shard, whose sources together read the same data as the
 * original unsplit source.
 */
@Test public void getInitialInputsSplitsIntoBundles() throws Exception {
  // Have the mocked evaluation context hand out real root bundles from the bundle factory.
  when(context.createRootBundle()).thenAnswer(invocation -> bundleFactory.createRootBundle());
  Collection<CommittedBundle<?>> initialInputs =
      new BoundedReadEvaluatorFactory.InputProvider(context, options)
          .getInitialInputs(longsProducer, 3);
  // The provider may over-split slightly; accept 3 or 4 bundles for a target of 3.
  assertThat(initialInputs, hasSize(allOf(greaterThanOrEqualTo(3), lessThanOrEqualTo(4))));
  Collection<BoundedSource<Long>> sources = new ArrayList<>();
  for (CommittedBundle<?> initialInput : initialInputs) {
    // Each bundle must contain exactly one shard, placed in the global window at the
    // minimum timestamp (the conventional placement for root-source shards).
    Iterable<WindowedValue<BoundedSourceShard<Long>>> shards =
        (Iterable) initialInput.getElements();
    WindowedValue<BoundedSourceShard<Long>> shard = Iterables.getOnlyElement(shards);
    assertThat(shard.getWindows(), Matchers.contains(GlobalWindow.INSTANCE));
    assertThat(shard.getTimestamp(), equalTo(BoundedWindow.TIMESTAMP_MIN_VALUE));
    sources.add(shard.getValue().getSource());
  }
  // The union of the shard sources must be equivalent to the unsplit reference source.
  SourceTestUtils.assertSourcesEqualReferenceSource(
      source, (List<? extends BoundedSource<Long>>) sources, PipelineOptionsFactory.create());
}
/** Splitting an empty CreateSource must still produce sources equal to the original. */
@Test
public void testSourceSplitEmpty() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  CreateSource<Integer> source =
      CreateSource.fromIterable(ImmutableList.of(), BigEndianIntegerCoder.of());
  List<? extends BoundedSource<Integer>> pieces = source.split(12, options);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, pieces, options);
}
/** Splitting a CreateSource of null (Void) elements must preserve the source's contents. */
@Test
public void testSourceSplitVoid() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  CreateSource<Void> source =
      CreateSource.fromIterable(Lists.newArrayList(null, null, null, null, null), VoidCoder.of());
  List<? extends BoundedSource<Void>> pieces = source.split(3, options);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, pieces, options);
}
/** Tests reading all rows from a split table. */ @Test public void testReadingWithSplits() throws Exception { final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; final int numRows = 1500; final int numSamples = 10; final long bytesPerRow = 100L; // Set up test table data and sample row keys for size estimation and splitting. makeTableData(table, numRows); service.setupSampleRowKeys(table, numSamples, bytesPerRow); // Generate source and split it. BigtableSource source = new BigtableSource( config.withTableId(ValueProvider.StaticValueProvider.of(table)), null /*filter*/, Arrays.asList(ByteKeyRange.ALL_KEYS), null /*size*/); List<BigtableSource> splits = source.split(numRows * bytesPerRow / numSamples, null /* options */); // Test num splits and split equality. assertThat(splits, hasSize(numSamples)); assertSourcesEqualReferenceSource(source, splits, null /* options */); }
/** Tests reading all rows from a sub-split table. */ @Test public void testReadingWithSubSplits() throws Exception { final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; final int numRows = 1000; final int numSamples = 10; final int numSplits = 20; final long bytesPerRow = 100L; // Set up test table data and sample row keys for size estimation and splitting. makeTableData(table, numRows); service.setupSampleRowKeys(table, numSamples, bytesPerRow); // Generate source and split it. BigtableSource source = new BigtableSource( config.withTableId(ValueProvider.StaticValueProvider.of(table)), null /*filter*/, Arrays.asList(ByteKeyRange.ALL_KEYS), null /*size*/); List<BigtableSource> splits = source.split(numRows * bytesPerRow / numSplits, null); // Test num splits and split equality. assertThat(splits, hasSize(numSplits)); assertSourcesEqualReferenceSource(source, splits, null /* options */); }
SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
@Test public void testSplits() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0); File file = createFileWithData("tmp.seq", expectedResults); HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from( file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class); // Assert that the source produces the expected records assertEquals(expectedResults, readFromSource(source, options)); // Split with a small bundle size (has to be at least size of sync interval) List<? extends BoundedSource<KV<IntWritable, Text>>> splits = source .splitIntoBundles(SequenceFile.SYNC_INTERVAL, options); assertTrue(splits.size() > 2); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); int nonEmptySplits = 0; for (BoundedSource<KV<IntWritable, Text>> subSource : splits) { if (readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; } } assertTrue(nonEmptySplits > 2); }
List<? extends BoundedSource<String>> splits = initialSource.split(desiredBundleSizeBytes, options); SourceTestUtils.assertSourcesEqualReferenceSource(initialSource, splits, options); long indexSize = BoundedElasticsearchSource.estimateIndexSize(connectionConfiguration);
/** Tests reading all rows from a sub-split table. */ @Test public void testReadingWithFilterAndSubSplits() throws Exception { final String table = "TEST-FILTER-SUB-SPLITS"; final int numRows = 1700; final int numSamples = 10; final int numSplits = 20; final long bytesPerRow = 100L; // Set up test table data and sample row keys for size estimation and splitting. makeTableData(table, numRows); service.setupSampleRowKeys(table, numSamples, bytesPerRow); // Generate source and split it. RowFilter filter = RowFilter.newBuilder().setRowKeyRegexFilter(ByteString.copyFromUtf8(".*17.*")).build(); BigtableSource source = new BigtableSource( config.withTableId(ValueProvider.StaticValueProvider.of(table)), filter, Arrays.asList(ByteKeyRange.ALL_KEYS), null /*size*/); List<BigtableSource> splits = source.split(numRows * bytesPerRow / numSplits, null); // Test num splits and split equality. assertThat(splits, hasSize(numSplits)); assertSourcesEqualReferenceSource(source, splits, null /* options */); }
assertSourcesEqualReferenceSource(referenceSource, splits, null /* options */);
assertSourcesEqualReferenceSource(referenceSource, splits, null /* options */);
/**
 * With 10 forced initial bundles and a constant bundle-size distribution of 1, every
 * resulting sub-source must cover exactly one record.
 */
@Test
public void testSplitIntoSingleRecordBundles() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  SyntheticSourceOptions sourceOptions = new SyntheticSourceOptions();
  sourceOptions.numRecords = 10;
  sourceOptions.setSeed(123456);
  sourceOptions.bundleSizeDistribution = fromRealDistribution(new ConstantRealDistribution(1.0));
  sourceOptions.forceNumInitialBundles = 10;
  SyntheticBoundedSource source = new SyntheticBoundedSource(sourceOptions);
  List<SyntheticBoundedSource> bundles = source.split(42L, options);
  for (SyntheticBoundedSource bundle : bundles) {
    bundle.validate();
    // Each bundle spans exactly one record.
    assertEquals(1, bundle.getEndOffset() - bundle.getStartOffset());
  }
  SourceTestUtils.assertSourcesEqualReferenceSource(source, bundles, options);
}
}
expectedKeyRangesAfterReducedSplits.toArray())); assertAllSourcesHaveSingleAdjacentRanges(reducedSplits); assertSourcesEqualReferenceSource(source, reducedSplits, null /* options */);