/**
 * Builds a 100-record {@link SyntheticBoundedSource} and checks, for several desired bundle sizes,
 * that the sources produced by {@code split} together read the same records as the unsplit source.
 *
 * @param splitPointFrequency value stored in {@code testSourceOptions.splitPointFrequencyRecords}
 *     before the source is constructed
 */
private void testSplitIntoBundlesP(long splitPointFrequency) throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  testSourceOptions.splitPointFrequencyRecords = splitPointFrequency;
  testSourceOptions.numRecords = 100;
  SyntheticBoundedSource syntheticSource = new SyntheticBoundedSource(testSourceOptions);

  // Same three bundle sizes, in the same order, as separate back-to-back assertions would use.
  for (long desiredBundleSizeBytes : new long[] {10, 40, 100}) {
    SourceTestUtils.assertSourcesEqualReferenceSource(
        syntheticSource,
        syntheticSource.split(desiredBundleSizeBytes, pipelineOptions),
        pipelineOptions);
  }
}
/**
 * Splits {@code source} with the given desired bundle size and reads every resulting split,
 * concatenating the records in the order the splits are returned.
 *
 * @param source the source to split and read
 * @param desiredBundleSizeBytes bundle size passed to {@code source.split}
 * @param options pipeline options passed to both {@code split} and {@code readFromSource}
 * @return all records read from all splits, in split order
 */
public static <T> List<T> readFromSplitsOfSource(
    BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  List<T> allRecords = Lists.newArrayList();
  for (BoundedSource<T> bundle : source.split(desiredBundleSizeBytes, options)) {
    List<T> bundleRecords = readFromSource(bundle, options);
    allRecords.addAll(bundleRecords);
  }
  return allRecords;
}
private void testSplitAtFractionP(long splitPointFrequency) throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); testSourceOptions.splitPointFrequencyRecords = splitPointFrequency; SyntheticBoundedSource source = new SyntheticBoundedSource(testSourceOptions); SourceTestUtils.assertSplitAtFractionExhaustive(source, options); // Can't split if already consumed. SourceTestUtils.assertSplitAtFractionFails(source, 5, 0.3, options); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.3, options); }
/**
 * Reads {@code source} in full to obtain the reference items, then checks the behavior of
 * {@code splitAtFraction(splitFraction)} on a reader that has read {@code
 * numItemsToReadBeforeSplit} items, by delegating to {@code assertSplitAtFractionBehaviorImpl}
 * together with {@code expectedOutcome}.
 *
 * <p>See {@link #assertSplitAtFractionSucceedsAndConsistent} for the consistency requirements on
 * a successful split.
 *
 * @return the {@code SplitAtFractionResult} produced by the delegate
 */
public static <T> SplitAtFractionResult assertSplitAtFractionBehavior(
    BoundedSource<T> source,
    int numItemsToReadBeforeSplit,
    double splitFraction,
    ExpectedSplitOutcome expectedOutcome,
    PipelineOptions options)
    throws Exception {
  List<T> referenceItems = readFromSource(source, options);
  return assertSplitAtFractionBehaviorImpl(
      source,
      referenceItems,
      numItemsToReadBeforeSplit,
      splitFraction,
      expectedOutcome,
      options);
}
/**
 * Checks that an unstarted {@code reader} yields the same records, in any order, as a fresh read
 * of the source it reports via {@code getCurrentSource()}. Records are compared as structural
 * values built with the source's output coder.
 *
 * @param reader an unstarted reader; it is read to completion by this method
 * @param options pipeline options used when reading the reader's current source
 */
public static <T> void assertUnstartedReaderReadsSameAsItsSource(
    BoundedSource.BoundedReader<T> reader, PipelineOptions options) throws Exception {
  Coder<T> outputCoder = reader.getCurrentSource().getOutputCoder();

  // Drain the reader first, then read its current source independently.
  List<T> fromReader = readFromUnstartedReader(reader);
  List<T> fromSource = readFromSource(reader.getCurrentSource(), options);

  List<ReadableStructuralValue<T>> readerValues = createStructuralValues(outputCoder, fromReader);
  List<ReadableStructuralValue<T>> sourceValues = createStructuralValues(outputCoder, fromSource);

  assertThat(sourceValues, containsInAnyOrder(readerValues.toArray()));
}
PipelineOptions options) throws Exception { List<T> primaryItems = readFromSource(primary, options); if (residual != null) { List<T> residualItems = readFromSource(residual, options); List<T> totalItems = new ArrayList<>(); totalItems.addAll(primaryItems); splitFraction, numItemsToReadBeforeSplit, source, primary, residual); Coder<T> coder = primary.getOutputCoder(); List<ReadableStructuralValue<T>> primaryValues = createStructuralValues(coder, primaryItems); List<ReadableStructuralValue<T>> currentValues = createStructuralValues(coder, currentItems); List<ReadableStructuralValue<T>> expectedValues = createStructuralValues(coder, expectedItems); List<ReadableStructuralValue<T>> totalValues = createStructuralValues(coder, totalItems); assertListsEqualInOrder( errorMsgForPrimarySourceComp, "current", currentValues, "primary", primaryValues); assertListsEqualInOrder( errorMsgForTotalSourceComp, "total", expectedValues, "primary+residual", totalValues); return new SplitAtFractionResult(primaryItems.size(), residualItems.size());
@Test public void testSplits() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0); File file = createFileWithData("tmp.seq", expectedResults); HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from( file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class); // Assert that the source produces the expected records assertEquals(expectedResults, readFromSource(source, options)); // Split with a small bundle size (has to be at least size of sync interval) List<? extends BoundedSource<KV<IntWritable, Text>>> splits = source .splitIntoBundles(SequenceFile.SYNC_INTERVAL, options); assertTrue(splits.size() > 2); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); int nonEmptySplits = 0; for (BoundedSource<KV<IntWritable, Text>> subSource : splits) { if (readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; } } assertTrue(nonEmptySplits > 2); }
List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null); for (BoundedSource<FixedRecord> subSource : splits) { int items = SourceTestUtils.readFromSource(subSource, null).size(); SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null); SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent( subSource, DEFAULT_RECORD_COUNT / 100, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent( subSource, DEFAULT_RECORD_COUNT / 10, 0.1, null); SourceTestUtils.assertSplitAtFractionFails( subSource, DEFAULT_RECORD_COUNT / 10 + 1, 0.1, null); SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null); SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null); SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null);
@Test public void testSplitAtFraction() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); File file = createFileWithData("file", createStringDataset(3, 100)); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null); // Shouldn't be able to split while unstarted. assertSplitAtFractionFails(source, 0, 0.7, options); assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.7, options); assertSplitAtFractionSucceedsAndConsistent(source, 30, 0.7, options); assertSplitAtFractionFails(source, 0, 0.0, options); assertSplitAtFractionFails(source, 70, 0.3, options); assertSplitAtFractionFails(source, 100, 1.0, options); assertSplitAtFractionFails(source, 100, 0.99, options); assertSplitAtFractionSucceedsAndConsistent(source, 100, 0.995, options); }
/**
 * Verifies that the source returned by {@code SourceTestUtils.toUnsplittableSource} refuses both
 * initial splitting ({@code split} returns only itself) and dynamic splitting ({@code
 * splitAtFraction} returns null), while still reading the same elements as the base source.
 */
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedSource<Long> baseSource = CountingSource.upTo(100);
  BoundedSource<Long> unsplittableSource = SourceTestUtils.toUnsplittableSource(baseSource);

  List<?> splits = unsplittableSource.split(1, options);
  assertEquals(1, splits.size());
  assertEquals(unsplittableSource, splits.get(0));

  // The reader is a closeable resource; close it even when an assertion below fails.
  try (BoundedReader<Long> unsplittableReader = unsplittableSource.createReader(options)) {
    assertEquals(0, unsplittableReader.getFractionConsumed(), 1e-15);

    Set<Long> expected = Sets.newHashSet(SourceTestUtils.readFromSource(baseSource, options));
    Set<Long> actual = Sets.newHashSet();

    actual.addAll(SourceTestUtils.readNItemsFromUnstartedReader(unsplittableReader, 40));
    // A dynamic split request part-way through reading must be rejected.
    assertNull(unsplittableReader.splitAtFraction(0.5));
    actual.addAll(
        SourceTestUtils.readRemainingFromReader(unsplittableReader, true /* started */));
    assertEquals(1, unsplittableReader.getFractionConsumed(), 1e-15);

    assertEquals(100, actual.size());
    // Both operands are already sets, so they can be compared directly.
    assertEquals(expected, actual);
  }
}
}
/**
 * Runs the exhaustive split-at-fraction check against a {@code CreateSource} holding 25 random
 * integers encoded with {@code VarIntCoder}.
 */
@Test
public void testSourceSplitAtFraction() throws Exception {
  final int elementCount = 25;
  Random generator = new Random();
  List<Integer> values = new ArrayList<>();
  for (int added = 0; added < elementCount; added++) {
    values.add(generator.nextInt());
  }

  CreateSource<Integer> createSource = CreateSource.fromIterable(values, VarIntCoder.of());
  SourceTestUtils.assertSplitAtFractionExhaustive(createSource, PipelineOptionsFactory.create());
}
}
List<T> expectedItems = readFromSource(source, options); assertFalse("Empty source", expectedItems.isEmpty()); assertFalse("Source reads a single item", expectedItems.size() == 1); for (int i = 0; i < expectedItems.size(); i++) { SplitFractionStatistics stats = new SplitFractionStatistics(); assertSplitAtFractionBinary(source, expectedItems, i, 0.0, null, 1.0, null, options, stats); if (!stats.successfulFractions.isEmpty()) { anySuccessfulFractions = true; break; if (assertSplitAtFractionConcurrent( executor, source, expectedItems, i, minNonTrivialFraction, options)) { haveSuccess = true;
try { List<T> items = readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplitting); unblockSplitter.countDown(); items.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplitting > 0)); return items; } finally { verifySingleSplitAtFractionResult( source, expectedItems,
/**
 * Asserts that reading every source in {@code sources} yields, taken together and in any order,
 * exactly the records read from {@code referenceSource}. Each source's output coder must also
 * equal the reference source's output coder.
 *
 * @param referenceSource the source whose records are the expected result
 * @param sources the sources (for example, splits of the reference) to read and combine
 * @param options pipeline options used for every read
 */
public static <T> void assertSourcesEqualReferenceSource(
    BoundedSource<T> referenceSource,
    List<? extends BoundedSource<T>> sources,
    PipelineOptions options)
    throws Exception {
  Coder<T> referenceCoder = referenceSource.getOutputCoder();
  List<T> expectedRecords = readFromSource(referenceSource, options);

  List<T> combinedRecords = new ArrayList<>();
  for (BoundedSource<T> candidate : sources) {
    String coderMismatchMessage =
        "Coder type for source "
            + candidate
            + " is not compatible with Coder type for referenceSource "
            + referenceSource;
    assertThat(coderMismatchMessage, candidate.getOutputCoder(), equalTo(referenceCoder));
    combinedRecords.addAll(readFromSource(candidate, options));
  }

  // Compare via coder-derived structural values rather than the raw records.
  List<ReadableStructuralValue<T>> combinedValues =
      createStructuralValues(referenceCoder, combinedRecords);
  List<ReadableStructuralValue<T>> expectedValues =
      createStructuralValues(referenceCoder, expectedRecords);
  assertThat(combinedValues, containsInAnyOrder(expectedValues.toArray()));
}
/**
 * Checks the estimated size reported by a {@link SyntheticBoundedSource} built from
 * {@code testSourceOptions}, and that an unstarted reader of it reads the same records as its
 * source.
 *
 * @param splitPointFrequency value stored in {@code testSourceOptions.splitPointFrequencyRecords}
 *     before the source is constructed
 */
private void testSourceAndReadersWorkP(long splitPointFrequency) throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  testSourceOptions.splitPointFrequencyRecords = splitPointFrequency;
  SyntheticBoundedSource syntheticSource = new SyntheticBoundedSource(testSourceOptions);

  // NOTE(review): presumably records * (key bytes + value bytes) from the fixture options; confirm.
  long expectedSizeBytes = 10 * (10 + 20);
  assertEquals(expectedSizeBytes, syntheticSource.getEstimatedSizeBytes(pipelineOptions));

  SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(
      syntheticSource.createReader(pipelineOptions), pipelineOptions);
}
@Test public void generatesInitialSplits() throws Exception { when(context.createRootBundle()).thenAnswer(invocation -> bundleFactory.createRootBundle()); int numSplits = 5; Collection<CommittedBundle<?>> initialInputs = new UnboundedReadEvaluatorFactory.InputProvider(context, options) .getInitialInputs(graph.getProducer(longs), numSplits); // CountingSource.unbounded has very good splitting behavior assertThat(initialInputs, hasSize(numSplits)); int readPerSplit = 100; int totalSize = numSplits * readPerSplit; Set<Long> expectedOutputs = ContiguousSet.create(Range.closedOpen(0L, (long) totalSize), DiscreteDomain.longs()); Collection<Long> readItems = new ArrayList<>(totalSize); for (CommittedBundle<?> initialInput : initialInputs) { CommittedBundle<UnboundedSourceShard<Long, ?>> shardBundle = (CommittedBundle<UnboundedSourceShard<Long, ?>>) initialInput; WindowedValue<UnboundedSourceShard<Long, ?>> shard = Iterables.getOnlyElement(shardBundle.getElements()); assertThat(shard.getTimestamp(), equalTo(BoundedWindow.TIMESTAMP_MIN_VALUE)); assertThat(shard.getWindows(), Matchers.contains(GlobalWindow.INSTANCE)); UnboundedSource<Long, ?> shardSource = shard.getValue().getSource(); readItems.addAll( SourceTestUtils.readNItemsFromUnstartedReader( shardSource.createReader( PipelineOptionsFactory.create(), null /* No starting checkpoint */), readPerSplit)); } assertThat(readItems, containsInAnyOrder(expectedOutputs.toArray(new Long[0]))); }
/**
 * Reads every element from a {@link Source.Reader} that has not been started yet, by delegating
 * to {@code readRemainingFromReader} with the "started" flag set to false.
 *
 * @param reader the unstarted reader to read from
 * @return the elements returned by {@code readRemainingFromReader}
 */
public static <T> List<T> readFromUnstartedReader(Source.Reader<T> reader) throws IOException {
  final boolean alreadyStarted = false;
  return readRemainingFromReader(reader, alreadyStarted);
}
p.apply(Read.from(SourceTestUtils.toUnsplittableSource(CountingSource.upTo(10L)))); AppliedPTransform<?, ?, ?> transform = DirectGraphs.getProducer(read);
assertEquals(expected, SourceTestUtils.readFromSource(source, options)); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
/** Unit tests of splitAtFraction. */ @Test public void testReadingSplitAtFraction() throws Exception { final String table = "TEST-SPLIT-AT-FRACTION"; final int numRows = 10; final int numSamples = 1; final long bytesPerRow = 1L; makeTableData(table, numRows); service.setupSampleRowKeys(table, numSamples, bytesPerRow); BigtableSource source = new BigtableSource( config.withTableId(ValueProvider.StaticValueProvider.of(table)), null, Arrays.asList(service.getTableRange(table)), null); // With 0 items read, all split requests will fail. assertSplitAtFractionFails(source, 0, 0.1, null /* options */); assertSplitAtFractionFails(source, 0, 1.0, null /* options */); // With 1 items read, all split requests past 1/10th will succeed. assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.333, null /* options */); assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.666, null /* options */); // With 3 items read, all split requests past 3/10ths will succeed. assertSplitAtFractionFails(source, 3, 0.2, null /* options */); assertSplitAtFractionSucceedsAndConsistent(source, 3, 0.571, null /* options */); assertSplitAtFractionSucceedsAndConsistent(source, 3, 0.9, null /* options */); // With 6 items read, all split requests past 6/10ths will succeed. assertSplitAtFractionFails(source, 6, 0.5, null /* options */); assertSplitAtFractionSucceedsAndConsistent(source, 6, 0.7, null /* options */); }