/**
 * Asserts that the {@code source}'s reader either fails to {@code splitAtFraction(splitFraction)}
 * after reading {@code numItemsToReadBeforeSplit} items, or succeeds in a way that is consistent
 * according to {@link #assertSplitAtFractionSucceedsAndConsistent}.
 *
 * <p>The full contents of {@code source} are first read with {@code readFromSource} and handed to
 * {@code assertSplitAtFractionBehaviorImpl} together with the remaining arguments.
 *
 * @param source the source whose reader is exercised
 * @param numItemsToReadBeforeSplit number of items to read before the split is attempted
 * @param splitFraction the fraction passed to {@code splitAtFraction}
 * @param expectedOutcome the outcome the caller expects; forwarded unchanged to the implementation
 * @param options pipeline options used both for the reference read and for the check itself
 * @return the {@code SplitAtFractionResult} produced by the implementation
 * @throws Exception if reading the source or performing the check fails
 */
public static <T> SplitAtFractionResult assertSplitAtFractionBehavior(
    BoundedSource<T> source,
    int numItemsToReadBeforeSplit,
    double splitFraction,
    ExpectedSplitOutcome expectedOutcome,
    PipelineOptions options)
    throws Exception {
  // Reference read of the whole source, used by the implementation as the expected items.
  List<T> referenceItems = readFromSource(source, options);
  return assertSplitAtFractionBehaviorImpl(
      source, referenceItems, numItemsToReadBeforeSplit, splitFraction, expectedOutcome, options);
}
/**
 * Splits {@code source} with the given desired bundle size and reads every resulting split.
 *
 * @param source the source to split
 * @param desiredBundleSizeBytes bundle size passed to {@code source.split}
 * @param options pipeline options used for splitting and for reading each split
 * @return the items of all splits, concatenated in the order the splits were returned
 * @throws Exception if splitting or reading fails
 */
public static <T> List<T> readFromSplitsOfSource(
    BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  List<T> allItems = Lists.newArrayList();
  List<? extends BoundedSource<T>> bundles = source.split(desiredBundleSizeBytes, options);
  for (BoundedSource<T> bundle : bundles) {
    List<T> bundleItems = readFromSource(bundle, options);
    allItems.addAll(bundleItems);
  }
  return allItems;
}
/**
 * Asserts that reading the source reported by an unstarted {@code reader} yields the same records
 * (in any order) as reading the reader itself.
 *
 * <p>Records are compared through structural values built with the source's output coder.
 *
 * @param reader an unstarted reader; it is fully read by this method
 * @param options pipeline options used to read the reader's current source
 * @throws Exception if reading from the reader or from its source fails
 */
public static <T> void assertUnstartedReaderReadsSameAsItsSource(
    BoundedSource.BoundedReader<T> reader, PipelineOptions options) throws Exception {
  Coder<T> outputCoder = reader.getCurrentSource().getOutputCoder();

  // The reader is drained first; its current source is looked up again only afterwards.
  List<T> fromReader = readFromUnstartedReader(reader);
  List<T> fromSource = readFromSource(reader.getCurrentSource(), options);

  Object[] wanted = createStructuralValues(outputCoder, fromReader).toArray();
  List<ReadableStructuralValue<T>> got = createStructuralValues(outputCoder, fromSource);
  assertThat(got, containsInAnyOrder(wanted));
}
/**
 * Builds a {@code TextSource} from {@code data}, reads it, and asserts that the lines read match
 * {@code expectedResults} in any order.
 *
 * @param data raw bytes handed to {@code prepareSource}
 * @param expectedResults the lines the source is expected to produce
 * @throws Exception if preparing or reading the source fails
 */
private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception {
  TextSource textSource = prepareSource(data);
  List<String> linesRead =
      SourceTestUtils.readFromSource(textSource, PipelineOptionsFactory.create());

  String[] expectedLines = new ArrayList<>(expectedResults).toArray(new String[0]);
  assertThat(linesRead, containsInAnyOrder(expectedLines));
}
}
/**
 * Reads one file through three sources whose ranges (0-52, 52-72, 72-{@code Long.MAX_VALUE}) are
 * contiguous and checks that together they yield exactly the written records.
 */
@Test
public void testReadRangeAtMiddle() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);

  String fileName = "file";
  File file = createFileWithData(fileName, data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

  // Presumably (metadata, minBundleSize, startOffset, endOffset, header) - confirm against
  // TestFileBasedSource.
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 52, null);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 52, 72, null);
  TestFileBasedSource source3 = new TestFileBasedSource(metadata, 64, 72, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));
  results.addAll(readFromSource(source3, options));

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(results, containsInAnyOrder(data.toArray()));
}
@Test public void testReadRangeFromFileWithSplitsFromMiddle() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 10; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 42, header); TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 42, 112, header); TestFileBasedSource source3 = new TestFileBasedSource(metadata, 64, 112, Long.MAX_VALUE, header); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data); // Remove all occurrences of header from expected results. expectedResults.removeAll(Collections.singletonList(header)); List<String> results = new ArrayList<>(); results.addAll(readFromSource(source1, options)); results.addAll(readFromSource(source2, options)); results.addAll(readFromSource(source3, options)); assertThat(expectedResults, containsInAnyOrder(results.toArray())); }
/**
 * Reads one file through two sources whose ranges (0-25 and 25-{@code Long.MAX_VALUE}) are
 * contiguous and checks that together they yield exactly the written records.
 */
@Test
public void testReadRangeAtStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);

  String fileName = "file";
  File file = createFileWithData(fileName, data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 25, null);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 25, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(results, containsInAnyOrder(data.toArray()));
}
/**
 * A source built without an explicit {@code EmptyMatchTreatment} over a path that does not exist
 * is expected to raise {@link FileNotFoundException} when read.
 */
@Test
public void testEmptyFilepatternTreatmentDefaultDisallow() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  String missingPath = new File(tempFolder.getRoot(), "doesNotExist").getPath();
  TestFileBasedSource source = new TestFileBasedSource(missingPath, 64, null);

  thrown.expect(FileNotFoundException.class);
  readFromSource(source, options);
}
/**
 * Reads one file through two sources whose ranges (0-162 and 162-{@code Long.MAX_VALUE}) are
 * contiguous and checks that together they yield exactly the written records.
 */
@Test
public void testReadRangeAtEnd() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);

  String fileName = "file";
  File file = createFileWithData(fileName, data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 162, null);
  // The second source is built with 1024 rather than 64 as its second argument.
  TestFileBasedSource source2 =
      new TestFileBasedSource(metadata, 1024, 162, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(results, containsInAnyOrder(data.toArray()));
}
/**
 * With {@code EmptyMatchTreatment.ALLOW_IF_WILDCARD} and a path that contains no wildcard and
 * does not exist, reading is expected to raise {@link FileNotFoundException}.
 */
@Test
public void testEmptyFilepatternTreatmentAllowIfWildcard() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  String missingPath = new File(tempFolder.getRoot(), "doesNotExist").getPath();
  TestFileBasedSource source =
      new TestFileBasedSource(missingPath, EmptyMatchTreatment.ALLOW_IF_WILDCARD, 64, null);

  thrown.expect(FileNotFoundException.class);
  readFromSource(source, options);
}
/**
 * Reading a non-matching path yields no records when the source uses
 * {@code EmptyMatchTreatment.ALLOW}, and likewise when it uses {@code ALLOW_IF_WILDCARD} with a
 * path ending in {@code *}.
 */
@Test
public void testEmptyFilepatternTreatmentAllow() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  String plainPath = new File(tempFolder.getRoot(), "doesNotExist").getPath();
  TestFileBasedSource source =
      new TestFileBasedSource(plainPath, EmptyMatchTreatment.ALLOW, 64, null);

  String wildcardPath = new File(tempFolder.getRoot(), "doesNotExist*").getPath();
  TestFileBasedSource sourceWithWildcard =
      new TestFileBasedSource(wildcardPath, EmptyMatchTreatment.ALLOW_IF_WILDCARD, 64, null);

  assertEquals(0, readFromSource(source, options).size());
  assertEquals(0, readFromSource(sourceWithWildcard, options).size());
}
@Test public void testReadEverythingFromFileWithSplits() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 10; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, header); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data); // Remove all occurrences of header from expected results. expectedResults.removeAll(Collections.singletonList(header)); assertEquals(expectedResults, readFromSource(source, options)); }
@Test public void testReadAllSplitsOfSingleFile() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); List<String> data = createStringDataset(3, 50); String fileName = "file"; File file = createFileWithData(fileName, data); TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 16, null); List<? extends BoundedSource<String>> sources = source.split(32, null); // Not a trivial split. assertTrue(sources.size() > 1); List<String> results = new ArrayList<>(); for (BoundedSource<String> split : sources) { results.addAll(readFromSource(split, options)); } assertThat(data, containsInAnyOrder(results.toArray())); }
/**
 * Creates {@code file1}, {@code file2}, {@code file3} and {@code otherfile}, reads the pattern
 * {@code file*} in their directory, and expects exactly the records of the first three files.
 */
@Test
public void testFullyReadFilePattern() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data1 = createStringDataset(3, 50);
  File file1 = createFileWithData("file1", data1);

  List<String> data2 = createStringDataset(3, 50);
  createFileWithData("file2", data2);

  List<String> data3 = createStringDataset(3, 50);
  createFileWithData("file3", data3);

  // Written but not part of the expected results.
  List<String> data4 = createStringDataset(3, 50);
  createFileWithData("otherfile", data4);

  TestFileBasedSource source =
      new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);

  List<String> expectedResults = new ArrayList<>();
  expectedResults.addAll(data1);
  expectedResults.addAll(data2);
  expectedResults.addAll(data3);

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(readFromSource(source, options), containsInAnyOrder(expectedResults.toArray()));
}
/**
 * Writes random records to {@code tmp.seq}, checks that the source's estimated size equals the
 * file length, and checks that reading the source yields exactly the written records.
 */
@Test
public void testFullyReadSingleFile() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10, 0);
  File file = createFileWithData("tmp.seq", expectedResults);

  HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source =
      HDFSFileSource.from(
          file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

  assertEquals(file.length(), source.getEstimatedSizeBytes(null));

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(readFromSource(source, options), containsInAnyOrder(expectedResults.toArray()));
}
/**
 * Writes a dataset to a single file and expects a source over that file to return the same
 * records in the same order.
 */
@Test
public void testFullyReadSingleFile() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);

  String path = file.getPath();
  TestFileBasedSource source = new TestFileBasedSource(path, 64, null);

  List<String> actual = readFromSource(source, options);
  assertEquals(data, actual);
}
/**
 * Like a full file-pattern read, except that {@code file1} is written with no records: reading
 * {@code file*} is expected to yield exactly the records of {@code file2} and {@code file3}.
 */
@Test
public void testFullyReadFilePatternFirstRecordEmpty() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  File file1 = createFileWithData("file1", new ArrayList<>());

  // Build the glob through File rather than with a hard-coded "/" separator.
  String pattern = new File(file1.getParent(), "file*").getPath();

  List<String> data2 = createStringDataset(3, 50);
  createFileWithData("file2", data2);

  List<String> data3 = createStringDataset(3, 50);
  createFileWithData("file3", data3);

  // Written but not part of the expected results.
  List<String> data4 = createStringDataset(3, 50);
  createFileWithData("otherfile", data4);

  TestFileBasedSource source = new TestFileBasedSource(pattern, 64, null);

  List<String> expectedResults = new ArrayList<>();
  expectedResults.addAll(data2);
  expectedResults.addAll(data3);

  // Actual value first, expectation in the matcher, so a failure report is labelled correctly.
  assertThat(readFromSource(source, options), containsInAnyOrder(expectedResults.toArray()));
}
@Test public void testSplits() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0); File file = createFileWithData("tmp.seq", expectedResults); HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from( file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class); // Assert that the source produces the expected records assertEquals(expectedResults, readFromSource(source, options)); // Split with a small bundle size (has to be at least size of sync interval) List<? extends BoundedSource<KV<IntWritable, Text>>> splits = source .splitIntoBundles(SequenceFile.SYNC_INTERVAL, options); assertTrue(splits.size() > 2); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); int nonEmptySplits = 0; for (BoundedSource<KV<IntWritable, Text>> subSource : splits) { if (readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; } } assertTrue(nonEmptySplits > 2); }
/**
 * Writes three GZIP-compressed files sharing a name prefix, reads them through one
 * {@code CompressedSource} over the matching file pattern, and compares the bytes read with the
 * bytes written as multisets.
 */
@Test
public void testCompressedReadMultipleFiles() throws Exception {
  int numFiles = 3;
  String baseName = "test_input-";
  String filePattern = new File(tmpFolder.getRoot().toString(), baseName + "*").toString();

  List<Byte> expected = new ArrayList<>();
  for (int fileIndex = 0; fileIndex < numFiles; fileIndex++) {
    byte[] payload = generateInput(100);
    File target = tmpFolder.newFile(baseName + fileIndex);
    writeFile(target, payload, CompressionMode.GZIP);
    expected.addAll(Bytes.asList(payload));
  }

  ByteSource rawSource = new ByteSource(filePattern, 1);
  CompressedSource<Byte> source =
      CompressedSource.from(rawSource).withDecompression(CompressionMode.GZIP);

  List<Byte> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create());

  // Multiset comparison: order is ignored, multiplicity is not.
  assertEquals(HashMultiset.create(expected), HashMultiset.create(actual));
}
/**
 * Wraps a counting source with {@code SourceTestUtils.toUnsplittableSource} and checks that it
 * splits into exactly itself, that its reader returns {@code null} from
 * {@code splitAtFraction(0.5)} after 40 items, and that reading to the end yields the same 100
 * items as the base source with a consumed fraction of 1.
 */
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedSource<Long> baseSource = CountingSource.upTo(100);
  BoundedSource<Long> unsplittableSource = SourceTestUtils.toUnsplittableSource(baseSource);

  // Splitting returns a single element equal to the wrapped source itself.
  List<?> bundles = unsplittableSource.split(1, options);
  assertEquals(1, bundles.size());
  assertEquals(unsplittableSource, bundles.get(0));

  BoundedReader<Long> unsplittableReader = unsplittableSource.createReader(options);
  assertEquals(0, unsplittableReader.getFractionConsumed(), 1e-15);

  Set<Long> expected = Sets.newHashSet(SourceTestUtils.readFromSource(baseSource, options));

  Set<Long> actual = Sets.newHashSet();
  List<Long> firstForty = SourceTestUtils.readNItemsFromUnstartedReader(unsplittableReader, 40);
  actual.addAll(firstForty);

  // The split attempt in the middle of reading must return null.
  assertNull(unsplittableReader.splitAtFraction(0.5));

  List<Long> remainder =
      SourceTestUtils.readRemainingFromReader(unsplittableReader, true /* started */);
  actual.addAll(remainder);

  assertEquals(1, unsplittableReader.getFractionConsumed(), 1e-15);
  assertEquals(100, actual.size());
  assertEquals(Sets.newHashSet(expected), Sets.newHashSet(actual));
}
}