/**
 * Builds a {@code CompressedSource} that covers a byte range of one concrete file. Called from
 * {@link CompressedSource#createForSubrangeOfFile}.
 *
 * <p>After the fields are set, the source is asked whether it is splittable. A source that is not
 * splittable is only accepted when {@code startOffset} is 0.
 *
 * @param sourceDelegate the source that produces the underlying single-file reader
 * @param channelFactory the factory stored for decompressing the file's channel
 * @param metadata metadata of the file, forwarded to the superclass constructor
 * @param minBundleSize minimum bundle size, forwarded to the superclass constructor
 * @param startOffset first offset of the range, forwarded to the superclass constructor
 * @param endOffset end offset of the range, forwarded to the superclass constructor
 * @throws RuntimeException if the splittability check itself throws
 * @throws IllegalArgumentException if the source is not splittable and {@code startOffset} is
 *     not 0
 */
private CompressedSource(
    FileBasedSource<T> sourceDelegate,
    DecompressingChannelFactory channelFactory,
    Metadata metadata,
    long minBundleSize,
    long startOffset,
    long endOffset) {
  super(metadata, minBundleSize, startOffset, endOffset);
  this.sourceDelegate = sourceDelegate;
  this.channelFactory = channelFactory;

  // The probe relies on the two fields assigned above, so it has to run after them.
  final boolean canSplit;
  try {
    canSplit = isSplittable();
  } catch (Exception e) {
    throw new RuntimeException("Failed to determine if the source is splittable", e);
  }

  // A source that cannot be split may only be read from the very beginning.
  final boolean offsetIsAcceptable = canSplit || startOffset == 0;
  checkArgument(
      offsetIsAcceptable,
      "CompressedSources must start reading at offset 0. Requested offset: %s",
      startOffset);
}
/**
 * Creates a {@code FileBasedReader} for reading a single file.
 *
 * <p>The reader is always obtained from the delegate source. When this source reports itself as
 * splittable, the delegate's reader is returned unwrapped; otherwise it is wrapped in a
 * {@code CompressedReader} bound to this source.
 *
 * @param options pipeline options handed through to the delegate source
 * @return the delegate's reader, either as-is or wrapped in a {@code CompressedReader}
 */
@Override
protected final FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
  // Decide first, then build the delegate reader, keeping the original call order.
  final boolean readDirectly = isSplittable();
  final FileBasedReader<T> delegateReader = sourceDelegate.createSingleFileReader(options);
  if (!readDirectly) {
    return new CompressedReader<>(this, delegateReader);
  }
  return delegateReader;
}
/** Test splittability of files in AUTO mode. */ @Test public void testAutoSplittable() throws Exception { CompressedSource<Byte> source; // GZip files are not splittable source = CompressedSource.from(new ByteSource("input.gz", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.GZ", 1)); assertFalse(source.isSplittable()); // BZ2 files are not splittable source = CompressedSource.from(new ByteSource("input.bz2", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.BZ2", 1)); assertFalse(source.isSplittable()); // ZIP files are not splittable source = CompressedSource.from(new ByteSource("input.zip", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.ZIP", 1)); assertFalse(source.isSplittable()); // DEFLATE files are not splittable source = CompressedSource.from(new ByteSource("input.deflate", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.DEFLATE", 1)); assertFalse(source.isSplittable()); // Other extensions are assumed to be splittable. source = CompressedSource.from(new ByteSource("input.txt", 1)); assertTrue(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.csv", 1)); assertTrue(source.isSplittable()); }
/** Writes a real GZIP file named {@code *.gz} and checks that its source is not splittable. */
@Test
public void testGzipFileIsNotSplittable() throws Exception {
  File gzFile = tmpFolder.newFile("test-input.gz");
  writeFile(gzFile, generateInput(10), CompressionMode.GZIP);

  ByteSource delegate = new ByteSource(gzFile.getPath(), 1);
  CompressedSource<Byte> gzSource = CompressedSource.from(delegate);
  assertFalse(gzSource.isSplittable());
}
/** Writes a real BZIP2 file named {@code *.bz2} and checks that its source is not splittable. */
@Test
public void testBzip2FileIsNotSplittable() throws Exception {
  File bz2File = tmpFolder.newFile("test-input.bz2");
  writeFile(bz2File, generateInput(10), CompressionMode.BZIP2);

  ByteSource delegate = new ByteSource(bz2File.getPath(), 1);
  CompressedSource<Byte> bz2Source = CompressedSource.from(delegate);
  assertFalse(bz2Source.isSplittable());
}
/** Test splittability of files in GZIP mode -- none should be splittable. */ @Test public void testGzipSplittable() throws Exception { CompressedSource<Byte> source; // GZip files are not splittable source = CompressedSource.from(new ByteSource("input.gz", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.GZ", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); // Other extensions are also not splittable. source = CompressedSource.from(new ByteSource("input.txt", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.csv", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); }
/**
 * Writes raw bytes to a {@code *.bin} file and checks that a source created without an explicit
 * decompression mode is splittable, then runs the exhaustive split-at-fraction check on it.
 */
@Test
public void testUncompressedFileWithAutoIsSplittable() throws Exception {
  File plainFile = tmpFolder.newFile("test-input.bin");
  Files.write(generateInput(10), plainFile);

  ByteSource delegate = new ByteSource(plainFile.getPath(), 1);
  CompressedSource<Byte> plainSource = CompressedSource.from(delegate);

  assertTrue(plainSource.isSplittable());
  SourceTestUtils.assertSplitAtFractionExhaustive(plainSource, PipelineOptionsFactory.create());
}
/**
 * Writes raw bytes to a {@code *.bin} file and checks that a source explicitly configured with
 * {@code CompressionMode.UNCOMPRESSED} is splittable, then runs the exhaustive split-at-fraction
 * check on it.
 */
@Test
public void testUncompressedFileWithUncompressedIsSplittable() throws Exception {
  File plainFile = tmpFolder.newFile("test-input.bin");
  Files.write(generateInput(10), plainFile);

  CompressedSource<Byte> autoSource =
      CompressedSource.from(new ByteSource(plainFile.getPath(), 1));
  CompressedSource<Byte> plainSource =
      autoSource.withDecompression(CompressionMode.UNCOMPRESSED);

  assertTrue(plainSource.isSplittable());
  SourceTestUtils.assertSplitAtFractionExhaustive(plainSource, PipelineOptionsFactory.create());
}