@Override public void validate() { super.validate(); switch (mode) { case FILEPATTERN: checkArgument( getStartOffset() == 0, "FileBasedSource is based on a file pattern or a full single file " + "but the starting offset proposed %s is not zero", getStartOffset()); checkArgument( getEndOffset() == Long.MAX_VALUE, "FileBasedSource is based on a file pattern or a full single file " + "but the ending offset proposed %s is not Long.MAX_VALUE", getEndOffset()); break; case SINGLE_FILE_OR_SUBRANGE: // Nothing more to validate. break; default: throw new IllegalStateException("Unknown mode: " + mode); } }
reader.sync(getCurrentSource().getStartOffset()); } catch (EOFException e) { LOG.debug("Found EOF when starting to read: " + getCurrentSource().getStartOffset()); eof = true; isFirstRecord = true; LOG.debug("startReading, offset: " + getCurrentSource().getStartOffset() + ", position: " + startOfNextRecord);
@Override public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException { // This implementation of method getEstimatedSizeBytes is provided to simplify subclasses. Here // we perform the size estimation of files and file patterns using the interface provided by // FileSystem. String fileOrPattern = fileOrPatternSpec.get(); if (mode == Mode.FILEPATTERN) { long totalSize = 0; List<Metadata> allMatches = FileSystems.match(fileOrPattern, emptyMatchTreatment).metadata(); for (Metadata metadata : allMatches) { totalSize += metadata.sizeBytes(); } LOG.info( "Filepattern {} matched {} files with total size {}", fileOrPattern, allMatches.size(), totalSize); return totalSize; } else { long start = getStartOffset(); long end = Math.min(getEndOffset(), getMaxEndOffset(options)); return end - start; } }
@Override protected final boolean startImpl() throws IOException { FileBasedSource<T> source = getCurrentSource(); this.channel = FileSystems.open(source.getSingleFileMetadata().resourceId()); if (channel instanceof SeekableByteChannel) { SeekableByteChannel seekChannel = (SeekableByteChannel) channel; seekChannel.position(source.getStartOffset()); } else { // Channel is not seekable. Must not be a subrange. checkArgument( source.mode != Mode.SINGLE_FILE_OR_SUBRANGE, "Subrange-based sources must only be defined for file types that support seekable " + " read channels"); checkArgument( source.getStartOffset() == 0, "Start offset %s is not zero but channel for reading the file is not seekable.", source.getStartOffset()); } startReading(channel); // Advance once to load the first record. return advanceImpl(); }
@Override protected void startReading(ReadableByteChannel channel) throws IOException { this.inChannel = channel; // If the first offset is greater than zero, we need to skip bytes until we see our // first delimiter. long startOffset = getCurrentSource().getStartOffset(); if (startOffset > 0) { checkState( channel instanceof SeekableByteChannel, "%s only supports reading from a SeekableByteChannel when given a start offset" + " greater than 0.", TextSource.class.getSimpleName()); long requiredPosition = startOffset - 1; if (delimiter != null && startOffset >= delimiter.length) { // we need to move back the offset of at worse delimiter.size to be sure to see // all the bytes of the delimiter in the call to findDelimiterBounds() below requiredPosition = startOffset - delimiter.length; } ((SeekableByteChannel) channel).position(requiredPosition); findDelimiterBounds(); buffer = buffer.substring(endOfDelimiterInBuffer); startOfNextRecord = requiredPosition + endOfDelimiterInBuffer; endOfDelimiterInBuffer = 0; startOfDelimiterInBuffer = 0; } }
/**
 * Creates a source for the range {@code [start, end)} of this source's single file by
 * delegating to {@code createForSubrangeOfFile}.
 *
 * <p>Preconditions, checked in this order: this source is not in {@code FILEPATTERN} mode;
 * {@code start} is not smaller than this source's start offset; {@code end} is not larger than
 * this source's end offset; and the single-file metadata is non-null. After creation, unless
 * {@code start <= 0} and {@code end == Long.MAX_VALUE}, the created source is additionally
 * required to be in {@code SINGLE_FILE_OR_SUBRANGE} mode.
 *
 * @param start start offset of the requested subrange
 * @param end end offset of the requested subrange
 * @return the source produced by {@code createForSubrangeOfFile}
 */
@Override
public final FileBasedSource<T> createSourceForSubrange(long start, long end) {
  checkArgument(
      mode != Mode.FILEPATTERN, "Cannot split a file pattern based source based on positions");

  long parentStart = getStartOffset();
  checkArgument(
      start >= parentStart,
      "Start offset value %s of the subrange cannot be smaller than the start offset value %s"
          + " of the parent source",
      start,
      parentStart);

  long parentEnd = getEndOffset();
  checkArgument(
      end <= parentEnd,
      "End offset value %s of the subrange cannot be larger than the end offset value %s",
      end,
      parentEnd);

  checkState(
      singleFileMetadata != null,
      "A single file source should not have null metadata: %s",
      this);

  FileBasedSource<T> subrangeSource = createForSubrangeOfFile(singleFileMetadata, start, end);

  // Only a range starting at (or before) zero with an unbounded end is exempt from the
  // mode check on the created source.
  boolean startsAtZeroAndUnbounded = start <= 0 && end == Long.MAX_VALUE;
  if (!startsAtZeroAndUnbounded) {
    checkArgument(
        subrangeSource.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
        "Source created for the range [%s,%s) must be a subrange source",
        start,
        end);
  }
  return subrangeSource;
}
@Override @Nullable public Double getFractionConsumed() { if (!isStarted()) { return 0.0; } if (isDone()) { return 1.0; } FileBasedSource<T> source = getCurrentSource(); if (source.getEndOffset() == Long.MAX_VALUE) { // Unknown end offset, so we cannot tell. return null; } long currentBlockOffset = getCurrentBlockOffset(); long startOffset = source.getStartOffset(); long endOffset = source.getEndOffset(); double fractionAtBlockStart = ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset); double fractionAtBlockEnd = ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset) / (endOffset - startOffset)); double blockFraction = getCurrentBlock().getFractionOfBlockConsumed(); return Math.min( 1.0, fractionAtBlockStart + blockFraction * (fractionAtBlockEnd - fractionAtBlockStart)); }
/**
 * Wraps the input channel in a {@code CountingChannel} constructed with the current source's
 * start offset, builds a decompressing channel on top of it, and passes that channel to the
 * delegate reader's {@link FileBasedReader#startReading(ReadableByteChannel)}.
 *
 * <p>The counting channel is created and stored in {@code this.channel} while holding {@code
 * progressLock}. When {@code channelFactory} is {@code CompressionMode.AUTO}, the compression
 * is detected from the current source's file-or-pattern spec; otherwise {@code channelFactory}
 * itself creates the decompressing channel.
 *
 * @param channel the raw input channel
 * @throws IOException declared by this method; creating the decompressing channel and
 *     starting the delegate reader are the operations performed here that may raise it
 */
@Override
protected final void startReading(ReadableByteChannel channel) throws IOException {
  final ReadableByteChannel countedChannel;
  synchronized (progressLock) {
    this.channel = new CountingChannel(channel, getCurrentSource().getStartOffset());
    countedChannel = this.channel;
  }

  final ReadableByteChannel decompressedChannel;
  if (channelFactory != CompressionMode.AUTO) {
    // An explicit factory was configured: let it build the decompressing channel.
    decompressedChannel = channelFactory.createDecompressingChannel(countedChannel);
  } else {
    // AUTO: pick the compression based on the file or pattern spec.
    decompressedChannel =
        Compression.detect(getCurrentSource().getFileOrPatternSpec())
            .readDecompressed(countedChannel);
  }
  readerDelegate.startReading(decompressedChannel);
}