@Override public void validate() { super.validate(); switch (mode) { case FILEPATTERN: checkArgument( getStartOffset() == 0, "FileBasedSource is based on a file pattern or a full single file " + "but the starting offset proposed %s is not zero", getStartOffset()); checkArgument( getEndOffset() == Long.MAX_VALUE, "FileBasedSource is based on a file pattern or a full single file " + "but the ending offset proposed %s is not Long.MAX_VALUE", getEndOffset()); break; case SINGLE_FILE_OR_SUBRANGE: // Nothing more to validate. break; default: throw new IllegalStateException("Unknown mode: " + mode); } }
/**
 * Returns the number of split points remaining.
 *
 * <p>Once reading has started and {@code startOfNextRecord} is at or beyond the end offset of
 * the current source, the answer is known exactly: {@code 0} when {@code isDone()} is true and
 * {@code 1} otherwise. In every other situation the superclass implementation decides.
 */
@Override
public long getSplitPointsRemaining() {
  boolean nextRecordPastEnd =
      isStarted() && startOfNextRecord >= getCurrentSource().getEndOffset();
  if (!nextRecordPastEnd) {
    return super.getSplitPointsRemaining();
  }
  if (isDone()) {
    return 0;
  }
  return 1;
}
@Override public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException { // This implementation of method getEstimatedSizeBytes is provided to simplify subclasses. Here // we perform the size estimation of files and file patterns using the interface provided by // FileSystem. String fileOrPattern = fileOrPatternSpec.get(); if (mode == Mode.FILEPATTERN) { long totalSize = 0; List<Metadata> allMatches = FileSystems.match(fileOrPattern, emptyMatchTreatment).metadata(); for (Metadata metadata : allMatches) { totalSize += metadata.sizeBytes(); } LOG.info( "Filepattern {} matched {} files with total size {}", fileOrPattern, allMatches.size(), totalSize); return totalSize; } else { long start = getStartOffset(); long end = Math.min(getEndOffset(), getMaxEndOffset(options)); return end - start; } }
@Override @Nullable public Double getFractionConsumed() { if (!isStarted()) { return 0.0; } if (isDone()) { return 1.0; } FileBasedSource<T> source = getCurrentSource(); if (source.getEndOffset() == Long.MAX_VALUE) { // Unknown end offset, so we cannot tell. return null; } long currentBlockOffset = getCurrentBlockOffset(); long startOffset = source.getStartOffset(); long endOffset = source.getEndOffset(); double fractionAtBlockStart = ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset); double fractionAtBlockEnd = ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset) / (endOffset - startOffset)); double blockFraction = getCurrentBlock().getFractionOfBlockConsumed(); return Math.min( 1.0, fractionAtBlockStart + blockFraction * (fractionAtBlockEnd - fractionAtBlockStart)); }
/**
 * Creates a source covering the offset range {@code [start, end)} of this source's single file.
 *
 * <p>The requested range must lie inside this source's own range, and this source must not be
 * in {@code FILEPATTERN} mode. The new source is produced by {@code createForSubrangeOfFile}
 * using this source's single-file metadata. Unless the requested range is the full range
 * (start not above {@code 0} and end equal to {@code Long.MAX_VALUE}), the created source is
 * additionally required to be in {@code SINGLE_FILE_OR_SUBRANGE} mode.
 *
 * @param start start offset of the subrange; must not be smaller than this source's start offset
 * @param end end offset of the subrange; must not be larger than this source's end offset
 * @return the source created for the requested subrange
 */
@Override
public final FileBasedSource<T> createSourceForSubrange(long start, long end) {
  checkArgument(
      mode != Mode.FILEPATTERN, "Cannot split a file pattern based source based on positions");

  long parentStart = getStartOffset();
  checkArgument(
      start >= parentStart,
      "Start offset value %s of the subrange cannot be smaller than the start offset value %s"
          + " of the parent source",
      start,
      parentStart);

  long parentEnd = getEndOffset();
  checkArgument(
      end <= parentEnd,
      "End offset value %s of the subrange cannot be larger than the end offset value %s",
      end,
      parentEnd);

  checkState(
      singleFileMetadata != null, "A single file source should not have null metadata: %s", this);

  FileBasedSource<T> subrangeSource = createForSubrangeOfFile(singleFileMetadata, start, end);

  // Only a request for the complete range is exempt from the mode check below.
  boolean isFullRange = start <= 0 && end == Long.MAX_VALUE;
  if (!isFullRange) {
    checkArgument(
        subrangeSource.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
        "Source created for the range [%s,%s) must be a subrange source",
        start,
        end);
  }
  return subrangeSource;
}