/**
 * Tests if the expected sequence and amount of data can be read.
 *
 * <p>After every successfully read record (unless the split is exhausted) the current state is
 * captured, a fresh input format is created and reopened from that state. The test therefore
 * also exercises resuming a split from a previously captured position.
 *
 * @throws Exception if creating, opening, reading or closing the input format fails
 */
@Test
public void checkRead() throws Exception {
    BinaryInputFormat<T> input = this.createInputFormat();
    FileInputSplit[] inputSplits = input.createInputSplits(0);
    Arrays.sort(inputSplits, new InputSplitSorter());

    int readCount = 0;
    for (FileInputSplit inputSplit : inputSplits) {
        input.open(inputSplit);
        try {
            input.reopen(inputSplit, input.getCurrentState());
            T record = createInstance();
            while (!input.reachedEnd()) {
                if (input.nextRecord(record) != null) {
                    this.checkEquals(this.getRecord(readCount), record);
                    if (!input.reachedEnd()) {
                        Tuple2<Long, Long> state = input.getCurrentState();
                        // Release the stream of the format that is about to be replaced;
                        // otherwise every record read leaks one open file handle.
                        input.close();
                        input = this.createInputFormat();
                        input.reopen(inputSplit, state);
                    }
                    readCount++;
                }
            }
        } finally {
            // Closes whichever format instance is current when the split is done (or failed).
            input.close();
        }
    }
    Assert.assertEquals(this.numberOfTuples, readCount);
}
/**
 * Verifies that the total input size reported by the statistics equals the summed size of
 * two input files made of 3 and 5 blocks respectively.
 */
@Test
public void testGetStatisticsMultiplePaths() throws IOException {
    final int numBlocks1 = 3;
    final int numBlocks2 = 5;
    // each block holds the block info plus 8 additional bytes
    final int blockSize = new BlockInfo().getInfoSize() + 8;

    final File firstFile = createBinaryInputFile("binary_input_format_test", blockSize, numBlocks1);
    final File secondFile = createBinaryInputFile("binary_input_format_test_2", blockSize, numBlocks2);
    final String firstPath = firstFile.toURI().toString();
    final String secondPath = secondFile.toURI().toString();

    final BinaryInputFormat<Record> inputFormat = new MyBinaryInputFormat();
    inputFormat.setFilePaths(firstPath, secondPath);
    inputFormat.setBlockSize(blockSize);

    final BaseStatistics stats = inputFormat.getStatistics(null);
    final int expectedTotalSize = blockSize * (numBlocks1 + numBlocks2);
    Assert.assertEquals("The file size statistics is wrong", expectedTotalSize, stats.getTotalInputSize());
}
/**
 * Opens the given split and restores the reading position from a previously captured state.
 *
 * <p>The state is only restored after {@code open(split)} has completed successfully. Doing
 * the restoration in a {@code finally} block would run it against an unopened stream when
 * {@code open} fails, and the follow-up failure would hide the original exception.
 *
 * @param split the split to read; must not be null
 * @param state the state to restore; {@code f0} is the offset (relative to the split start)
 *              the stream is positioned at, {@code f1} is assigned to the read-record counter;
 *              must not be null
 * @throws IOException if opening the split, reading the block info or seeking fails
 */
@PublicEvolving
@Override
public void reopen(FileInputSplit split, Tuple2<Long, Long> state) throws IOException {
    Preconditions.checkNotNull(split, "reopen() cannot be called on a null split.");
    Preconditions.checkNotNull(state, "reopen() cannot be called with a null initial state.");

    this.open(split);

    this.blockInfo = this.createAndReadBlockInfo();

    long blockPos = state.f0;
    this.readRecords = state.f1;

    // position the stream at the restored offset inside the split
    this.stream.seek(this.splitStart + blockPos);
    this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockPos, this.splitLength);
    this.dataInputStream = new DataInputViewStreamWrapper(blockBasedInput);
}
}
/**
 * Reads the next record from the current input.
 *
 * @param record the object handed to {@code deserialize} for reuse
 * @return the deserialized record, or {@code null} if the end of the input has been reached
 * @throws IOException if deserialization fails
 */
@Override
public T nextRecord(T record) throws IOException {
    if (!this.reachedEnd()) {
        final T next = this.deserialize(record, this.dataInputStream);
        // count the record only after it was deserialized successfully
        this.readRecords++;
        return next;
    }
    return null;
}
final FileBaseStatistics stats = getFileStats(cachedFileStats, getFilePaths(), allFiles); if (stats == null) { return null; return (SequentialStatistics) stats; return createStatistics(allFiles, stats); } catch (IOException ioex) { if (LOG.isWarnEnabled()) { LOG.warn( String.format("Could not determine complete statistics for files '%s' due to an I/O error", Arrays.toString(getFilePaths())), ioex); LOG.error( String.format("Unexpected problem while getting the file statistics for files '%s'", Arrays.toString(getFilePaths())), t);
@Test public void testCreateInputSplitsWithOneFile() throws IOException { // create temporary file with 3 blocks final File tempFile = File.createTempFile("binary_input_format_test", "tmp"); tempFile.deleteOnExit(); final int blockInfoSize = new BlockInfo().getInfoSize(); final int blockSize = blockInfoSize + 8; final int numBlocks = 3; FileOutputStream fileOutputStream = new FileOutputStream(tempFile); for(int i = 0; i < blockSize * numBlocks; i++) { fileOutputStream.write(new byte[]{1}); } fileOutputStream.close(); final Configuration config = new Configuration(); config.setLong("input.block_size", blockSize + 10); final BinaryInputFormat<Record> inputFormat = new MyBinaryInputFormat(); inputFormat.setFilePath(tempFile.toURI().toString()); inputFormat.setBlockSize(blockSize); inputFormat.configure(config); FileInputSplit[] inputSplits = inputFormat.createInputSplits(numBlocks); Assert.assertEquals("Returns requested numbers of splits.", numBlocks, inputSplits.length); Assert.assertEquals("1. split has block size length.", blockSize, inputSplits[0].getLength()); Assert.assertEquals("2. split has block size length.", blockSize, inputSplits[1].getLength()); Assert.assertEquals("3. split has block size length.", blockSize, inputSplits[2].getLength()); }
inputFormat.setFilePaths(pathFile1, pathFile2); inputFormat.setBlockSize(blockSize); FileInputSplit[] inputSplits = inputFormat.createInputSplits(numBlocksTotal);
/**
 * Creates the input splits without requesting a minimum number of splits.
 *
 * @return the splits produced by {@code createInputSplits(0)}
 * @throws IOException if the splits cannot be created
 */
protected FileInputSplit[] getInputSplits() throws IOException {
    final int noMinimum = 0;
    final FileInputSplit[] splits = this.createInputSplits(noMinimum);
    return splits;
}
@Override public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { final List<FileStatus> files = this.getFiles(); final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits); for (FileStatus file : files) { final FileSystem fs = file.getPath().getFileSystem(); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize; for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) { long remainingLength = Math.min(pos + blockSize, length) - pos; // get the block locations and make sure they are in order with respect to their offset final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength); Arrays.sort(blocks); inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts())); } } if (inputSplits.size() < minNumSplits) { LOG.warn(String.format( "With the given block size %d, the files %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, Arrays.toString(getFilePaths()), minNumSplits)); FileStatus last = files.get(files.size() - 1); final BlockLocation[] blocks = last.getPath().getFileSystem().getFileBlockLocations(last, 0, last.getLen()); for (int index = files.size(); index < minNumSplits; index++) { inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts())); } } return inputSplits.toArray(new FileInputSplit[inputSplits.size()]); }
@Override public void configure(Configuration parameters) { super.configure(parameters); // the if is to prevent the configure() method from // overwriting the value set by the setter if (this.blockSize == NATIVE_BLOCK_SIZE) { long blockSize = parameters.getLong(BLOCK_SIZE_PARAMETER_KEY, NATIVE_BLOCK_SIZE); setBlockSize(blockSize); } }
@Override public void open(FileInputSplit split) throws IOException { super.open(split); this.blockInfo = this.createAndReadBlockInfo(); // We set the size of the BlockBasedInput to splitLength as each split contains one block. // After reading the block info, we seek in the file to the correct position. this.readRecords = 0; this.stream.seek(this.splitStart + this.blockInfo.getFirstRecordStart()); this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockInfo.getFirstRecordStart(), this.splitLength); this.dataInputStream = new DataInputViewStreamWrapper(blockBasedInput); }
protected List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits List<FileStatus> files = new ArrayList<>(); for (Path filePath: getFilePaths()) { final FileSystem fs = filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(filePath); for (FileStatus partial : partials) { if (!partial.isDir()) { files.add(partial); } } } else { files.add(pathFile); } } return files; }
final FileBaseStatistics stats = getFileStats(cachedFileStats, getFilePaths(), allFiles); if (stats == null) { return null; return (SequentialStatistics) stats; return createStatistics(allFiles, stats); } catch (IOException ioex) { if (LOG.isWarnEnabled()) { LOG.warn( String.format("Could not determine complete statistics for files '%s' due to an I/O error", Arrays.toString(getFilePaths())), ioex); LOG.error( String.format("Unexpected problem while getting the file statistics for files '%s'", Arrays.toString(getFilePaths())), t);
/**
 * Reads the next record from the current input.
 *
 * @param record the object handed to {@code deserialize} for reuse
 * @return the deserialized record, or {@code null} if the end of the input has been reached
 * @throws IOException if deserialization fails
 */
@Override
public T nextRecord(T record) throws IOException {
    if (!this.reachedEnd()) {
        final T next = this.deserialize(record, this.dataInputStream);
        // count the record only after it was deserialized successfully
        this.readRecords++;
        return next;
    }
    return null;
}
/**
 * Checks if the expected input splits were created.
 *
 * <p>The sorted splits are grouped by path. For each group the number of splits, the start
 * offset of every split and the length of every split are compared with the expected values.
 */
@Test
public void checkInputSplits() throws IOException {
    FileInputSplit[] inputSplits = this.createInputFormat().createInputSplits(0);
    Arrays.sort(inputSplits, new InputSplitSorter());

    int cursor = 0;
    for (int fileIndex = 0; fileIndex < this.parallelism; fileIndex++) {
        // gather the consecutive run of splits that share the path of the split at the cursor
        final Path groupPath = inputSplits[cursor].getPath();
        final List<FileInputSplit> sameFileSplits = new ArrayList<FileInputSplit>();
        while (cursor < inputSplits.length && inputSplits[cursor].getPath().equals(groupPath)) {
            sameFileSplits.add(inputSplits[cursor]);
            cursor++;
        }

        Assert.assertEquals(this.getExpectedBlockCount(fileIndex), sameFileSplits.size());

        long lastBlockLength =
            this.rawDataSizes[fileIndex] % (this.blockSize - getInfoSize()) + getInfoSize();

        int position = 0;
        for (FileInputSplit current : sameFileSplits) {
            Assert.assertEquals(this.blockSize * position, current.getStart());
            // every split but the last one must span a full block
            if (position < sameFileSplits.size() - 1) {
                Assert.assertEquals(this.blockSize, current.getLength());
            }
            position++;
        }

        final FileInputSplit finalSplit = sameFileSplits.get(sameFileSplits.size() - 1);
        Assert.assertEquals(lastBlockLength, finalSplit.getLength());
    }
}
@Override public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { final List<FileStatus> files = this.getFiles(); final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits); for (FileStatus file : files) { final FileSystem fs = file.getPath().getFileSystem(); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize; for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) { long remainingLength = Math.min(pos + blockSize, length) - pos; // get the block locations and make sure they are in order with respect to their offset final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength); Arrays.sort(blocks); inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts())); } } if (inputSplits.size() < minNumSplits) { LOG.warn(String.format( "With the given block size %d, the files %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, Arrays.toString(getFilePaths()), minNumSplits)); FileStatus last = files.get(files.size() - 1); final BlockLocation[] blocks = last.getPath().getFileSystem().getFileBlockLocations(last, 0, last.getLen()); for (int index = files.size(); index < minNumSplits; index++) { inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts())); } } return inputSplits.toArray(new FileInputSplit[inputSplits.size()]); }
@Override public void configure(Configuration parameters) { super.configure(parameters); // the if is to prevent the configure() method from // overwriting the value set by the setter if (this.blockSize == NATIVE_BLOCK_SIZE) { long blockSize = parameters.getLong(BLOCK_SIZE_PARAMETER_KEY, NATIVE_BLOCK_SIZE); setBlockSize(blockSize); } }
@Override public void open(FileInputSplit split) throws IOException { super.open(split); this.blockInfo = this.createAndReadBlockInfo(); // We set the size of the BlockBasedInput to splitLength as each split contains one block. // After reading the block info, we seek in the file to the correct position. this.readRecords = 0; this.stream.seek(this.splitStart + this.blockInfo.getFirstRecordStart()); this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockInfo.getFirstRecordStart(), this.splitLength); this.dataInputStream = new DataInputViewStreamWrapper(blockBasedInput); }
protected List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits List<FileStatus> files = new ArrayList<>(); for (Path filePath: getFilePaths()) { final FileSystem fs = filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(filePath); for (FileStatus partial : partials) { if (!partial.isDir()) { files.add(partial); } } } else { files.add(pathFile); } } return files; }
final FileBaseStatistics stats = getFileStats(cachedFileStats, getFilePaths(), allFiles); if (stats == null) { return null; return (SequentialStatistics) stats; return createStatistics(allFiles, stats); } catch (IOException ioex) { if (LOG.isWarnEnabled()) { LOG.warn( String.format("Could not determine complete statistics for files '%s' due to an I/O error", Arrays.toString(getFilePaths())), ioex); LOG.error( String.format("Unexpected problem while getting the file statistics for files '%s'", Arrays.toString(getFilePaths())), t);