long footerLengthIndex = stat.getLen() - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length; stream.seek(footerLengthIndex); int footerLength = BytesUtils.readIntLittleEndian(stream); stream.seek(footerLengthIndex - footerLength); if (LOG.isInfoEnabled()) { LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException { long fileLen = file.getLength(); String filePath = file.toString(); LOG.debug("File length {}", fileLen); int FOOTER_LENGTH_SIZE = 4; if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")"); } long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; LOG.debug("reading footer index at {}", footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex); } f.seek(footerIndex); return converter.readParquetMetadata(f, options.getMetadataFilter()); }
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException { long fileLen = file.getLength(); String filePath = file.toString(); LOG.debug("File length {}", fileLen); int FOOTER_LENGTH_SIZE = 4; if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")"); } long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; LOG.debug("reading footer index at {}", footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex); } f.seek(footerIndex); return converter.readParquetMetadata(f, options.getMetadataFilter()); }
/**
 * Copy from a FS input stream to an output stream. Documented as thread-safe; the scratch
 * buffer is obtained through {@code COPY_BUFFER.get()}.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were copied
 */
private static void copy(SeekableInputStream from, PositionOutputStream to, long start, long length)
    throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds; the result is bounded by buffer.length,
    // so the narrowing cast cannot overflow.
    int bytesToRead = (int) Math.min(buffer.length, bytesLeft);
    int bytesRead = from.read(buffer, 0, bytesToRead);
    if (bytesRead < 0) {
      // Parenthesised so the absolute offset is summed numerically rather than having the
      // two numbers concatenated as text.
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
/**
 * Copy from a FS input stream to an output stream. Documented as thread-safe; the scratch
 * buffer is obtained through {@code COPY_BUFFER.get()}.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were copied
 */
private static void copy(SeekableInputStream from, PositionOutputStream to, long start, long length)
    throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds; the result is bounded by buffer.length,
    // so the narrowing cast cannot overflow.
    int bytesToRead = (int) Math.min(buffer.length, bytesLeft);
    int bytesRead = from.read(buffer, 0, bytesToRead);
    if (bytesRead < 0) {
      // Parenthesised so the absolute offset is summed numerically rather than having the
      // two numbers concatenated as text.
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
/**
 * Reads {@code length} bytes starting at {@code offset} into a list of buffers, each no larger
 * than {@code options.getMaxAllocationSize()}.
 *
 * @param f file to read the blocks from
 * @param offset position in the stream to seek to before reading
 * @param length total number of bytes to read
 * @return the ByteBuffer blocks, flipped after being filled
 * @throws IOException if there is an error while reading from the stream
 */
List<ByteBuffer> readBlocks(SeekableInputStream f, long offset, int length) throws IOException {
  f.seek(offset);

  int wholeChunks = length / options.getMaxAllocationSize();
  int tailSize = length % options.getMaxAllocationSize();
  int chunkCount = tailSize > 0 ? wholeChunks + 1 : wholeChunks;

  // Allocate everything up front: full-size chunks first, then the remainder (if any).
  List<ByteBuffer> blocks = new ArrayList<>(chunkCount);
  for (int i = 0; i < chunkCount; i++) {
    blocks.add(options.getAllocator().allocate(
        i < wholeChunks ? options.getMaxAllocationSize() : tailSize));
  }

  // Fill each buffer in order, then flip it.
  for (int i = 0; i < blocks.size(); i++) {
    ByteBuffer block = blocks.get(i);
    f.readFully(block);
    block.flip();
  }
  return blocks;
}
/**
 * Reads the offset index of a column chunk by seeking to the offset recorded in its
 * offset-index reference.
 *
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference offsetIndexRef = column.getOffsetIndexReference();
  if (offsetIndexRef != null) {
    f.seek(offsetIndexRef.getOffset());
    return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
  }
  // No reference recorded for this chunk.
  return null;
}
/**
 * Reads the column index of a column chunk by seeking to the offset recorded in its
 * column-index reference.
 *
 * @param column
 *          the column chunk which the column index is to be returned for
 * @return the column index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference columnIndexRef = column.getColumnIndexReference();
  if (columnIndexRef != null) {
    f.seek(columnIndexRef.getOffset());
    return ParquetMetadataConverter.fromParquetColumnIndex(
        column.getPrimitiveType(),
        Util.readColumnIndex(f));
  }
  // No reference recorded for this chunk.
  return null;
}
/** * Reads and decompresses a dictionary page for the given column chunk. * * Returns null if the given column chunk has no dictionary page. * * @param meta a column's ColumnChunkMetaData to read the dictionary from * @return an uncompressed DictionaryPage or null * @throws IOException if there is an error while reading the dictionary */ DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException { if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) { return null; } // TODO: this should use getDictionaryPageOffset() but it isn't reliable. if (f.getPos() != meta.getStartingPos()) { f.seek(meta.getStartingPos()); } PageHeader pageHeader = Util.readPageHeader(f); if (!pageHeader.isSetDictionary_page_header()) { return null; // TODO: should this complain? } DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f); BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec()); return new DictionaryPage( decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()), compressedPage.getDictionarySize(), compressedPage.getEncoding()); }
/** * Reads and decompresses a dictionary page for the given column chunk. * * Returns null if the given column chunk has no dictionary page. * * @param meta a column's ColumnChunkMetaData to read the dictionary from * @return an uncompressed DictionaryPage or null * @throws IOException if there is an error while reading the dictionary */ DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException { if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) { return null; } // TODO: this should use getDictionaryPageOffset() but it isn't reliable. if (f.getPos() != meta.getStartingPos()) { f.seek(meta.getStartingPos()); } PageHeader pageHeader = Util.readPageHeader(f); if (!pageHeader.isSetDictionary_page_header()) { return null; // TODO: should this complain? } DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f); BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec()); return new DictionaryPage( decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()), compressedPage.getDictionarySize(), compressedPage.getEncoding()); }