long footerLengthIndex = stat.getLen() - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length; stream.seek(footerLengthIndex); int footerLength = BytesUtils.readIntLittleEndian(stream); stream.seek(footerLengthIndex - footerLength); if (LOG.isInfoEnabled()) { LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException { long fileLen = file.getLength(); String filePath = file.toString(); LOG.debug("File length {}", fileLen); int FOOTER_LENGTH_SIZE = 4; if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")"); } long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; LOG.debug("reading footer index at {}", footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex); } f.seek(footerIndex); return converter.readParquetMetadata(f, options.getMetadataFilter()); }
/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * The input is positioned at {@code start} and then drained in chunks, using the buffer
 * obtained from {@code COPY_BUFFER}, until {@code length} bytes have been written.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were read
 */
private static void copy(SeekableInputStream from, PositionOutputStream to, long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds or than is still outstanding.
    int chunkSize = (int) Math.min(buffer.length, bytesLeft);
    int bytesRead = from.read(buffer, 0, chunkSize);
    if (bytesRead < 0) {
      // Parenthesized so the failing offset is added numerically; without the
      // parentheses the two numbers would be concatenated as text.
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
/** * Reads and decompresses a dictionary page for the given column chunk. * * Returns null if the given column chunk has no dictionary page. * * @param meta a column's ColumnChunkMetaData to read the dictionary from * @return an uncompressed DictionaryPage or null * @throws IOException if there is an error while reading the dictionary */ DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException { if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) { return null; } // TODO: this should use getDictionaryPageOffset() but it isn't reliable. if (f.getPos() != meta.getStartingPos()) { f.seek(meta.getStartingPos()); } PageHeader pageHeader = Util.readPageHeader(f); if (!pageHeader.isSetDictionary_page_header()) { return null; // TODO: should this complain? } DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f); BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec()); return new DictionaryPage( decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()), compressedPage.getDictionarySize(), compressedPage.getEncoding()); }
public BytesInput readAsBytesInput(int size) throws IOException { int available = stream.available(); if (size > available) { // this is to workaround a bug where the compressedLength // of the chunk is missing the size of the header of the dictionary // to allow reading older files (using dictionary) we need this. // usually 13 to 19 bytes are missing int missingBytes = size - available; LOG.info("completed the column chunk with {} bytes", missingBytes); List<ByteBuffer> buffers = new ArrayList<>(); buffers.addAll(stream.sliceBuffers(available)); ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes); f.readFully(lastBuffer); buffers.add(lastBuffer); return BytesInput.from(buffers); } return super.readAsBytesInput(size); }
/**
 * Loads {@code pageLength} bytes from the input into {@code target}, starting at index 0
 * of the buffer, and sets the buffer's writer index to {@code pageLength}.
 *
 * @param target the buffer to fill; it is cleared before reading
 * @param pageLength the number of bytes to load
 * @throws IOException if the input cannot supply {@code pageLength} bytes or reading fails
 */
public void loadPage(DrillBuf target, int pageLength) throws IOException {
  target.clear();
  // A single read(ByteBuffer) may return after filling only part of the buffer. Its result
  // was previously ignored while writerIndex was still advanced by the full page length,
  // exposing unfilled bytes. readFully keeps reading until the buffer is full.
  HadoopStreams.wrap(input).readFully(target.nioBuffer(0, pageLength));
  target.writerIndex(pageLength);
}
/**
 * Closes the underlying stream when one is present, then releases the codec factory
 * obtained from the read options.
 *
 * @throws IOException if closing the stream fails
 */
@Override
public void close() throws IOException {
  try {
    if (f == null) {
      return;
    }
    f.close();
  } finally {
    // Runs on every path, including the early return and a failed stream close.
    options.getCodecFactory().release();
  }
}
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException { long fileLen = file.getLength(); String filePath = file.toString(); LOG.debug("File length {}", fileLen); int FOOTER_LENGTH_SIZE = 4; if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")"); } long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; LOG.debug("reading footer index at {}", footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex); } f.seek(footerIndex); return converter.readParquetMetadata(f, options.getMetadataFilter()); }
/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * The input is positioned at {@code start} and then drained in chunks, using the buffer
 * obtained from {@code COPY_BUFFER}, until {@code length} bytes have been written.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were read
 */
private static void copy(SeekableInputStream from, PositionOutputStream to, long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds or than is still outstanding.
    int chunkSize = (int) Math.min(buffer.length, bytesLeft);
    int bytesRead = from.read(buffer, 0, chunkSize);
    if (bytesRead < 0) {
      // Parenthesized so the failing offset is added numerically; without the
      // parentheses the two numbers would be concatenated as text.
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
/** * Reads and decompresses a dictionary page for the given column chunk. * * Returns null if the given column chunk has no dictionary page. * * @param meta a column's ColumnChunkMetaData to read the dictionary from * @return an uncompressed DictionaryPage or null * @throws IOException if there is an error while reading the dictionary */ DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException { if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) { return null; } // TODO: this should use getDictionaryPageOffset() but it isn't reliable. if (f.getPos() != meta.getStartingPos()) { f.seek(meta.getStartingPos()); } PageHeader pageHeader = Util.readPageHeader(f); if (!pageHeader.isSetDictionary_page_header()) { return null; // TODO: should this complain? } DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f); BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec()); return new DictionaryPage( decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()), compressedPage.getDictionarySize(), compressedPage.getEncoding()); }
public BytesInput readAsBytesInput(int size) throws IOException { int available = stream.available(); if (size > available) { // this is to workaround a bug where the compressedLength // of the chunk is missing the size of the header of the dictionary // to allow reading older files (using dictionary) we need this. // usually 13 to 19 bytes are missing int missingBytes = size - available; LOG.info("completed the column chunk with {} bytes", missingBytes); List<ByteBuffer> buffers = new ArrayList<>(); buffers.addAll(stream.sliceBuffers(available)); ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes); f.readFully(lastBuffer); buffers.add(lastBuffer); return BytesInput.from(buffers); } return super.readAsBytesInput(size); }
/**
 * Fills the first {@code len} bytes of {@code buf} from the input stream, reading
 * repeatedly until that many bytes have arrived.
 *
 * @param buf the destination buffer; it is cleared first and written from index 0
 * @param off not used by this implementation
 * @param len the number of bytes to read
 * @return {@code len} on success, or the negative value returned by the underlying read
 *         when the stream reports end of input before {@code len} bytes were read
 * @throws IOException if the underlying read fails
 */
public synchronized int read(DrillBuf buf, int off, int len) throws IOException {
  buf.clear();
  final ByteBuffer destination = buf.nioBuffer(0, len);
  final SeekableInputStream source = HadoopStreams.wrap(getInputStream());

  for (int remaining = len; remaining > 0; ) {
    if (logger.isTraceEnabled()) {
      logger.trace("PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}",
          this.streamId, this.startOffset, this.totalByteSize);
    }

    final Stopwatch timer = Stopwatch.createStarted();
    final int bytesRead = source.read(destination);
    if (bytesRead < 0) {
      // End of input: hand the stream's own return value back to the caller.
      return bytesRead;
    }
    remaining -= bytesRead;

    if (logger.isTraceEnabled()) {
      final double elapsedMillis = timer.elapsed(TimeUnit.MICROSECONDS) / 1000.0;
      logger.trace(
          "PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BytesRead: {}, Time: {} ms",
          this.streamId, this.startOffset, this.totalByteSize, bytesRead, elapsedMillis);
    }
  }

  buf.writerIndex(len);
  return len;
}
/**
 * Closes the underlying stream when one is present, then releases the codec factory
 * obtained from the read options.
 *
 * @throws IOException if closing the stream fails
 */
@Override
public void close() throws IOException {
  try {
    if (f == null) {
      return;
    }
    f.close();
  } finally {
    // Runs on every path, including the early return and a failed stream close.
    options.getCodecFactory().release();
  }
}
/**
 * Reads {@code length} bytes starting at {@code offset} into a list of buffers, none
 * larger than the maximum allocation size configured in the read options.
 *
 * All buffers are allocated up front; each is then filled from the stream and flipped.
 *
 * @param f file to read the blocks from
 * @param offset position in the stream at which reading starts
 * @param length total number of bytes to read
 * @return the ByteBuffer blocks
 * @throws IOException if there is an error while reading from the stream
 */
List<ByteBuffer> readBlocks(SeekableInputStream f, long offset, int length) throws IOException {
  f.seek(offset);

  final int fullSizedCount = length / options.getMaxAllocationSize();
  final int remainderSize = length % options.getMaxAllocationSize();
  final boolean hasRemainder = remainderSize > 0;

  final List<ByteBuffer> blocks = new ArrayList<>(hasRemainder ? fullSizedCount + 1 : fullSizedCount);
  int allocated = 0;
  while (allocated < fullSizedCount) {
    blocks.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
    allocated++;
  }
  if (hasRemainder) {
    // The final, shorter buffer holds whatever does not fill a full-sized one.
    blocks.add(options.getAllocator().allocate(remainderSize));
  }

  for (int i = 0; i < blocks.size(); i++) {
    final ByteBuffer block = blocks.get(i);
    f.readFully(block);
    block.flip();
  }
  return blocks;
}
/**
 * Reads the still-compressed bytes of a dictionary page from the stream and wraps them
 * in a DictionaryPage.
 *
 * @param pageHeader header of the dictionary page; supplies the page sizes and the
 *        dictionary page header
 * @param fin stream from which the page bytes are read, starting at its current position
 * @return a DictionaryPage holding the bytes exactly as read from the stream
 * @throws IOException if the page bytes cannot be read in full
 */
private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  final DictionaryPageHeader dictionaryHeader = pageHeader.getDictionary_page_header();
  final int uncompressedSize = pageHeader.getUncompressed_page_size();

  // The payload is exactly as long as the compressed size recorded in the header.
  final byte[] payload = new byte[pageHeader.getCompressed_page_size()];
  fin.readFully(payload);

  final BytesInput pageBytes = BytesInput.from(payload);
  return new DictionaryPage(
      pageBytes,
      uncompressedSize,
      dictionaryHeader.getNum_values(),
      converter.getEncoding(dictionaryHeader.getEncoding()));
}
/**
 * Reads the offset index of a column chunk, using the reference stored in its metadata.
 *
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  final IndexReference reference = column.getOffsetIndexReference();
  if (reference != null) {
    // Position the stream at the index before decoding it.
    f.seek(reference.getOffset());
    return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
  }
  return null;
}
if (bytesToRead > 0) { try { nBytes = HadoopStreams.wrap(getInputStream()).read(directBuffer); } catch (Exception e) { logger.error("Error reading from stream {}. Error was : {}", this.streamId, e.getMessage());
/**
 * Opens a stream over the given file, reads its footer and prepares the per-row-group
 * state of this reader.
 *
 * @param file the file to read
 * @param options the read options; also used to build the metadata converter
 * @throws IOException if opening the stream or reading the footer fails
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // In case that reading footer throws an exception in the constructor, the new stream
    // should be closed. Otherwise, there's no way to close this outside.
    try {
      f.close();
    } catch (Exception closeFailure) {
      // Keep the footer failure as the primary error; a failing close must not replace it.
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  // Index every column descriptor of the file schema by its path.
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
/**
 * Reads the still-compressed bytes of a dictionary page from the stream and wraps them
 * in a DictionaryPage.
 *
 * @param pageHeader header of the dictionary page; supplies the page sizes and the
 *        dictionary page header
 * @param fin stream from which the page bytes are read, starting at its current position
 * @return a DictionaryPage holding the bytes exactly as read from the stream
 * @throws IOException if the page bytes cannot be read in full
 */
private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  final DictionaryPageHeader dictionaryHeader = pageHeader.getDictionary_page_header();
  final int uncompressedSize = pageHeader.getUncompressed_page_size();

  // The payload is exactly as long as the compressed size recorded in the header.
  final byte[] payload = new byte[pageHeader.getCompressed_page_size()];
  fin.readFully(payload);

  final BytesInput pageBytes = BytesInput.from(payload);
  return new DictionaryPage(
      pageBytes,
      uncompressedSize,
      dictionaryHeader.getNum_values(),
      converter.getEncoding(dictionaryHeader.getEncoding()));
}