public ParquetReadOptions build() {
  return new ParquetReadOptions(
      useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter,
      useColumnIndexFilter, recordFilter, metadataFilter, codecFactory, allocator,
      maxAllocationSize, properties);
}
}
/**
 * @param f file to read the blocks from
 * @param offset offset in the file to start reading at
 * @param length total number of bytes to read
 * @return the ByteBuffer blocks
 * @throws IOException if there is an error while reading from the stream
 */
List<ByteBuffer> readBlocks(SeekableInputStream f, long offset, int length) throws IOException {
  f.seek(offset);

  int fullAllocations = length / options.getMaxAllocationSize();
  int lastAllocationSize = length % options.getMaxAllocationSize();

  int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
  List<ByteBuffer> buffers = new ArrayList<>(numAllocations);

  for (int i = 0; i < fullAllocations; i++) {
    buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
  }

  if (lastAllocationSize > 0) {
    buffers.add(options.getAllocator().allocate(lastAllocationSize));
  }

  for (ByteBuffer buffer : buffers) {
    f.readFully(buffer);
    buffer.flip();
  }

  return buffers;
}
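Since the split arithmetic above is easy to get wrong by one, here is a minimal standalone sketch of the same computation with hypothetical values (an 8 MB allocation cap and a 20 MB read); the numbers and class name are illustrative only, not part of the reader:

// Hypothetical illustration of the readBlocks() buffer-splitting arithmetic.
public class AllocationSplitDemo {
  public static void main(String[] args) {
    int maxAllocationSize = 8 * 1024 * 1024;   // assumed 8 MB cap per buffer
    int length = 20 * 1024 * 1024;             // assumed 20 MB chunk to read

    int fullAllocations = length / maxAllocationSize;     // 2 full buffers
    int lastAllocationSize = length % maxAllocationSize;  // 4 MB remainder
    int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);

    // prints: 3 buffers = 2 x 8388608 bytes + 4194304 bytes
    System.out.println(numAllocations + " buffers = " + fullAllocations
        + " x " + maxAllocationSize + " bytes + " + lastAllocationSize + " bytes");
  }
}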
private static float getFloat(ParquetReadOptions options, String key, float defaultValue) {
  String value = options.getProperty(key);
  if (value != null) {
    return Float.parseFloat(value);
  } else {
    return defaultValue;
  }
}
}
private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
  // set up data filters based on configured levels
  List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

  if (options.useStatsFilter()) {
    levels.add(STATISTICS);
  }

  if (options.useDictionaryFilter()) {
    levels.add(DICTIONARY);
  }

  FilterCompat.Filter recordFilter = options.getRecordFilter();
  if (recordFilter != null) {
    return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
  }

  return blocks;
}
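As a usage sketch, the levels consulted above are controlled when the options are built. This assumes the ParquetReadOptions builder methods seen elsewhere in this section plus the standard FilterApi; the "id" column is hypothetical:

import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.filter2.compat.FilterCompat;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

public class FilterLevelsDemo {
  public static void main(String[] args) {
    ParquetReadOptions options = ParquetReadOptions.builder()
        .useStatsFilter(true)        // keep STATISTICS-level row group filtering
        .useDictionaryFilter(false)  // skip DICTIONARY-level filtering
        // record filter on a hypothetical long column named "id"
        .withRecordFilter(FilterCompat.get(eq(longColumn("id"), 42L)))
        .build();
    System.out.println("record filter set: " + (options.getRecordFilter() != null));
  }
}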
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public ParquetReadOptions toReadOptions() {
  return ParquetReadOptions.builder()
      .useSignedStringMinMax(enableStringsSignedMinMax)
      .build();
}
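A brief usage sketch of the same builder call, assuming the boolean overload of useSignedStringMinMax passed through to the build() method above:

public class ReadOptionsDemo {
  public static void main(String[] args) {
    // Enable signed comparison of string min/max statistics (off by default).
    org.apache.parquet.ParquetReadOptions options =
        org.apache.parquet.ParquetReadOptions.builder()
            .useSignedStringMinMax(true)
            .build();
    System.out.println(options.useSignedStringMinMax()); // prints true
  }
}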
@Override
public void close() throws IOException {
  try {
    if (f != null) {
      f.close();
    }
  } finally {
    options.getCodecFactory().release();
  }
}
private RowRanges getRowRanges(int blockIndex) {
  RowRanges rowRanges = blockRowRanges.get(blockIndex);
  if (rowRanges == null) {
    rowRanges = ColumnIndexFilter.calculateRowRanges(options.getRecordFilter(),
        getColumnIndexStore(blockIndex), paths.keySet(), blocks.get(blockIndex).getRowCount());
    blockRowRanges.set(blockIndex, rowRanges);
  }
  return rowRanges;
}
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options,
    SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  long fileLen = file.getLength();
  String filePath = file.toString();

  LOG.debug("File length {}", fileLen);

  int FOOTER_LENGTH_SIZE = 4;
  if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
    throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")");
  }

  long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
  LOG.debug("reading footer index at {}", footerLengthIndex);

  f.seek(footerLengthIndex);
  int footerLength = readIntLittleEndian(f);
  byte[] magic = new byte[MAGIC.length];
  f.readFully(magic);
  if (!Arrays.equals(MAGIC, magic)) {
    throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail "
        + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
  }

  long footerIndex = footerLengthIndex - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
  if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
  }

  f.seek(footerIndex);
  return converter.readParquetMetadata(f, options.getMetadataFilter());
}
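The index arithmetic in readFooter() follows the Parquet tail layout: MAGIC, data, footer, a 4-byte little-endian footer length, then MAGIC again. A minimal standalone sketch with assumed sizes, separate from the reader itself:

// Hypothetical illustration of the tail layout validated by readFooter():
// [MAGIC][data ...][footer][4-byte footer length][MAGIC]
public class FooterIndexDemo {
  public static void main(String[] args) {
    byte[] MAGIC = {'P', 'A', 'R', '1'};
    int FOOTER_LENGTH_SIZE = 4;

    long fileLen = 1024;     // assumed total file size in bytes
    int footerLength = 300;  // assumed value read at footerLengthIndex

    long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; // 1016
    long footerIndex = footerLengthIndex - footerLength;                  // 716

    boolean valid = footerIndex >= MAGIC.length && footerIndex < footerLengthIndex;
    System.out.println("footer starts at " + footerIndex + ", valid = " + valid);
  }
}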
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
/**
 * Open a {@link InputFile file}.
 *
 * @param file an input file
 * @return an open ParquetFileReader
 * @throws IOException if there is an error while opening the file
 */
public static ParquetFileReader open(InputFile file) throws IOException {
  return new ParquetFileReader(file, ParquetReadOptions.builder().build());
}
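A usage sketch for the no-options overload above, assuming parquet-hadoop's HadoopInputFile; the file path is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class OpenDemo {
  public static void main(String[] args) throws Exception {
    // "data.parquet" is a hypothetical path used only for illustration.
    InputFile file = HadoopInputFile.fromPath(new Path("data.parquet"), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
      System.out.println("row count: " + reader.getRecordCount());
    }
  }
}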
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (filesIterator.hasNext()) {
    InputFile file = filesIterator.next();

    ParquetFileReader fileReader = ParquetFileReader.open(file, options);

    reader = new InternalParquetRecordReader<>(readSupport, options.getRecordFilter());
    reader.initialize(fileReader, options);
  }
}
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 * <p>
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
@Override
public String getProperty(String property) {
  String value = super.getProperty(property);
  if (value != null) {
    return value;
  }
  return conf.get(property);
}
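A minimal standalone sketch of the lookup order this override implements: option-level properties win, and the Hadoop Configuration is consulted only as a fallback. The maps stand in for the ParquetReadOptions properties and the Configuration; the key is hypothetical:

import java.util.HashMap;
import java.util.Map;

public class PropertyFallbackDemo {
  static Map<String, String> optionProps = new HashMap<>(); // stands in for the options' properties
  static Map<String, String> conf = new HashMap<>();        // stands in for the Hadoop Configuration

  static String getProperty(String property) {
    String value = optionProps.get(property);          // options take precedence
    return value != null ? value : conf.get(property); // fall back to the conf
  }

  public static void main(String[] args) {
    conf.put("parquet.example.key", "from-conf");
    System.out.println(getProperty("parquet.example.key")); // prints from-conf
    optionProps.put("parquet.example.key", "from-options");
    System.out.println(getProperty("parquet.example.key")); // prints from-options
  }
}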