private boolean advanceToNextRowGroup()
{
    currentRowGroupMemoryContext.close();
    currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    if (currentBlock == blocks.size()) {
        return false;
    }
    currentBlockMetadata = blocks.get(currentBlock);
    currentBlock = currentBlock + 1;
    nextRowInGroup = 0L;
    currentGroupRowCount = currentBlockMetadata.getRowCount();
    initializeColumnReaders();
    return true;
}
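For context, a minimal sketch of how a caller might drive advanceToNextRowGroup(); the batch-reading helper readNextBatch() is hypothetical, not part of the source.

// Hypothetical driver loop: advance through row groups, reading each one
// batch by batch until its row count is exhausted.
while (advanceToNextRowGroup()) {
    while (nextRowInGroup < currentGroupRowCount) {
        nextRowInGroup += readNextBatch(); // readNextBatch() is an assumed helper
    }
}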
public static boolean predicateMatches(
        Predicate parquetPredicate,
        BlockMetaData block,
        ParquetDataSource dataSource,
        Map<List<String>, RichColumnDescriptor> descriptorsByPath,
        TupleDomain<ColumnDescriptor> parquetTupleDomain,
        boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    Map<ColumnDescriptor, Statistics<?>> columnStatistics = getStatistics(block, descriptorsByPath);
    if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId(), failOnCorruptedParquetStatistics)) {
        return false;
    }
    Map<ColumnDescriptor, DictionaryDescriptor> dictionaries = getDictionaries(block, dataSource, descriptorsByPath, parquetTupleDomain);
    return parquetPredicate.matches(dictionaries);
}
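A hedged sketch of the pruning loop this method supports: only row groups whose statistics and dictionaries can match the predicate are kept for reading. The surrounding state (footer, parquetPredicate, descriptorsByPath, parquetTupleDomain, dataSource, failOnCorruptedParquetStatistics) is assumed from the enclosing reader.

List<BlockMetaData> matchingBlocks = new ArrayList<>();
for (BlockMetaData block : footer.getBlocks()) {
    // keep only row groups the predicate cannot rule out
    if (predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath,
            parquetTupleDomain, failOnCorruptedParquetStatistics)) {
        matchingBlocks.add(block);
    }
}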
private ColumnChunk readPrimitive(PrimitiveField field)
        throws IOException
{
    ColumnDescriptor columnDescriptor = field.getDescriptor();
    PrimitiveColumnReader columnReader = columnReaders[field.getId()];
    if (columnReader.getPageReader() == null) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());
        byte[] buffer = allocateBlock(totalSize);
        dataSource.readFully(startingPosition, buffer);
        ColumnChunkDescriptor descriptor = new ColumnChunkDescriptor(columnDescriptor, metadata, totalSize);
        ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0);
        columnReader.setPageReader(columnChunk.readAllPages());
    }
    return columnReader.readPrimitive(field);
}
private boolean advanceToNextRowGroup()
        throws InterruptedException
{
    if (currentBlock == blocks.size()) {
        return false;
    }
    currentBlockMetadata = blocks.get(currentBlock);
    currentBlock = currentBlock + 1;
    long rowCount = currentBlockMetadata.getRowCount();
    nextRowInGroup = 0L;
    currentGroupRowCount = rowCount;
    columnReadersMap.clear();
    initializeColumnReaders();
    return true;
}
/**
 * Get the number of rows in the file, summing the row counts of all
 * row groups and caching the result.
 *
 * @return the total row count
 */
@Override
public long getRowCount()
{
    if (this.rowCount <= 0) {
        long rowCount = 0;
        for (BlockMetaData block : getBlocks()) {
            rowCount += block.getRowCount();
        }
        this.rowCount = rowCount;
    }
    return this.rowCount;
}
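Because the sum is cached in this.rowCount, repeated calls are cheap. A small usage sketch; footer is an assumed instance of the enclosing class.

long total = footer.getRowCount(); // first call walks all row groups
long again = footer.getRowCount(); // later calls return the cached sum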
public ParquetReader(
        MessageType fileSchema,
        Map<String, String> extraMetadata,
        MessageType requestedSchema,
        List<BlockMetaData> blocks,
        Configuration configuration,
        ParquetDataSource dataSource)
        throws IOException
{
    this.fileSchema = fileSchema;
    this.extraMetadata = extraMetadata;
    this.requestedSchema = requestedSchema;
    this.blocks = blocks;
    this.dataSource = dataSource;
    codecFactory = new ParquetCodecFactory(configuration);
    for (BlockMetaData block : blocks) {
        fileRowCount += block.getRowCount();
    }
    initializeColumnReaders();
}
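A hedged construction sketch, assuming a parsed footer (ParquetMetadata) plus a configuration, requested schema, and data source supplied by the call site.

ParquetReader reader = new ParquetReader(
        footer.getFileMetaData().getSchema(),           // file schema
        footer.getFileMetaData().getKeyValueMetaData(), // extra metadata
        requestedSchema,                                // projection of the file schema
        footer.getBlocks(),                             // row groups to read
        configuration,
        dataSource);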
if (this.offset <= begin && end <= this.offset + this.fragmentSize && block.getRowCount() != 0) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
public static boolean predicateMatches(
        ParquetPredicate parquetPredicate,
        BlockMetaData block,
        Configuration configuration,
        ParquetDataSource dataSource,
        MessageType requestedSchema,
        TupleDomain<HiveColumnHandle> effectivePredicate)
{
    Map<Integer, Statistics<?>> columnStatistics = getStatisticsByColumnOrdinal(block);
    if (!parquetPredicate.matches(block.getRowCount(), columnStatistics)) {
        return false;
    }
    Map<Integer, ParquetDictionaryDescriptor> dictionaries = getDictionariesByColumnOrdinal(block, configuration, dataSource, requestedSchema, effectivePredicate);
    return parquetPredicate.matches(dictionaries);
}
+ "path={0}, rows={1}, range={2}+{3}, allocation={4}", //$NON-NLS-1$ status.getPath(), block.getRowCount(), begin, end - begin,
public void initialize(
        MessageType requestedSchema,
        MessageType fileSchema,
        Map<String, String> extraMetadata,
        Map<String, String> readSupportMetadata,
        Path file,
        List<BlockMetaData> blocks,
        Configuration configuration)
        throws IOException
{
    this.requestedSchema = requestedSchema;
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = this.requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, extraMetadata, fileSchema,
            new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
}
ColumnChunkPageReadStore columnChunkPageReadStore = new ColumnChunkPageReadStore(block.getRowCount());
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num)
{
    long rows = meta.getRowCount();
    long tbs = meta.getTotalByteSize();
    long offset = meta.getStartingPos();

    out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
    out.rule('-');
    showDetails(out, meta.getColumns());
}
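Usage sketch: printing every row group of a footer, numbering them from 1; out and metadata are assumed from the surrounding tool.

long num = 1;
for (BlockMetaData block : metadata.getBlocks()) {
    showDetails(out, block, num++); // the long counter auto-boxes to the Long parameter
}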
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata)
{
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long numRows = 0; // long, not int: total row counts can exceed Integer.MAX_VALUE
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(
            currentVersion,
            toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
            numRows,
            rowGroups);
    Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }
    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    return fileMetaData;
}
public void initialize(
        MessageType fileSchema,
        Map<String, String> fileMetadata,
        Path file,
        List<BlockMetaData> blocks,
        Configuration configuration)
        throws IOException
{
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
public Block readBlock(ColumnDescriptor columnDescriptor, Type type)
        throws IOException
{
    ParquetColumnReader columnReader = columnReadersMap.get(columnDescriptor);
    if (columnReader.getPageReader() == null) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = Ints.checkedCast(metadata.getTotalSize());
        byte[] buffer = new byte[totalSize];
        dataSource.readFully(startingPosition, buffer);
        ParquetColumnChunkDescriptor descriptor = new ParquetColumnChunkDescriptor(columnDescriptor, metadata, startingPosition, totalSize);
        ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0, codecFactory);
        columnReader.setPageReader(columnChunk.readAllPages());
    }
    return columnReader.readBlock(type);
}
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block)
{
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
        columnChunk.file_path = block.getPath(); // they are in the same file for now
        columnChunk.meta_data = new parquet.format.ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                columnMetaData.getCodec().getParquetCompressionCodec(),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
private static void add(ParquetMetadata footer)
{
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++blockCount;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += blockMetaData.getRowCount();
        List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
        for (ColumnChunkMetaData columnMetaData : columns) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(
                    desc,
                    columnMetaData.getValueCount(),
                    columnMetaData.getTotalSize(),
                    columnMetaData.getTotalUncompressedSize(),
                    columnMetaData.getEncodings(),
                    columnMetaData.getStatistics());
        }
    }
}
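Usage sketch: aggregating statistics over several footers; blockCount and recordCount are the static accumulators this method updates, and footers is an assumed collection of parsed ParquetMetadata.

for (ParquetMetadata footer : footers) {
    add(footer); // accumulates per-row-group and per-column stats
}
System.out.printf("%d row groups, %d records%n", blockCount, recordCount);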