totalCountLoadedSoFar += pages.getRowCount();
pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo); } else { pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), typeInfo); case MAP: descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, kvTypes.get(0), typeInfo); VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader( descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion, kvTypes.get(1), typeInfo); return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
private void checkEndOfRowGroup() throws IOException { if (rowsReturned != totalCountLoadedSoFar) return; PageReadStore pages = reader.readNextRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount); } List<ColumnDescriptor> columns = requestedSchema.getColumns(); columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { if (missingColumns[i]) continue; columnReaders[i] = new VectorizedColumnReader(columns.get(i), pages.getPageReader(columns.get(i))); } totalCountLoadedSoFar += pages.getRowCount(); } }
@Override public ColumnReader getColumnReader(ColumnDescriptor path) { PrimitiveConverter converter = getPrimitiveConverter(path); PageReader pageReader = pageReadStore.getPageReader(path); Optional<PrimitiveIterator.OfLong> rowIndexes = pageReadStore.getRowIndexes(); if (rowIndexes.isPresent()) { return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, rowIndexes.get()); } else { return new ColumnReaderImpl(path, pageReader, converter, writerVersion); } }
private void checkEndOfRowGroup() throws IOException { if (rowsReturned != totalCountLoadedSoFar) return; PageReadStore pages = reader.readNextRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount); } List<ColumnDescriptor> columns = requestedSchema.getColumns(); List<Type> types = requestedSchema.asGroupType().getFields(); columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { if (missingColumns[i]) continue; columnReaders[i] = new VectorizedColumnReader(columns.get(i), types.get(i).getOriginalType(), pages.getPageReader(columns.get(i)), convertTz); } totalCountLoadedSoFar += pages.getRowCount(); } }
final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
private void checkEndOfRowGroup() throws IOException { if (rowsReturned != totalCountLoadedSoFar) return; PageReadStore pages = reader.readNextRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount); } List<ColumnDescriptor> columns = requestedSchema.getColumns(); List<Type> types = requestedSchema.asGroupType().getFields(); columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { if (missingColumns[i]) continue; columnReaders[i] = new VectorizedColumnReader(columns.get(i), types.get(i).getOriginalType(), pages.getPageReader(columns.get(i)), convertTz); } totalCountLoadedSoFar += pages.getRowCount(); } }
@Override public void setPageSource(PageReadStore pageStore) { column.setPageSource(pageStore.getPageReader(desc)); }
/** * Load Decision Tree model. * * @param pathToMdl Path to model. */ private static Model loadDecisionTreeModel(String pathToMdl) { try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, NodeData> nodes = new TreeMap<>(); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); NodeData nodeData = extractNodeDataFromParquetRow(g); nodes.put(nodeData.id, nodeData); } } return buildDecisionTreeModel(nodes); } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return null; }
protected void checkEndOfRowGroup() throws IOException { if (rowsReturned != totalCountLoadedSoFar) { return; } PageReadStore pages = reader.readNextRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount); } List<ColumnDescriptor> columns = requestedSchema.getColumns(); columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { columnReaders[i] = new VectorizedColumnReader(columns.get(i), pages.getPageReader(columns.get(i))); } totalCountLoadedSoFar += pages.getRowCount(); }
@Override public ColumnReader getColumnReader(ColumnDescriptor path) { return newMemColumnReader(path, pageReadStore.getPageReader(path)); }
final long rows = pagesMetaData.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>(); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) {
private void checkEndOfRowGroup() throws IOException { if (rowsReturned != totalCountLoadedSoFar) return; PageReadStore pages = reader.readNextRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount); } List<ColumnDescriptor> columns = requestedSchema.getColumns(); columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { if (missingColumns[i]) continue; columnReaders[i] = new VectorizedColumnReader(columns.get(i), pages.getPageReader(columns.get(i))); } totalCountLoadedSoFar += pages.getRowCount(); } }
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException { PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path)); DataPageV1 page = (DataPageV1) pageReader.readPage(); assertEquals(values, page.getValueCount()); assertArrayEquals(bytes.toByteArray(), page.getBytes().toByteArray()); }
/** * Load logistic regression model. * * @param pathToMdl Path to model. */ private static Model loadLogRegModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readInterceptor(g); coefficients = readCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new LogisticRegressionModel(coefficients, interceptor); }
/** * Load SVM model. * * @param pathToMdl Path to model. */ private static Model loadLinearSVMModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readSVMInterceptor(g); coefficients = readSVMCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new SVMLinearClassificationModel(coefficients, interceptor); }
/** * Load linear regression model. * * @param pathToMdl Path to model. */ private static Model loadLinRegModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readLinRegInterceptor(g); coefficients = readLinRegCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new LinearRegressionModel(coefficients, interceptor); }
private static Model loadKMeansModel(String pathToMdl) { Vector[] centers = null; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final int rows = (int)pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); centers = new DenseVector[rows]; for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); // final int clusterIdx = g.getInteger(0, 0); Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0); final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0); centers[i] = new DenseVector(amountOfCoefficients); for (int j = 0; j < amountOfCoefficients; j++) { double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0); centers[i].set(j, coefficient); } } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new KMeansModel(centers, new EuclideanDistance()); }
totalTimeSpentReadingBytes += timeSpentReading; BenchmarkCounter.incrementTime(timeSpentReading); if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount()); LOG.debug("initializing Record assembly with requested schema {}", requestedSchema); MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking); filterRecords ? filter : FilterCompat.NOOP); startedAssemblingCurrentBlockAt = System.currentTimeMillis(); totalCountLoadedSoFar += pages.getRowCount(); ++ currentBlock;
totalTimeSpentReadingBytes += timeSpentReading; BenchmarkCounter.incrementTime(timeSpentReading); if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount()); LOG.debug("initializing Record assembly with requested schema {}", requestedSchema); MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking); filterRecords ? filter : FilterCompat.NOOP); startedAssemblingCurrentBlockAt = System.currentTimeMillis(); totalCountLoadedSoFar += pages.getRowCount(); ++ currentBlock;