// Factory used during deserialization: hands the serializer a fresh, empty
// instance to populate.
public ParquetDatasetSplitXAttr newMessage() {
  return new ParquetDatasetSplitXAttr();
}
// Fragment: builds the full per-row-group xattr; the enclosing call that
// consumes this expression is not shown in this excerpt.
new ParquetDatasetSplitXAttr()
    .setPath(pathString)
    .setStart(rowGroupInfo.getStart())
    .setRowGroupIndex(rowGroupInfo.getRowGroupIndex())
    .setUpdateKey(new FileSystemCachedEntity()
        .setPath(pathString)
        .setLastModificationTime(rowGroupInfo.getStatus().getModificationTime())
        .setLength(length))
    .setColumnValueCountsList(columnValueCounts)
    .setLength(rowGroupInfo.getLength())
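// Sketch only (not in the original source): how a chain like the one above is
// typically consumed, serializing the populated xattr into a split's extended
// property. setExtendedProperty(...) is an assumption here; this excerpt only
// shows the matching getExtendedProperty() accessor.
private static void attachXAttrExample(DatasetSplit split, String pathString) {
  ParquetDatasetSplitXAttr xattr = new ParquetDatasetSplitXAttr()
      .setPath(pathString);
  split.setExtendedProperty(ByteString.copyFrom(
      ParquetDatasetXAttrSerDe.PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.serialize(xattr)));
}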
private ByteString convertToScanXAttr(ByteString xattrFullSerialized) {
  ParquetDatasetSplitXAttr fullXAttr =
      ParquetDatasetXAttrSerDe.PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.revert(xattrFullSerialized.toByteArray());
  // Copy only the fields the scan needs into the slimmer scan-time xattr.
  ParquetDatasetSplitScanXAttr scanXAttr = new ParquetDatasetSplitScanXAttr();
  scanXAttr.setPath(fullXAttr.getPath());
  scanXAttr.setFileLength(fullXAttr.getUpdateKey().getLength());
  scanXAttr.setStart(fullXAttr.getStart());
  scanXAttr.setLength(fullXAttr.getLength());
  scanXAttr.setRowGroupIndex(fullXAttr.getRowGroupIndex());
  return ByteString.copyFrom(
      ParquetDatasetXAttrSerDe.PARQUET_DATASET_SPLIT_SCAN_XATTR_SERIALIZER.serialize(scanXAttr));
}
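// Sketch of a call site (illustrative only; the real caller is not shown):
// convertToScanXAttr would be applied per split before handing work to the
// scan, shrinking the metadata each reader has to carry.
private List<ByteString> toScanXAttrsExample(List<DatasetSplit> splits) {
  List<ByteString> scanXAttrs = new ArrayList<>();
  for (DatasetSplit split : splits) {
    scanXAttrs.add(convertToScanXAttr(split.getExtendedProperty()));
  }
  return scanXAttrs;
}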
private static long getAccurateColumnCount(String name, Iterator<DatasetSplit> splits) {
  long total = 0;
  int splitCount = 0;
  int columnObservations = 0;
  while (splits.hasNext()) {
    DatasetSplit split = splits.next();
    splitCount++;
    ParquetDatasetSplitXAttr xattr =
        ParquetDatasetXAttrSerDe.PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray());
    for (ColumnValueCount c : xattr.getColumnValueCountsList()) {
      if (c.getColumn().equalsIgnoreCase(name)) {
        total += c.getCount();
        columnObservations++;
        break; // each split records a column at most once; stop scanning
      }
    }
  }
  if (splitCount != columnObservations) {
    // Not every split reported this column; a partial sum would be
    // misleading, so signal that no accurate statistics are available.
    return GroupScan.NO_COLUMN_STATS;
  }
  return total;
}
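// Sketch of a call site (the column name and splits source are hypothetical):
// a NO_COLUMN_STATS result means at least one split lacked an entry for the
// column, so the caller should fall back to an estimate, never a partial sum.
private static void columnCountExample(Iterable<DatasetSplit> splits) {
  long count = getAccurateColumnCount("order_id", splits.iterator()); // "order_id" is illustrative
  if (count == GroupScan.NO_COLUMN_STATS) {
    // fall back to a planner estimate here
  }
}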
private Collection<FsPermissionTask> getSplitPermissionTasks(DatasetConfig datasetConfig, FileSystemWrapper userFs, String user) {
  final SplitsPointer splitsPointer = DatasetSplitsPointer.of(context.getNamespaceService(user), datasetConfig);
  final boolean isParquet = datasetConfig.getPhysicalDataset().getFormatSettings().getType() == FileType.PARQUET;
  final List<FsPermissionTask> fsPermissionTasks = Lists.newArrayList();
  final List<Path> batch = Lists.newArrayList();

  for (DatasetSplit split : splitsPointer.getSplitIterable()) {
    final Path filePath;
    if (isParquet) {
      filePath = new Path(PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath());
    } else {
      filePath = new Path(EASY_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath());
    }

    batch.add(filePath);
    if (batch.size() == PERMISSION_CHECK_TASK_BATCH_SIZE) {
      // Copy the batch: the task keeps its own list while `batch` is reused.
      fsPermissionTasks.add(new FsPermissionTask(userFs, new ArrayList<>(batch), FsAction.READ));
      batch.clear();
    }
  }

  if (!batch.isEmpty()) {
    fsPermissionTasks.add(new FsPermissionTask(userFs, batch, FsAction.READ));
  }
  return fsPermissionTasks;
}
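// Design note: grouping paths PERMISSION_CHECK_TASK_BATCH_SIZE at a time keeps
// each FsPermissionTask's probe bounded, and copying the batch before clear()
// keeps a task's list independent of the reused buffer.
//
// Sketch of a caller (illustrative only; FsPermissionTask's execution API is
// not shown in this excerpt, so run() is an assumption):
private void checkReadAccessExample(DatasetConfig datasetConfig, FileSystemWrapper userFs, String user) {
  for (FsPermissionTask task : getSplitPermissionTasks(datasetConfig, userFs, user)) {
    task.run(); // hypothetical execution method; the real API may differ
  }
}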