/**
 * Builds the statistics of a data column from the statistics of the given partitions.
 *
 * <p>Only partitions whose column-statistics map has a non-null entry for {@code column}
 * contribute to the distinct values count and the range. When no partition has such an
 * entry, {@link ColumnStatistics#empty()} is returned.
 *
 * @param column name of the column to look up in each partition's column statistics
 * @param type type of the column, forwarded to the range calculation
 * @param rowsCount row count forwarded to the data size calculation
 * @param partitionStatistics statistics of the partitions to combine
 * @return the combined column statistics, or empty statistics when nothing is available
 */
@VisibleForTesting
static ColumnStatistics createDataColumnStatistics(String column, Type type, double rowsCount, Collection<PartitionStatistics> partitionStatistics)
{
    List<HiveColumnStatistics> perPartitionStatistics = partitionStatistics.stream()
            .map(partition -> partition.getColumnStatistics().get(column))
            .filter(Objects::nonNull)
            .collect(toImmutableList());

    // Without a single per-partition entry for this column there is nothing to combine.
    if (perPartitionStatistics.isEmpty()) {
        return ColumnStatistics.empty();
    }

    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setDistinctValuesCount(calculateDistinctValuesCount(perPartitionStatistics));
    result.setNullsFraction(calculateNullsFraction(column, partitionStatistics));
    result.setDataSize(calculateDataSize(column, partitionStatistics, rowsCount));
    result.setRange(calculateRange(type, perPartitionStatistics));
    return result.build();
}
/**
 * Converts recorded column statistics data into {@link ColumnStatistics}.
 *
 * <p>The nulls fraction is always reported as zero. The distinct values count and the data
 * size are reported as unknown when the corresponding value is absent from {@code stats}.
 *
 * @param stats recorded statistics of the column
 * @param columnType type of the column, used when converting min/max into a range
 * @return the converted column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData stats, Type columnType)
{
    ColumnStatistics.Builder builder = ColumnStatistics.builder();
    builder.setNullsFraction(Estimate.zero());

    Estimate distinctValues = stats.getDistinctValuesCount()
            .map(Estimate::of)
            .orElse(Estimate.unknown());
    builder.setDistinctValuesCount(distinctValues);

    Estimate dataSize = stats.getDataSize()
            .map(Estimate::of)
            .orElse(Estimate.unknown());
    builder.setDataSize(dataSize);

    builder.setRange(toRange(stats.getMin(), stats.getMax(), columnType));
    return builder.build();
}
/**
 * Converts recorded column statistics data into {@link ColumnStatistics}.
 *
 * <p>The nulls fraction is the recorded null count divided by {@code rowCount}. When
 * {@code rowCount} is not positive the fraction cannot be computed and is reported as
 * unknown. The data size is reported as unknown when it is absent from the recorded data.
 *
 * @param columnStatisticsData recorded statistics of the column
 * @param type type of the column, used when converting min/max into a range
 * @param rowCount number of rows the null count relates to
 * @return the converted column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatisticsData, Type type, long rowCount)
{
    ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
    long nullCount = columnStatisticsData.getNullsCount();
    // Floating-point division by zero yields NaN (0/0) or Infinity (n/0), and a negative
    // row count yields a negative fraction; none of these is a valid nulls fraction.
    Estimate nullsFraction = rowCount > 0
            ? Estimate.of((double) nullCount / rowCount)
            : Estimate.unknown();
    columnStatistics.setNullsFraction(nullsFraction);
    columnStatistics.setRange(toRange(columnStatisticsData.getMin(), columnStatisticsData.getMax(), type));
    columnStatistics.setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount()));
    columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown()));
    return columnStatistics.build();
}
.setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) .setDataSize(Estimate.of(48.0)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setDataSize(Estimate.of(5.0)) .build());
PARTITION_COLUMN_1, ColumnStatistics.builder() .setDataSize(Estimate.of(7000)) .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1))
/**
 * Creates {@link ColumnStatistics} from optional values, with the nulls fraction fixed at zero.
 *
 * @param distinctValuesCount optional distinct values count, converted with {@code toEstimate}
 * @param range optional value range of the column
 * @param dataSize optional data size, converted with {@code toEstimate}
 * @return the assembled column statistics
 */
private static ColumnStatistics createColumnStatistics(Optional<Double> distinctValuesCount, Optional<DoubleRange> range, Optional<Double> dataSize)
{
    ColumnStatistics.Builder statistics = ColumnStatistics.builder();
    statistics.setNullsFraction(Estimate.zero());
    statistics.setDistinctValuesCount(toEstimate(distinctValuesCount));
    statistics.setRange(range);
    statistics.setDataSize(toEstimate(dataSize));
    return statistics.build();
}
/**
 * Translates recorded column statistics data into {@link ColumnStatistics}.
 *
 * <p>The nulls fraction is always set to zero. A missing distinct values count or data size
 * in {@code stats} is translated to an unknown estimate.
 *
 * @param stats recorded statistics of the column
 * @param columnType type of the column, passed to the min/max range conversion
 * @return the translated column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData stats, Type columnType)
{
    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setNullsFraction(Estimate.zero());

    Estimate distinctValuesCount = stats.getDistinctValuesCount()
            .map(Estimate::of)
            .orElse(Estimate.unknown());
    result.setDistinctValuesCount(distinctValuesCount);

    Estimate dataSize = stats.getDataSize()
            .map(Estimate::of)
            .orElse(Estimate.unknown());
    result.setDataSize(dataSize);

    result.setRange(toRange(stats.getMin(), stats.getMax(), columnType));
    return result.build();
}
/**
 * Converts recorded column statistics data into {@link ColumnStatistics}.
 *
 * <p>The nulls fraction is the recorded null count divided by {@code rowCount}. When
 * {@code rowCount} is not positive the fraction cannot be computed and is reported as
 * unknown. The data size is reported as unknown when it is absent from the recorded data.
 *
 * @param columnStatisticsData recorded statistics of the column
 * @param type type of the column, used when converting min/max into a range
 * @param rowCount number of rows the null count relates to
 * @return the converted column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatisticsData, Type type, long rowCount)
{
    ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
    long nullCount = columnStatisticsData.getNullsCount();
    // Floating-point division by zero yields NaN (0/0) or Infinity (n/0), and a negative
    // row count yields a negative fraction; none of these is a valid nulls fraction.
    Estimate nullsFraction = rowCount > 0
            ? Estimate.of((double) nullCount / rowCount)
            : Estimate.unknown();
    columnStatistics.setNullsFraction(nullsFraction);
    columnStatistics.setRange(toRange(columnStatisticsData.getMin(), columnStatisticsData.getMax(), type));
    columnStatistics.setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount()));
    columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown()));
    return columnStatistics.build();
}
.setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) .setDataSize(Estimate.of(48.0)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setDataSize(Estimate.of(5.0)) .build());
PARTITION_COLUMN_1, ColumnStatistics.builder() .setDataSize(Estimate.of(7000)) .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1))
/**
 * Creates table statistics with a row count of zero.
 *
 * <p>Every column gets a nulls fraction and a distinct values count of zero; a data size of
 * zero is additionally set for columns whose type satisfies {@code hasDataSize}.
 *
 * @param columns column handles keyed by column name
 * @param columnTypes column types keyed by column name; must contain every key of {@code columns}
 * @return table statistics with zero-valued estimates
 */
private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes)
{
    TableStatistics.Builder tableStatistics = TableStatistics.builder();
    tableStatistics.setRowCount(Estimate.of(0));

    for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) {
        String columnName = column.getKey();
        Type columnType = columnTypes.get(columnName);
        verify(columnType != null, "columnType is missing for column: %s", columnName);

        ColumnStatistics.Builder zeroStatistics = ColumnStatistics.builder()
                .setNullsFraction(Estimate.of(0))
                .setDistinctValuesCount(Estimate.of(0));
        // Data size is only set for types for which hasDataSize returns true.
        if (hasDataSize(columnType)) {
            zeroStatistics.setDataSize(Estimate.of(0));
        }
        tableStatistics.setColumnStatistics(column.getValue(), zeroStatistics.build());
    }

    return tableStatistics.build();
}
/**
 * Builds the statistics of a partition key column.
 *
 * <p>Each component (distinct values count, nulls fraction, range, data size) is delegated
 * to its dedicated {@code calculate...} helper; this method only assembles the results.
 *
 * @param column handle of the partition key column
 * @param type type of the column
 * @param partitions partitions taken into account
 * @param statistics partition statistics forwarded to the helpers
 * @param averageRowsPerPartition average row count per partition, forwarded to the helpers
 * @param rowCount row count forwarded to the nulls fraction calculation
 * @return the assembled column statistics
 */
private static ColumnStatistics createPartitionColumnStatistics(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition,
        double rowCount)
{
    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setDistinctValuesCount(
            Estimate.of(calculateDistinctPartitionKeys(column, partitions, statistics, averageRowsPerPartition)));
    result.setNullsFraction(
            Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount)));
    result.setRange(calculateRangeForPartitioningKey(column, type, partitions));
    result.setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition));
    return result.build();
}
/**
 * Assembles {@link ColumnStatistics} from optional inputs, using a nulls fraction of zero.
 *
 * @param distinctValuesCount optional distinct values count, converted with {@code toEstimate}
 * @param range optional value range of the column
 * @param dataSize optional data size, converted with {@code toEstimate}
 * @return the assembled column statistics
 */
private static ColumnStatistics createColumnStatistics(Optional<Double> distinctValuesCount, Optional<DoubleRange> range, Optional<Double> dataSize)
{
    ColumnStatistics.Builder builder = ColumnStatistics.builder();
    builder.setNullsFraction(Estimate.zero());
    builder.setDistinctValuesCount(toEstimate(distinctValuesCount));
    builder.setRange(range);
    builder.setDataSize(toEstimate(dataSize));
    return builder.build();
}