/**
 * Merges this range with {@code other}, estimating the distinct-value count so that
 * values in the region shared by both ranges are counted once instead of twice.
 * <p>
 * The merged bounds are picked with {@code minExcludeNaN} (for {@code low}) and
 * {@code maxExcludeNaN} (for {@code high}). The distinct-value estimate is the larger of
 * the two per-side estimates for the shared region, plus each side's estimate for the
 * part that lies outside the shared region.
 *
 * @param other the range to merge with this one
 * @return a new range spanning both inputs
 */
public StatisticRange addAndCollapseDistinctValues(StatisticRange other) {
    double thisOverlapFraction = this.overlapPercentWith(other);
    double otherOverlapFraction = other.overlapPercentWith(this);

    // Distinct values each side is estimated to contribute to the shared region.
    double sharedFromThis = thisOverlapFraction * distinctValues;
    double sharedFromOther = otherOverlapFraction * other.distinctValues;
    double sharedDistinctValues = max(sharedFromThis, sharedFromOther);

    // Distinct values each side is estimated to hold outside the shared region.
    double exclusiveToThis = (1 - thisOverlapFraction) * distinctValues;
    double exclusiveToOther = (1 - otherOverlapFraction) * other.distinctValues;

    double mergedDistinctValues = sharedDistinctValues + exclusiveToThis + exclusiveToOther;

    double mergedLow = minExcludeNaN(low, other.low);
    double mergedHigh = maxExcludeNaN(high, other.high);
    return new StatisticRange(mergedLow, mergedHigh, mergedDistinctValues);
}
/**
 * Computes the intersection of this range with {@code other}.
 *
 * @param other the range to intersect with
 * @return a range bounded by the larger {@code low} and the smaller {@code high}, with the
 *         distinct-value count taken from {@code overlappingDistinctValues(other)};
 *         or {@code empty()} when those bounds do not satisfy {@code low <= high}
 */
public StatisticRange intersect(StatisticRange other) {
    double lowerBound = max(low, other.low);
    double upperBound = min(high, other.high);

    // Deliberately phrased as the negation of "<=": a NaN bound makes the comparison
    // false, so it must take the empty() path.
    if (!(lowerBound <= upperBound)) {
        return empty();
    }
    return new StatisticRange(lowerBound, upperBound, overlappingDistinctValues(other));
}
/**
 * Copies the low value, high value and distinct-values count of {@code range} into this
 * builder, in that order, by delegating to the individual setters.
 *
 * @param range the range whose three components are copied
 * @return the builder returned by the last setter in the chain
 */
public Builder setStatisticsRange(StatisticRange range) {
    Builder afterLow = setLowValue(range.getLow());
    Builder afterHigh = afterLow.setHighValue(range.getHigh());
    return afterHigh.setDistinctValuesCount(range.getDistinctValuesCount());
}
/**
 * Estimates how many distinct values fall in the region shared by this range and
 * {@code other}.
 * <p>
 * Each side's distinct-value count is scaled by the fraction of that side covered by the
 * other; the larger of the two scaled values (ignoring NaN) is then capped by the smaller
 * of the two input distinct-value counts (again ignoring NaN).
 *
 * @param other the range overlapping with this one
 * @return the estimated distinct-value count for the overlap
 */
private double overlappingDistinctValues(StatisticRange other) {
    double thisCoveredFraction = overlapPercentWith(other);
    double otherCoveredFraction = other.overlapPercentWith(this);

    double thisOverlapNdv = thisCoveredFraction * distinctValues;
    double otherOverlapNdv = otherCoveredFraction * other.distinctValues;
    double largestOverlapNdv = maxExcludeNaN(thisOverlapNdv, otherOverlapNdv);

    // The overlap cannot hold more distinct values than the smaller input does.
    double smallestInputNdv = minExcludeNaN(this.distinctValues, other.distinctValues);
    return minExcludeNaN(smallestInputNdv, largestOverlapNdv);
}
/**
 * Merges this range with {@code other}, treating the two sets of distinct values as
 * disjoint: the resulting distinct-value count is the plain sum of both counts.
 * <p>
 * The merged bounds are picked with {@code minExcludeNaN} (for {@code low}) and
 * {@code maxExcludeNaN} (for {@code high}).
 *
 * @param other the range to merge with this one
 * @return a new range spanning both inputs
 */
public StatisticRange addAndSumDistinctValues(StatisticRange other) {
    double mergedLow = minExcludeNaN(low, other.low);
    double mergedHigh = maxExcludeNaN(high, other.high);
    double summedDistinctValues = distinctValues + other.distinctValues;
    return new StatisticRange(mergedLow, mergedHigh, summedDistinctValues);
}
/**
 * Builds the estimate for a comparison that keeps rows whose expression value differs
 * from a literal.
 * <p>
 * The literal is modelled as a single-value range with one distinct value (or as an
 * unbounded range with one distinct value when the literal value is absent). The filter
 * factor is one minus the fraction of the expression range covered by its intersection
 * with that range. The output row count is the input row count scaled by the filter
 * factor and by the non-null fraction of the expression.
 * <p>
 * When the expression is backed by a symbol, that symbol's statistics are replaced with
 * a copy whose nulls fraction is zero and whose distinct-values count is reduced by one
 * (not going below zero).
 *
 * @param inputStatistics statistics of the input the filter is applied to
 * @param expressionStatistics statistics of the compared expression
 * @param expressionSymbol the symbol the expression maps to, if any
 * @param literalValue the literal as a double, if known
 * @return the estimate after applying the comparison
 */
private static PlanNodeStatsEstimate estimateExpressionNotEqualToLiteral(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        OptionalDouble literalValue) {
    StatisticRange expressionRange = StatisticRange.from(expressionStatistics);

    StatisticRange literalRange = literalValue.isPresent()
            ? new StatisticRange(literalValue.getAsDouble(), literalValue.getAsDouble(), 1)
            : new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, 1);

    StatisticRange excludedRange = expressionRange.intersect(literalRange);
    double filterFactor = 1 - expressionRange.overlapPercentWith(excludedRange);

    PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics);
    double nonNullFraction = 1 - expressionStatistics.getNullsFraction();
    estimate.setOutputRowCount(filterFactor * nonNullFraction * inputStatistics.getOutputRowCount());

    if (expressionSymbol.isPresent()) {
        // One distinct value (the literal) is assumed to have been filtered out.
        double remainingDistinctValues = max(expressionStatistics.getDistinctValuesCount() - 1, 0);
        SymbolStatsEstimate filteredSymbolStats = buildFrom(expressionStatistics)
                .setNullsFraction(0.0)
                .setDistinctValuesCount(remainingDistinctValues)
                .build();
        estimate = estimate.addSymbolStatistics(expressionSymbol.get(), filteredSymbolStats);
    }
    return estimate.build();
}
private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stats, EquiJoinClause clause, TypeProvider types) { // we just clear null fraction and adjust ranges here // selectivity is mostly handled by driving clause. We just scale heuristically by UNKNOWN_FILTER_COEFFICIENT here. SymbolStatsEstimate leftStats = stats.getSymbolStatistics(clause.getLeft()); SymbolStatsEstimate rightStats = stats.getSymbolStatistics(clause.getRight()); StatisticRange leftRange = StatisticRange.from(leftStats); StatisticRange rightRange = StatisticRange.from(rightStats); StatisticRange intersect = leftRange.intersect(rightRange); double leftFilterValue = firstNonNaN(leftRange.overlapPercentWith(intersect), 1); double rightFilterValue = firstNonNaN(rightRange.overlapPercentWith(intersect), 1); double leftNdvInRange = leftFilterValue * leftRange.getDistinctValuesCount(); double rightNdvInRange = rightFilterValue * rightRange.getDistinctValuesCount(); double retainedNdv = MoreMath.min(leftNdvInRange, rightNdvInRange); SymbolStatsEstimate newLeftStats = buildFrom(leftStats) .setNullsFraction(0) .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv) .build(); SymbolStatsEstimate newRightStats = buildFrom(rightStats) .setNullsFraction(0) .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv) .build(); PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(stats) .setOutputRowCount(stats.getOutputRowCount() * UNKNOWN_FILTER_COEFFICIENT) .addSymbolStatistics(clause.getLeft(), newLeftStats) .addSymbolStatistics(clause.getRight(), newRightStats); return normalizer.normalize(result.build(), types); }
/**
 * Packages this object's low value, high value and distinct-values count into a new
 * {@link StatisticRange}.
 *
 * @return a freshly created range built from the three fields
 */
public StatisticRange statisticRange() {
    StatisticRange range = new StatisticRange(
            lowValue,
            highValue,
            distinctValuesCount);
    return range;
}
/**
 * Checks the low/high bounds expected by {@code assertCapRange} for combinations of
 * empty, unbounded and finite ranges: any combination involving the empty range expects
 * NaN bounds, an unbounded range paired with a finite one expects the finite bounds, and
 * [12, 100] paired with [13, 99] expects [13, 99].
 */
@Test
public void testCapRange() {
    PlanNodeStatsEstimate emptyStats = statistics(10, NaN, NaN, StatisticRange.empty());
    PlanNodeStatsEstimate unboundedStats = statistics(10, NaN, NaN, openRange(NaN));
    PlanNodeStatsEstimate twelveToHundred = statistics(10, NaN, NaN, new StatisticRange(12, 100, NaN));
    PlanNodeStatsEstimate thirteenToNinetyNine = statistics(10, NaN, NaN, new StatisticRange(13, 99, NaN));

    // Anything involving the empty range
    assertCapRange(emptyStats, emptyStats, NaN, NaN);
    assertCapRange(emptyStats, unboundedStats, NaN, NaN);
    assertCapRange(unboundedStats, emptyStats, NaN, NaN);

    // Unbounded combined with finite
    assertCapRange(twelveToHundred, unboundedStats, 12, 100);
    assertCapRange(unboundedStats, thirteenToNinetyNine, 13, 99);

    // Two finite ranges
    assertCapRange(twelveToHundred, thirteenToNinetyNine, 13, 99);
}
/**
 * Builds the estimate for a filter that restricts an expression to {@code filterRange}.
 * <p>
 * The filter factor is the fraction of the expression range covered by its intersection
 * with {@code filterRange}. The output row count is scaled by that factor and by the
 * non-null fraction of the expression. When the expression is backed by a symbol, its
 * statistics are replaced by a new estimate that keeps the average row size, uses the
 * intersected range, and has a zero nulls fraction.
 *
 * @param inputStatistics statistics of the input the filter is applied to
 * @param expressionStatistics statistics of the filtered expression
 * @param expressionSymbol the symbol the expression maps to, if any
 * @param filterRange the range the filter admits
 * @return the estimate after applying the filter
 */
private static PlanNodeStatsEstimate estimateFilterRange(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        StatisticRange filterRange) {
    StatisticRange expressionRange = StatisticRange.from(expressionStatistics);
    StatisticRange admittedRange = expressionRange.intersect(filterRange);
    double filterFactor = expressionRange.overlapPercentWith(admittedRange);

    PlanNodeStatsEstimate estimate = inputStatistics.mapOutputRowCount(
            rowCount -> filterFactor * (1 - expressionStatistics.getNullsFraction()) * rowCount);

    if (expressionSymbol.isPresent()) {
        SymbolStatsEstimate filteredSymbolStats = SymbolStatsEstimate.builder()
                .setAverageRowSize(expressionStatistics.getAverageRowSize())
                .setStatisticsRange(admittedRange)
                .setNullsFraction(0.0)
                .build();
        // The previous statistics for the symbol are discarded wholesale.
        estimate = estimate.mapSymbolColumnStatistics(expressionSymbol.get(), oldStats -> filteredSymbolStats);
    }
    return estimate;
}
/**
 * Builds the estimate for an equality comparison between two expressions.
 * <p>
 * Returns {@code PlanNodeStatsEstimate.unknown()} when either side's distinct-values
 * count is NaN. Otherwise the output row count is the input row count scaled by the
 * product of both sides' non-null fractions and by {@code 1 / max(leftNdv, rightNdv, 1)}.
 * Every side that is backed by a symbol receives the same statistics: averaged row size
 * (ignoring NaNs), zero nulls fraction, the intersection of both ranges, and the smaller
 * of the two distinct-values counts.
 *
 * @param inputStatistics statistics of the input the comparison is applied to
 * @param leftExpressionStatistics statistics of the left expression
 * @param leftExpressionSymbol the symbol the left expression maps to, if any
 * @param rightExpressionStatistics statistics of the right expression
 * @param rightExpressionSymbol the symbol the right expression maps to, if any
 * @return the estimate after applying the comparison
 */
private static PlanNodeStatsEstimate estimateExpressionEqualToExpression(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate leftExpressionStatistics,
        Optional<Symbol> leftExpressionSymbol,
        SymbolStatsEstimate rightExpressionStatistics,
        Optional<Symbol> rightExpressionSymbol) {
    boolean leftNdvUnknown = isNaN(leftExpressionStatistics.getDistinctValuesCount());
    if (leftNdvUnknown || isNaN(rightExpressionStatistics.getDistinctValuesCount())) {
        return PlanNodeStatsEstimate.unknown();
    }

    StatisticRange leftSide = StatisticRange.from(leftExpressionStatistics);
    StatisticRange rightSide = StatisticRange.from(rightExpressionStatistics);
    StatisticRange commonRange = leftSide.intersect(rightSide);

    double leftNonNullFraction = 1 - leftExpressionStatistics.getNullsFraction();
    double rightNonNullFraction = 1 - rightExpressionStatistics.getNullsFraction();
    double nullsFilterFactor = leftNonNullFraction * rightNonNullFraction;

    double leftNdv = leftSide.getDistinctValuesCount();
    double rightNdv = rightSide.getDistinctValuesCount();
    // The divisor is floored at 1 so the factor never exceeds 1.
    double filterFactor = 1.0 / max(leftNdv, rightNdv, 1);
    double sharedNdv = min(leftNdv, rightNdv);

    PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics)
            .setOutputRowCount(inputStatistics.getOutputRowCount() * nullsFilterFactor * filterFactor);

    double averagedRowSize = averageExcludingNaNs(
            leftExpressionStatistics.getAverageRowSize(),
            rightExpressionStatistics.getAverageRowSize());
    SymbolStatsEstimate equalityStats = SymbolStatsEstimate.builder()
            .setAverageRowSize(averagedRowSize)
            .setNullsFraction(0)
            .setStatisticsRange(commonRange)
            .setDistinctValuesCount(sharedNdv)
            .build();

    leftExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(symbol, equalityStats));
    rightExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(symbol, equalityStats));

    return estimate.build();
}
@Test public void testOverlapPercentWith() { StatisticRange zeroToTen = range(0, 10, 10); StatisticRange empty = StatisticRange.empty(); // Equal ranges assertOverlap(zeroToTen, range(0, 10, 5), 1); assertOverlap(zeroToTen, range(0, 10, 20), 1); assertOverlap(zeroToTen, range(0, 10, 20), 1); // Some overlap assertOverlap(zeroToTen, range(5, 3000, 3), 0.5); // Single value overlap assertOverlap(zeroToTen, range(3, 3, 1), 1 / zeroToTen.getDistinctValuesCount()); assertOverlap(zeroToTen, range(10, 100, 357), 1 / zeroToTen.getDistinctValuesCount()); // No overlap assertOverlap(zeroToTen, range(20, 30, 10), 0); // Empty ranges assertOverlap(zeroToTen, empty, 0); assertOverlap(empty, zeroToTen, 0); // no test for empty, empty) since any return value is correct assertOverlap(unboundedRange(10), empty, 0); // Unbounded (infinite), NDV-based assertOverlap(unboundedRange(10), unboundedRange(20), 1); assertOverlap(unboundedRange(20), unboundedRange(10), 0.5); assertOverlap(unboundedRange(0.1), unboundedRange(1), 1); assertOverlap(unboundedRange(0.0), unboundedRange(1), 0); assertOverlap(unboundedRange(0.0), unboundedRange(0), 0); }
/**
 * Checks the row count produced by {@code addStatsAndSumDistinctValues}: the result is
 * expected to equal {@code PlanNodeStatsEstimate.unknown()} whenever either input has a
 * NaN row count, and 10 rows plus 20 rows is expected to give 30 rows.
 */
@Test
public void testAddRowCount() {
    PlanNodeStatsEstimate unknownRows = statistics(NaN, NaN, NaN, StatisticRange.empty());
    PlanNodeStatsEstimate tenRows = statistics(10, NaN, NaN, StatisticRange.empty());
    PlanNodeStatsEstimate twentyRows = statistics(20, NaN, NaN, StatisticRange.empty());

    // A NaN row count on either side is expected to yield the unknown estimate
    assertEquals(addStatsAndSumDistinctValues(unknownRows, unknownRows), PlanNodeStatsEstimate.unknown());
    assertEquals(addStatsAndSumDistinctValues(tenRows, unknownRows), PlanNodeStatsEstimate.unknown());
    assertEquals(addStatsAndSumDistinctValues(unknownRows, twentyRows), PlanNodeStatsEstimate.unknown());

    // Known row counts are summed
    assertEquals(addStatsAndSumDistinctValues(tenRows, twentyRows).getOutputRowCount(), 30.0);
}
public double overlapPercentWith(StatisticRange other) { requireNonNull(other, "other is null"); if (this.isEmpty() || other.isEmpty() || this.distinctValues == 0 || other.distinctValues == 0) { return 0.0; // zero is better than NaN as it will behave properly for calculating row count } if (this.equals(other)) { return 1.0; } double lengthOfIntersect = min(this.high, other.high) - max(this.low, other.low); if (isInfinite(lengthOfIntersect)) { if (isFinite(this.distinctValues) && isFinite(other.distinctValues)) { return min(other.distinctValues / this.distinctValues, 1); } return INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; } if (lengthOfIntersect == 0) { return 1 / max(this.distinctValues, 1); } if (lengthOfIntersect < 0) { return 0; } if (isInfinite(length()) && isFinite(lengthOfIntersect)) { return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; } if (lengthOfIntersect > 0) { return lengthOfIntersect / length(); } return NaN; }
/**
 * Checks {@code addAndSumDistinctValues}: a NaN distinct-values count on either side is
 * expected to give NaN, known counts are expected to be added, and merging with the
 * empty range is expected to leave the other range unchanged.
 */
@Test
public void testAddAndSumDistinctValues() {
    // NaN distinct-values count on at least one side
    assertEquals(
            unboundedRange(NaN).addAndSumDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(NaN).addAndSumDistinctValues(unboundedRange(1)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(1).addAndSumDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));

    // Known counts on unbounded ranges
    assertEquals(
            unboundedRange(1).addAndSumDistinctValues(unboundedRange(2)),
            unboundedRange(3));

    // Empty ranges
    assertEquals(
            StatisticRange.empty().addAndSumDistinctValues(StatisticRange.empty()),
            StatisticRange.empty());
    assertEquals(
            range(0, 1, 1).addAndSumDistinctValues(StatisticRange.empty()),
            range(0, 1, 1));

    // Adjacent finite ranges
    assertEquals(
            range(0, 1, 1).addAndSumDistinctValues(range(1, 2, 1)),
            range(0, 2, 2));
}
/**
 * Checks {@code addAndMaxDistinctValues}: a NaN distinct-values count on either side is
 * expected to give NaN, known counts are expected to yield the larger one, and merging
 * with the empty range is expected to leave the other range unchanged.
 */
@Test
public void testAddAndMaxDistinctValues() {
    // NaN distinct-values count on at least one side
    assertEquals(
            unboundedRange(NaN).addAndMaxDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(NaN).addAndMaxDistinctValues(unboundedRange(1)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(1).addAndMaxDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));

    // Known counts on unbounded ranges
    assertEquals(
            unboundedRange(1).addAndMaxDistinctValues(unboundedRange(2)),
            unboundedRange(2));

    // Empty ranges
    assertEquals(
            StatisticRange.empty().addAndMaxDistinctValues(StatisticRange.empty()),
            StatisticRange.empty());
    assertEquals(
            range(0, 1, 1).addAndMaxDistinctValues(StatisticRange.empty()),
            range(0, 1, 1));

    // Adjacent finite ranges
    assertEquals(
            range(0, 1, 1).addAndMaxDistinctValues(range(1, 2, 1)),
            range(0, 2, 1));
}
/**
 * Checks {@code addAndCollapseDistinctValues}: a NaN distinct-values count on either side
 * is expected to give NaN, merging with the empty range is expected to leave the other
 * range unchanged, and overlapping ranges are expected to count shared values once.
 */
@Test
public void testAddAndCollapseDistinctValues() {
    // NaN distinct-values count on at least one side
    assertEquals(
            unboundedRange(NaN).addAndCollapseDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(NaN).addAndCollapseDistinctValues(unboundedRange(1)),
            unboundedRange(NaN));
    assertEquals(
            unboundedRange(1).addAndCollapseDistinctValues(unboundedRange(NaN)),
            unboundedRange(NaN));

    // Known counts on unbounded ranges
    assertEquals(
            unboundedRange(1).addAndCollapseDistinctValues(unboundedRange(2)),
            unboundedRange(2));

    // Empty ranges
    assertEquals(
            StatisticRange.empty().addAndCollapseDistinctValues(StatisticRange.empty()),
            StatisticRange.empty());
    assertEquals(
            range(0, 1, 1).addAndCollapseDistinctValues(StatisticRange.empty()),
            range(0, 1, 1));

    // Finite ranges sharing a single point
    assertEquals(
            range(0, 1, 1).addAndCollapseDistinctValues(range(1, 2, 1)),
            range(0, 2, 1));

    // Finite ranges with partial overlap
    assertEquals(
            range(0, 3, 3).addAndCollapseDistinctValues(range(2, 6, 4)),
            range(0, 6, 6));
}
/**
 * Asserts, via {@code assertEstimateEquals}, that {@code a.overlapPercentWith(b)} matches
 * {@code expected}.
 *
 * @param a the range whose covered fraction is measured
 * @param b the range it is measured against
 * @param expected the expected overlap value
 */
private static void assertOverlap(StatisticRange a, StatisticRange b, double expected) {
    double actualOverlap = a.overlapPercentWith(b);
    assertEstimateEquals(actualOverlap, expected, "overlapPercentWith");
}
}
/**
 * Checks that intersecting [0, 10] (10 distinct values) with [5, 15] (60 distinct values)
 * is expected to give [5, 10] with 10 distinct values.
 */
@Test
public void testIntersect() {
    StatisticRange zeroToTen = range(0, 10, 10);
    StatisticRange fiveToFifteen = range(5, 15, 60);

    StatisticRange intersection = zeroToTen.intersect(fiveToFifteen);

    assertEquals(intersection, range(5, 10, 10));
}
private static SymbolStatsEstimate addColumnStats(SymbolStatsEstimate leftStats, double leftRows, SymbolStatsEstimate rightStats, double rightRows, double newRowCount, RangeAdditionStrategy strategy) { checkArgument(newRowCount > 0, "newRowCount must be greater than zero"); StatisticRange leftRange = StatisticRange.from(leftStats); StatisticRange rightRange = StatisticRange.from(rightStats); StatisticRange sum = strategy.add(leftRange, rightRange); double nullsCountRight = rightStats.getNullsFraction() * rightRows; double nullsCountLeft = leftStats.getNullsFraction() * leftRows; double totalSizeLeft = (leftRows - nullsCountLeft) * leftStats.getAverageRowSize(); double totalSizeRight = (rightRows - nullsCountRight) * rightStats.getAverageRowSize(); double newNullsFraction = (nullsCountLeft + nullsCountRight) / newRowCount; double newNonNullsRowCount = newRowCount * (1.0 - newNullsFraction); // FIXME, weights to average. left and right should be equal in most cases anyway double newAverageRowSize = newNonNullsRowCount == 0 ? 0 : ((totalSizeLeft + totalSizeRight) / newNonNullsRowCount); return SymbolStatsEstimate.builder() .setStatisticsRange(sum) .setAverageRowSize(newAverageRowSize) .setNullsFraction(newNullsFraction) .build(); } }