/**
 * Checks pruning for {@code id >= literal} around the upper bound of the file's {@code id}
 * column (79, per the assertion messages): a literal above the bound must skip the file,
 * a literal at or below the bound must read it.
 */
@Test
public void testIntegerGtEq() {
  boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", 85)).eval(FILE);
  // the literal is above the upper bound, so the comparison in the message is 85 > 79
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", 80)).eval(FILE);
  Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", 79)).eval(FILE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", 75)).eval(FILE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Test whether the file may contain records that match the expression.
 * <p>
 * The decision is delegated to the object returned by {@code visitor()}; this method adds no
 * logic of its own.
 *
 * @param file a data file
 * @return false if the file cannot contain rows that match the expression, true otherwise.
 */
public boolean eval(DataFile file) {
  // TODO: detect the case where a column is missing from the file using file's max field id.
  return visitor().eval(file);
}
/**
 * Returns an iterator over copies of the data files produced by the reader, dropping files
 * that the configured filters rule out.
 * <p>
 * A filter counts as "set" when it is neither {@code null} nor the
 * {@code Expressions.alwaysTrue()} instance. Each set filter is applied on its own: the
 * partition filter is checked against the file's partition tuple and the row filter against
 * the file's metrics. Previously both filters had to be set for any filtering to happen, so a
 * row filter paired with an always-true partition filter pruned nothing.
 *
 * @return an iterator of copied {@link DataFile}s that passed every set filter
 */
@Override
public Iterator<DataFile> iterator() {
  boolean hasRowFilter = rowFilter != null && rowFilter != Expressions.alwaysTrue();
  boolean hasPartFilter = partFilter != null && partFilter != Expressions.alwaysTrue();

  if (!hasRowFilter && !hasPartFilter) {
    // nothing to evaluate: return every file from the reader
    return Iterators.transform(reader.iterator(partFilter, columns), DataFile::copy);
  }

  // Only build the evaluator whose filter is set, so a null filter is never bound.
  // NOTE(review): assumes evaluator() depends only on partFilter -- confirm.
  Evaluator evaluator = hasPartFilter ? evaluator() : null;
  InclusiveMetricsEvaluator metricsEvaluator = hasRowFilter ? metricsEvaluator() : null;

  return Iterators.transform(
      Iterators.filter(reader.iterator(partFilter, columns),
          input -> input != null &&
              (evaluator == null || evaluator.eval(input.partition())) &&
              (metricsEvaluator == null || metricsEvaluator.eval(input))),
      DataFile::copy);
}
/**
 * Returns the metrics evaluator for {@code rowFilter}, creating it on first use and caching
 * it in {@code lazyMetricsEvaluator} for later calls.
 *
 * @return the cached {@link InclusiveMetricsEvaluator}
 */
private InclusiveMetricsEvaluator metricsEvaluator() {
  if (lazyMetricsEvaluator != null) {
    return lazyMetricsEvaluator;
  }

  // first call: bind the row filter to the schema of the reader's spec
  this.lazyMetricsEvaluator = new InclusiveMetricsEvaluator(reader.spec().schema(), rowFilter);
  return lazyMetricsEvaluator;
}
}
/**
 * Returns all manifest entries from the reader, dropping entries whose file is ruled out by
 * the configured filters. Entry status is not considered.
 * <p>
 * A filter counts as "set" when it is neither {@code null} nor the
 * {@code Expressions.alwaysTrue()} instance. Each set filter is applied on its own: the
 * partition filter against the file's partition tuple, the row filter against the file's
 * metrics. Previously both filters had to be set for any filtering to happen.
 *
 * @return the entries that passed every set filter
 */
Iterable<ManifestEntry> allEntries() {
  boolean hasRowFilter = rowFilter != null && rowFilter != Expressions.alwaysTrue();
  boolean hasPartFilter = partFilter != null && partFilter != Expressions.alwaysTrue();

  if (!hasRowFilter && !hasPartFilter) {
    // nothing to evaluate: return every entry from the reader
    return reader.entries(columns);
  }

  // Only build the evaluator whose filter is set, so a null filter is never bound.
  // NOTE(review): assumes evaluator() depends only on partFilter -- confirm.
  Evaluator evaluator = hasPartFilter ? evaluator() : null;
  InclusiveMetricsEvaluator metricsEvaluator = hasRowFilter ? metricsEvaluator() : null;

  return Iterables.filter(reader.entries(columns),
      entry -> entry != null &&
          (evaluator == null || evaluator.eval(entry.file().partition())) &&
          (metricsEvaluator == null || metricsEvaluator.eval(entry.file())));
}
/**
 * Checks pruning for {@code id <= literal} around the lower bound of the file's {@code id}
 * column (30, per the assertion messages).
 */
@Test
public void testIntegerLtEq() {
  // literals strictly below the lower bound cannot match any row
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)",
      new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", 5)).eval(FILE));
  Assert.assertFalse("Should not read: id range below lower bound (29 < 30)",
      new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", 29)).eval(FILE));

  // literals at or above the lower bound may match
  Assert.assertTrue("Should read: one possible id",
      new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", 30)).eval(FILE));
  Assert.assertTrue("Should read: many possible ids",
      new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", 79)).eval(FILE));
}
/**
 * Returns the manifest entries from the reader whose status is not {@code Status.DELETED},
 * dropping entries whose file is ruled out by the configured filters.
 * <p>
 * A filter counts as "set" when it is neither {@code null} nor the
 * {@code Expressions.alwaysTrue()} instance. Each set filter is applied on its own: the
 * partition filter against the file's partition tuple, the row filter against the file's
 * metrics. Previously both filters had to be set for any filtering beyond the status check
 * to happen.
 *
 * @return the non-deleted entries that passed every set filter
 */
Iterable<ManifestEntry> liveEntries() {
  boolean hasRowFilter = rowFilter != null && rowFilter != Expressions.alwaysTrue();
  boolean hasPartFilter = partFilter != null && partFilter != Expressions.alwaysTrue();

  if (!hasRowFilter && !hasPartFilter) {
    // no expression filters: only drop null and deleted entries
    return Iterables.filter(reader.entries(columns),
        entry -> entry != null && entry.status() != Status.DELETED);
  }

  // Only build the evaluator whose filter is set, so a null filter is never bound.
  // NOTE(review): assumes evaluator() depends only on partFilter -- confirm.
  Evaluator evaluator = hasPartFilter ? evaluator() : null;
  InclusiveMetricsEvaluator metricsEvaluator = hasRowFilter ? metricsEvaluator() : null;

  return Iterables.filter(reader.entries(columns),
      entry -> entry != null &&
          entry.status() != Status.DELETED &&
          (evaluator == null || evaluator.eval(entry.file().partition())) &&
          (metricsEvaluator == null || metricsEvaluator.eval(entry.file())));
}
/**
 * Checks {@code notNull} pruning against the all-null, some-null and no-null columns: only
 * the all-null column allows the file to be skipped.
 */
@Test
public void testAllNulls() {
  Assert.assertFalse("Should skip: no non-null value in all null column",
      new InclusiveMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE));

  Assert.assertTrue("Should read: column with some nulls contains a non-null value",
      new InclusiveMetricsEvaluator(SCHEMA, notNull("some_nulls")).eval(FILE));

  Assert.assertTrue("Should read: non-null column contains a non-null value",
      new InclusiveMetricsEvaluator(SCHEMA, notNull("no_nulls")).eval(FILE));
}
/**
 * Checks pruning for {@code id < literal} around the lower bound of the file's {@code id}
 * column (30, per the assertion messages): a literal at or below the bound must skip the
 * file, a literal above the bound must read it.
 */
@Test
public void testIntegerLt() {
  boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", 5)).eval(FILE);
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", 30)).eval(FILE);
  Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", 31)).eval(FILE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", 79)).eval(FILE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks pruning for {@code id > literal} around the upper bound of the file's {@code id}
 * column (79, per the assertion messages): a literal at or above the bound must skip the
 * file, a literal below the bound must read it.
 */
@Test
public void testIntegerGt() {
  boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", 85)).eval(FILE);
  // the literal is above the upper bound, so the comparison in the message is 85 > 79
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", 79)).eval(FILE);
  Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", 78)).eval(FILE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", 75)).eval(FILE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks {@code isNull} pruning against the all-null, some-null and no-null columns: only
 * the no-null column allows the file to be skipped.
 */
@Test
public void testNoNulls() {
  Assert.assertTrue("Should read: at least one null value in all null column",
      new InclusiveMetricsEvaluator(SCHEMA, isNull("all_nulls")).eval(FILE));

  Assert.assertTrue("Should read: column with some nulls contains a null value",
      new InclusiveMetricsEvaluator(SCHEMA, isNull("some_nulls")).eval(FILE));

  Assert.assertFalse("Should skip: non-null column contains no null values",
      new InclusiveMetricsEvaluator(SCHEMA, isNull("no_nulls")).eval(FILE));
}
/**
 * Checks null predicates on the {@code required} column: {@code notNull} must read the file
 * and {@code isNull} must skip it.
 */
@Test
public void testRequiredColumn() {
  Assert.assertTrue("Should read: required columns are always non-null",
      new InclusiveMetricsEvaluator(SCHEMA, notNull("required")).eval(FILE));

  Assert.assertFalse("Should skip: required columns are always non-null",
      new InclusiveMetricsEvaluator(SCHEMA, isNull("required")).eval(FILE));
}
/**
 * Checks pruning for {@code id == literal}: literals outside the file's {@code id} bounds
 * must skip the file, literals on or between the bounds must read it. Literals are evaluated
 * in ascending order: 5, 29, 30, 75, 79, 80, 85.
 */
@Test
public void testIntegerEq() {
  for (int belowLower : new int[] {5, 29}) {
    Assert.assertFalse("Should not read: id below lower bound",
        new InclusiveMetricsEvaluator(SCHEMA, equal("id", belowLower)).eval(FILE));
  }

  Assert.assertTrue("Should read: id equal to lower bound",
      new InclusiveMetricsEvaluator(SCHEMA, equal("id", 30)).eval(FILE));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      new InclusiveMetricsEvaluator(SCHEMA, equal("id", 75)).eval(FILE));
  Assert.assertTrue("Should read: id equal to upper bound",
      new InclusiveMetricsEvaluator(SCHEMA, equal("id", 79)).eval(FILE));

  for (int aboveUpper : new int[] {80, 85}) {
    Assert.assertFalse("Should not read: id above upper bound",
        new InclusiveMetricsEvaluator(SCHEMA, equal("id", aboveUpper)).eval(FILE));
  }
}
/**
 * Checks that {@code id != literal} never allows the file to be skipped, whether the literal
 * lies below, on, between or above the file's {@code id} bounds. Literals are evaluated in
 * ascending order: 5, 29, 30, 75, 79, 80, 85.
 */
@Test
public void testIntegerNotEq() {
  for (int belowLower : new int[] {5, 29}) {
    Assert.assertTrue("Should read: id below lower bound",
        new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", belowLower)).eval(FILE));
  }

  Assert.assertTrue("Should read: id equal to lower bound",
      new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", 30)).eval(FILE));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", 75)).eval(FILE));
  Assert.assertTrue("Should read: id equal to upper bound",
      new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", 79)).eval(FILE));

  for (int aboveUpper : new int[] {80, 85}) {
    Assert.assertTrue("Should read: id above upper bound",
        new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", aboveUpper)).eval(FILE));
  }
}
/**
 * Checks that a predicate on a column named {@code missing} is rejected: building and
 * evaluating the evaluator is expected to raise a {@code ValidationException}.
 */
@Test
public void testMissingColumn() {
  // NOTE(review): presumably the third argument is text the exception message must contain
  // -- confirm against TestHelpers.assertThrows.
  TestHelpers.assertThrows("Should complain about missing column in expression",
      ValidationException.class, "Cannot find field 'missing'",
      () -> new InclusiveMetricsEvaluator(SCHEMA, lessThan("missing", 5)).eval(FILE));
}
@Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(lessThan("id", 5))).eval(FILE); Assert.assertTrue("Should read: not(false)", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(greaterThan("id", 5))).eval(FILE); Assert.assertFalse("Should skip: not(true)", shouldRead); }
@Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 80))).eval(FILE); Assert.assertFalse("Should skip: or(false, false)", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 60))).eval(FILE); Assert.assertTrue("Should read: or(false, true)", shouldRead); }
/**
 * Same expectations as the {@code notEqual} test, but with the predicate written as
 * {@code not(equal(...))}: the file must be read for every literal. Literals are evaluated
 * in ascending order: 5, 29, 30, 75, 79, 80, 85.
 */
@Test
public void testIntegerNotEqRewritten() {
  for (int belowLower : new int[] {5, 29}) {
    Assert.assertTrue("Should read: id below lower bound",
        new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", belowLower))).eval(FILE));
  }

  Assert.assertTrue("Should read: id equal to lower bound",
      new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", 30))).eval(FILE));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", 75))).eval(FILE));
  Assert.assertTrue("Should read: id equal to upper bound",
      new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", 79))).eval(FILE));

  for (int aboveUpper : new int[] {80, 85}) {
    Assert.assertTrue("Should read: id above upper bound",
        new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", aboveUpper))).eval(FILE));
  }
}
}
@Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, and(lessThan("id", 5), greaterThanOrEqual("id", 0))).eval(FILE); Assert.assertFalse("Should skip: and(false, false)", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, and(greaterThan("id", 5), lessThanOrEqual("id", 30))).eval(FILE); Assert.assertTrue("Should read: and(true, true)", shouldRead); }
/**
 * Checks that a data file created with a record count of 0 is never selected for reading,
 * whatever predicate is evaluated against it.
 */
@Test
public void testZeroRecordFile() {
  DataFile zeroRecords = new TestDataFile("file.parquet", Row.of(), 0);

  Expression[] predicates = new Expression[] {
      lessThan("id", 5),
      lessThanOrEqual("id", 30),
      equal("id", 70),
      greaterThan("id", 78),
      greaterThanOrEqual("id", 90),
      notEqual("id", 101),
      isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (int i = 0; i < predicates.length; i += 1) {
    Expression predicate = predicates[i];
    boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, predicate).eval(zeroRecords);
    Assert.assertFalse("Should never read 0-record file: " + predicate, shouldRead);
  }
}