/**
 * Runs {@code notNull} predicates against the {@code all_nulls}, {@code some_nulls} and
 * {@code no_nulls} columns. Only the all-null column is expected to be skipped.
 */
@Test
public void testAllNulls() {
  ParquetMetricsRowGroupFilter allNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, notNull("all_nulls"));
  boolean readsAllNulls = allNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should skip: no non-null value in all null column", readsAllNulls);

  ParquetMetricsRowGroupFilter someNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, notNull("some_nulls"));
  boolean readsSomeNulls = someNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue(
      "Should read: column with some nulls contains a non-null value", readsSomeNulls);

  ParquetMetricsRowGroupFilter noNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, notNull("no_nulls"));
  boolean readsNoNulls = noNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: non-null column contains a non-null value", readsNoNulls);
}
/**
 * Tests whether a row group may contain records that match the expression.
 *
 * <p>The decision is delegated to {@code visitor().eval(fileSchema, rowGroup)}.
 *
 * @param fileSchema schema for the Parquet file
 * @param rowGroup metadata for a row group
 * @return false if the row group cannot contain rows that match the expression, true otherwise.
 */
public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) {
  final boolean mayContainMatches = visitor().eval(fileSchema, rowGroup);
  return mayContainMatches;
}
/**
 * Checks both null predicates on the {@code required} column: {@code notNull} is expected to
 * read the row group and {@code isNull} is expected to skip it.
 */
@Test
public void testRequiredColumn() {
  ParquetMetricsRowGroupFilter notNullFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, notNull("required"));
  boolean readsForNotNull = notNullFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: required columns are always non-null", readsForNotNull);

  ParquetMetricsRowGroupFilter isNullFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, isNull("required"));
  boolean readsForIsNull = isNullFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should skip: required columns are always non-null", readsForIsNull);
}
/**
 * Runs {@code isNull} predicates against the {@code all_nulls}, {@code some_nulls} and
 * {@code no_nulls} columns. Only the column without nulls is expected to be skipped.
 */
@Test
public void testNoNulls() {
  ParquetMetricsRowGroupFilter allNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, isNull("all_nulls"));
  boolean readsAllNulls = allNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: at least one null value in all null column", readsAllNulls);

  ParquetMetricsRowGroupFilter someNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, isNull("some_nulls"));
  boolean readsSomeNulls = someNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: column with some nulls contains a null value", readsSomeNulls);

  ParquetMetricsRowGroupFilter noNullsFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, isNull("no_nulls"));
  boolean readsNoNulls = noNullsFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should skip: non-null column contains no null values", readsNoNulls);
}
/**
 * Checks equality predicates on {@code column}: {@code readValue} is expected to read the row
 * group and {@code skipValue} is expected to skip it.
 */
@Test
public void testEq() {
  ParquetMetricsRowGroupFilter presentValueFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, equal(column, readValue));
  boolean readsForPresentValue =
      presentValueFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: value is in the row group: " + readValue, readsForPresentValue);

  ParquetMetricsRowGroupFilter absentValueFilter =
      new ParquetMetricsRowGroupFilter(SCHEMA, equal(column, skipValue));
  boolean readsForAbsentValue =
      absentValueFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse(
      "Should skip: value is not in the row group: " + skipValue, readsForAbsentValue);
}
}
/**
 * Checks {@code lessThanOrEqual} on {@code id} with literals 5, 29, 30 and 79. The first two
 * are expected to skip the row group and the last two to read it.
 */
@Test
public void testIntegerLtEq() {
  boolean readsAtFive = new ParquetMetricsRowGroupFilter(SCHEMA, lessThanOrEqual("id", 5))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", readsAtFive);

  boolean readsJustBelowBound =
      new ParquetMetricsRowGroupFilter(SCHEMA, lessThanOrEqual("id", 29))
          .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse(
      "Should not read: id range below lower bound (29 < 30)", readsJustBelowBound);

  boolean readsAtLowerBound =
      new ParquetMetricsRowGroupFilter(SCHEMA, lessThanOrEqual("id", 30))
          .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: one possible id", readsAtLowerBound);

  boolean readsAtUpperBound =
      new ParquetMetricsRowGroupFilter(SCHEMA, lessThanOrEqual("id", 79))
          .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: many possible ids", readsAtUpperBound);
}
/**
 * Checks {@code lessThan} on {@code id} with literals 5, 30, 31 and 79. The first two are
 * expected to skip the row group and the last two to read it.
 */
@Test
public void testIntegerLt() {
  boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, lessThan("id", 5))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead);

  // strict comparison: a literal equal to the quoted lower bound still cannot match
  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, lessThan("id", 30))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse(
      "Should not read: id range below lower bound (30 is not < 30)", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, lessThan("id", 31))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, lessThan("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks {@code greaterThan} on {@code id} with literals 85, 79, 78 and 75. The first two are
 * expected to skip the row group and the last two to read it.
 */
@Test
public void testIntegerGt() {
  boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThan("id", 85))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  // strict comparison: a literal equal to the quoted upper bound still cannot match
  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThan("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse(
      "Should not read: id range above upper bound (79 is not > 79)", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThan("id", 78))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThan("id", 75))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks {@code greaterThanOrEqual} on {@code id} with literals 85, 80, 79 and 75. The first
 * two are expected to skip the row group and the last two to read it.
 */
@Test
public void testIntegerGtEq() {
  boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 85))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 80))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead);

  // inclusive comparison: a literal equal to the quoted upper bound can match
  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 75))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Expects a {@code ValidationException} whose message contains "Cannot find field 'missing'"
 * when the expression refers to the column {@code missing}.
 */
@Test
public void testMissingColumn() {
  String failureDescription = "Should complain about missing column in expression";
  String expectedMessagePart = "Cannot find field 'missing'";

  // the filter is built inside the lambda so that construction is covered by the assertion too
  TestHelpers.assertThrows(
      failureDescription,
      ValidationException.class,
      expectedMessagePart,
      () -> new ParquetMetricsRowGroupFilter(SCHEMA, lessThan("missing", 5))
          .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA));
}
/**
 * Checks {@code equal} on {@code id} with literals 5, 29, 30, 75, 79, 80 and 85. Only 30, 75
 * and 79 are expected to read the row group.
 */
@Test
public void testIntegerEq() {
  for (int belowRange : new int[] {5, 29}) {
    boolean readsBelow = new ParquetMetricsRowGroupFilter(SCHEMA, equal("id", belowRange))
        .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertFalse("Should not read: id below lower bound", readsBelow);
  }

  boolean readsAtLowerBound = new ParquetMetricsRowGroupFilter(SCHEMA, equal("id", 30))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: id equal to lower bound", readsAtLowerBound);

  boolean readsInsideRange = new ParquetMetricsRowGroupFilter(SCHEMA, equal("id", 75))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: id between lower and upper bounds", readsInsideRange);

  boolean readsAtUpperBound = new ParquetMetricsRowGroupFilter(SCHEMA, equal("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
  Assert.assertTrue("Should read: id equal to upper bound", readsAtUpperBound);

  for (int aboveRange : new int[] {80, 85}) {
    boolean readsAbove = new ParquetMetricsRowGroupFilter(SCHEMA, equal("id", aboveRange))
        .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertFalse("Should not read: id above upper bound", readsAbove);
  }
}
@Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(lessThan("id", 5))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: not(false)", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(greaterThan("id", 5))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertFalse("Should skip: not(true)", shouldRead); }
/**
 * Checks {@code notEqual} on {@code id} with literals 5, 29, 30, 75, 79, 80 and 85. Every
 * case is expected to read the row group.
 */
@Test
public void testIntegerNotEq() {
  // literal and failure message for each case, kept in the order the cases are evaluated
  int[] literals = {5, 29, 30, 75, 79, 80, 85};
  String[] failureMessages = {
      "Should read: id below lower bound",
      "Should read: id below lower bound",
      "Should read: id equal to lower bound",
      "Should read: id between lower and upper bounds",
      "Should read: id equal to upper bound",
      "Should read: id above upper bound",
      "Should read: id above upper bound"
  };

  for (int pos = 0; pos < literals.length; pos += 1) {
    boolean readsRowGroup =
        new ParquetMetricsRowGroupFilter(SCHEMA, notEqual("id", literals[pos]))
            .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertTrue(failureMessages[pos], readsRowGroup);
  }
}
@Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 80))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertFalse("Should skip: or(false, false)", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 60))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: or(false, true)", shouldRead); }
@Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, and(lessThan("id", 5), greaterThanOrEqual("id", 0))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertFalse("Should skip: and(false, false)", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, and(greaterThan("id", 5), lessThanOrEqual("id", 30))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: and(true, true)", shouldRead); }
/**
 * Applies {@code not(equal("id", x))} for x in 5, 29, 30, 75, 79, 80 and 85 and expects the row
 * group to be read in every case.
 *
 * NOTE(review): the closing brace of this method is not visible in this excerpt.
 */
@Test public void testIntegerNotEqRewritten() { boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 5))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id below lower bound", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 29))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id below lower bound", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 30))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 75))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 79))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 80))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id above upper bound", shouldRead); shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, not(equal("id", 85))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA); Assert.assertTrue("Should read: id above upper bound", shouldRead);
// Row-group pruning: when a filter is set, a metrics filter and a dictionary filter are built
// from it; row group i is read only if there is no filter or both filters report that it should
// be read, and shouldSkip[i] records the negation of that decision.
// NOTE(review): this excerpt is cut from a larger method; the declaration of statsFilter, the
// loop over i and the closing brace of the if block are not visible here.
ParquetDictionaryRowGroupFilter dictFilter = null; if (filter != null) { statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter); dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter); BlockMetaData rowGroup = rowGroups.get(i); boolean shouldRead = filter == null || ( statsFilter.shouldRead(typeWithIds, rowGroup) && dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup))); this.shouldSkip[i] = !shouldRead;
/**
 * Checks predicates on {@code not_in_file}. Comparisons against a value and {@code notNull}
 * are expected to skip the row group; {@code isNull} and {@code notEqual} are expected to
 * read it.
 */
@Test
public void testColumnNotInFile() {
  Expression[] expectedToSkip = {
      lessThan("not_in_file", 1.0f),
      lessThanOrEqual("not_in_file", 1.0f),
      equal("not_in_file", 1.0f),
      greaterThan("not_in_file", 1.0f),
      greaterThanOrEqual("not_in_file", 1.0f),
      notNull("not_in_file")
  };

  for (int pos = 0; pos < expectedToSkip.length; pos += 1) {
    Expression predicate = expectedToSkip[pos];
    ParquetMetricsRowGroupFilter rowGroupFilter =
        new ParquetMetricsRowGroupFilter(SCHEMA, predicate);
    boolean readsRowGroup = rowGroupFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertFalse(
        "Should skip when column is not in file (all nulls): " + predicate, readsRowGroup);
  }

  Expression[] expectedToRead = {
      isNull("not_in_file"),
      notEqual("not_in_file", 1.0f)
  };

  for (int pos = 0; pos < expectedToRead.length; pos += 1) {
    Expression predicate = expectedToRead[pos];
    ParquetMetricsRowGroupFilter rowGroupFilter =
        new ParquetMetricsRowGroupFilter(SCHEMA, predicate);
    boolean readsRowGroup = rowGroupFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertTrue(
        "Should read when column is not in file (all nulls): " + predicate, readsRowGroup);
  }
}
/**
 * Checks eight predicates on the {@code no_stats} column. Every one of them is expected to
 * read the row group.
 */
@Test
public void testMissingStats() {
  Expression[] predicates = {
      lessThan("no_stats", "a"),
      lessThanOrEqual("no_stats", "b"),
      equal("no_stats", "c"),
      greaterThan("no_stats", "d"),
      greaterThanOrEqual("no_stats", "e"),
      notEqual("no_stats", "f"),
      isNull("no_stats"),
      notNull("no_stats")
  };

  for (int pos = 0; pos < predicates.length; pos += 1) {
    Expression predicate = predicates[pos];
    ParquetMetricsRowGroupFilter rowGroupFilter =
        new ParquetMetricsRowGroupFilter(SCHEMA, predicate);
    boolean readsRowGroup = rowGroupFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA);
    Assert.assertTrue("Should read when missing stats for expr: " + predicate, readsRowGroup);
  }
}
/**
 * Evaluates eight predicates against a {@code BlockMetaData} whose row count is set to 0.
 * Every one of them is expected to skip the row group.
 */
@Test
public void testZeroRecordFile() {
  BlockMetaData zeroRowBlock = new BlockMetaData();
  zeroRowBlock.setRowCount(0);

  Expression[] predicates = {
      lessThan("id", 5),
      lessThanOrEqual("id", 30),
      equal("id", 70),
      greaterThan("id", 78),
      greaterThanOrEqual("id", 90),
      notEqual("id", 101),
      isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (int pos = 0; pos < predicates.length; pos += 1) {
    Expression predicate = predicates[pos];
    ParquetMetricsRowGroupFilter rowGroupFilter =
        new ParquetMetricsRowGroupFilter(SCHEMA, predicate);
    boolean readsRowGroup = rowGroupFilter.shouldRead(PARQUET_SCHEMA, zeroRowBlock);
    Assert.assertFalse("Should never read 0-record file: " + predicate, readsRowGroup);
  }
}