/**
 * Checks that a {@code notNull} predicate never lets the dictionary filter skip the row group,
 * for each of the {@code all_nulls}, {@code some_nulls} and {@code no_nulls} columns.
 */
@Test
public void testAllNulls() {
  for (String column : new String[] {"all_nulls", "some_nulls", "no_nulls"}) {
    ParquetDictionaryRowGroupFilter filter =
        new ParquetDictionaryRowGroupFilter(SCHEMA, notNull(column));
    Assert.assertTrue("Should read: dictionary filter doesn't help",
        filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
  }
}
/**
 * Decides whether a row group may hold records matching the expression, judging by its
 * dictionaries. The decision is delegated to the evaluation visitor returned by
 * {@code visitor()}.
 *
 * @param fileSchema schema for the Parquet file
 * @param rowGroup metadata of the row group being tested
 * @param dictionaries a dictionary page read store
 * @return false if the file cannot contain rows that match the expression, true otherwise.
 */
public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup,
                          DictionaryPageReadStore dictionaries) {
  final boolean mayContainMatches = visitor().eval(fileSchema, rowGroup, dictionaries);
  return mayContainMatches;
}
/**
 * Checks that an {@code isNull} predicate never lets the dictionary filter skip the row group,
 * for each of the {@code all_nulls}, {@code some_nulls} and {@code no_nulls} columns.
 */
@Test
public void testNoNulls() {
  for (String column : new String[] {"all_nulls", "some_nulls", "no_nulls"}) {
    ParquetDictionaryRowGroupFilter filter =
        new ParquetDictionaryRowGroupFilter(SCHEMA, isNull(column));
    Assert.assertTrue("Should read: dictionary filter doesn't help",
        filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
  }
}
/**
 * Checks null predicates against the {@code required} column: {@code notNull} must read the
 * row group, while {@code isNull} must skip it.
 */
@Test
public void testRequiredColumn() {
  ParquetDictionaryRowGroupFilter notNullFilter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("required"));
  Assert.assertTrue("Should read: required columns are always non-null",
      notNullFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  ParquetDictionaryRowGroupFilter isNullFilter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("required"));
  Assert.assertFalse("Should skip: required columns are always non-null",
      isNullFilter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
/**
 * Checks {@code greaterThan} on the {@code id} column around the upper bound of 79 named in
 * the assertion messages: literals at or above the bound must skip the row group, literals
 * below it must read it.
 */
@Test
public void testIntegerGt() {
  boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", 85))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  // strict comparison: the bound itself does not satisfy id > 79
  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", 78))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", 75))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks {@code lessThanOrEqual} on the {@code id} column around the lower bound of 30 named
 * in the assertion messages: literals below the bound must skip the row group, literals at or
 * above it must read it.
 */
@Test
public void testIntegerLtEq() {
  ParquetDictionaryRowGroupFilter farBelow =
      new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", 5));
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)",
      farBelow.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  ParquetDictionaryRowGroupFilter justBelow =
      new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", 29));
  Assert.assertFalse("Should not read: id range below lower bound (29 < 30)",
      justBelow.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  ParquetDictionaryRowGroupFilter atLowerBound =
      new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", 30));
  Assert.assertTrue("Should read: one possible id",
      atLowerBound.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  ParquetDictionaryRowGroupFilter atUpperBound =
      new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", 79));
  Assert.assertTrue("Should read: many possible ids",
      atUpperBound.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
/**
 * Checks {@code greaterThanOrEqual} on the {@code id} column around the upper bound of 79
 * named in the assertion messages: literals above the bound must skip the row group, literals
 * at or below it must read it.
 */
@Test
public void testIntegerGtEq() {
  boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 85))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 80))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead);

  // inclusive comparison: the bound itself satisfies id >= 79
  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", 75))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks {@code lessThan} on the {@code id} column around the lower bound of 30 named in the
 * assertion messages: literals at or below the bound must skip the row group, literals above
 * it must read it.
 */
@Test
public void testIntegerLt() {
  boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", 5))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead);

  // strict comparison: the bound itself does not satisfy id < 30
  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", 30))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", 31))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: one possible id", shouldRead);

  shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", 79))
      .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
  Assert.assertTrue("Should read: many possible ids", shouldRead);
}
/**
 * Checks that an equality predicate on the {@code no_stats} column skips the row group; per
 * the assertion message, the decision comes from the dictionary rather than column stats.
 */
@Test
public void testMissingStats() {
  ParquetDictionaryRowGroupFilter filter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, equal("no_stats", "a"));
  Assert.assertFalse("Should skip: stats are missing but dictionary is present",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
/**
 * Expects that building and evaluating a filter for a predicate on the unknown field
 * {@code missing} raises a {@code ValidationException}.
 *
 * <p>Both the filter construction and the {@code shouldRead} call happen inside the lambda,
 * so the exception is accepted from either step. The string "Cannot find field 'missing'" is
 * handed to {@code TestHelpers.assertThrows} — presumably matched against the exception
 * message; confirm against that helper.
 */
@Test public void testMissingColumn() { TestHelpers.assertThrows("Should complain about missing column in expression", ValidationException.class, "Cannot find field 'missing'", () -> new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("missing", 5)) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE)); }
/**
 * Checks that {@code notEqual("some_nulls", "some")} skips the row group; per the assertion
 * message, every value of that column is 'some'.
 */
@Test
public void testStringNotEq() {
  ParquetDictionaryRowGroupFilter filter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some"));
  Assert.assertFalse("Should skip: all values are 'some'",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
@Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(lessThan("id", 5))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertTrue("Should read: not(false)", shouldRead); shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(greaterThan("id", 5))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertFalse("Should skip: not(true)", shouldRead); }
/**
 * Checks that {@code notEqual} on the {@code id} column always reads the row group, for
 * literals below, at, between and above the bounds named in the assertion messages.
 */
@Test
public void testIntegerNotEq() {
  ParquetDictionaryRowGroupFilter filter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 5));
  Assert.assertTrue("Should read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 29));
  Assert.assertTrue("Should read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 30));
  Assert.assertTrue("Should read: id equal to lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 75));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 79));
  Assert.assertTrue("Should read: id equal to upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 80));
  Assert.assertTrue("Should read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", 85));
  Assert.assertTrue("Should read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
/**
 * Checks {@code equal} on the {@code id} column: literals outside the bounds named in the
 * assertion messages must skip the row group, while literals at or between the bounds must
 * read it.
 */
@Test
public void testIntegerEq() {
  ParquetDictionaryRowGroupFilter filter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 5));
  Assert.assertFalse("Should not read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 29));
  Assert.assertFalse("Should not read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 30));
  Assert.assertTrue("Should read: id equal to lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 75));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 79));
  Assert.assertTrue("Should read: id equal to upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 80));
  Assert.assertFalse("Should not read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", 85));
  Assert.assertFalse("Should not read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
@Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 80))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertFalse("Should skip: or(false, false)", shouldRead); shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, or(lessThan("id", 5), greaterThanOrEqual("id", 60))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertTrue("Should read: or(false, true)", shouldRead); }
@Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, and(lessThan("id", 5), greaterThanOrEqual("id", 0))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertFalse("Should skip: and(false, false)", shouldRead); shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, and(greaterThan("id", 5), lessThanOrEqual("id", 30))) .shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); Assert.assertTrue("Should read: and(true, true)", shouldRead); }
/**
 * Checks that {@code not(equal(...))} on the {@code id} column always reads the row group,
 * for literals below, at, between and above the bounds named in the assertion messages.
 */
@Test
public void testIntegerNotEqRewritten() {
  ParquetDictionaryRowGroupFilter filter =
      new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 5)));
  Assert.assertTrue("Should read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 29)));
  Assert.assertTrue("Should read: id below lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 30)));
  Assert.assertTrue("Should read: id equal to lower bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 75)));
  Assert.assertTrue("Should read: id between lower and upper bounds",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 79)));
  Assert.assertTrue("Should read: id equal to upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 80)));
  Assert.assertTrue("Should read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));

  filter = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 85)));
  Assert.assertTrue("Should read: id above upper bound",
      filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE));
}
if (filter != null) { statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter); dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter); boolean shouldRead = filter == null || ( statsFilter.shouldRead(typeWithIds, rowGroup) && dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup))); this.shouldSkip[i] = !shouldRead; if (shouldRead) {
/**
 * Checks that every kind of predicate on the {@code non_dict} column reads the row group;
 * per the assertion message, no dictionary can be found for that column.
 */
@Test
public void testColumnFallbackOrNotDictionaryEncoded() {
  Expression[] predicates = new Expression[] {
      lessThan("non_dict", "a"),
      lessThanOrEqual("non_dict", "a"),
      equal("non_dict", "a"),
      greaterThan("non_dict", "a"),
      greaterThanOrEqual("non_dict", "a"),
      notNull("non_dict"),
      isNull("non_dict"),
      notEqual("non_dict", "a")
  };

  for (Expression predicate : predicates) {
    ParquetDictionaryRowGroupFilter filter =
        new ParquetDictionaryRowGroupFilter(SCHEMA, predicate);
    boolean mustRead = filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
    Assert.assertTrue("Should read: dictionary cannot be found: " + predicate, mustRead);
  }
}
/**
 * Checks that every kind of predicate on the {@code not_in_file} column reads the row group;
 * per the assertion message, no dictionary can be found for that column.
 */
@Test
public void testColumnNotInFile() {
  Expression[] predicates = new Expression[] {
      lessThan("not_in_file", 1.0f),
      lessThanOrEqual("not_in_file", 1.0f),
      equal("not_in_file", 1.0f),
      greaterThan("not_in_file", 1.0f),
      greaterThanOrEqual("not_in_file", 1.0f),
      notNull("not_in_file"),
      isNull("not_in_file"),
      notEqual("not_in_file", 1.0f)
  };

  for (Expression predicate : predicates) {
    ParquetDictionaryRowGroupFilter filter =
        new ParquetDictionaryRowGroupFilter(SCHEMA, predicate);
    boolean mustRead = filter.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
    Assert.assertTrue("Should read: dictionary cannot be found: " + predicate, mustRead);
  }
}