/**
 * Converts each Iceberg filter with {@code SparkExpressions.convert} against {@code SCHEMA}
 * and hands the converted array to the {@code pushFilters} overload that accepts
 * {@code Expression} values.
 *
 * @param reader the reader that receives the converted filters
 * @param filters Iceberg expressions to convert; their order is preserved
 */
private void pushFilters(DataSourceReader reader,
                         com.netflix.iceberg.expressions.Expression... filters) {
  Expression[] converted = new Expression[filters.length];
  int next = 0;
  for (com.netflix.iceberg.expressions.Expression filter : filters) {
    converted[next] = SparkExpressions.convert(filter, SCHEMA);
    next += 1;
  }
  pushFilters(reader, converted);
}
/**
 * Pushes a {@code ts < 2017-12-22T00:00:00+00:00} filter to a reader created for the
 * unpartitioned table path, asserts that exactly one task is planned, and compares the rows
 * read with the equivalent SQL predicate against {@code expected(5, 6, 7, 8, 9)}.
 */
@Test
public void testUnpartitionedTimestampFilter() {
  String tablePath = unpartitioned.toString();
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", tablePath));

  DataSourceReader reader = new IcebergSource().createReader(options);
  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> readTasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, readTasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(tablePath, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
@Test public void testUnpartitionedIDFilters() { DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( "path", unpartitioned.toString()) ); IcebergSource source = new IcebergSource(); for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); pushFilters(reader, Expressions.equal("id", i)); List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader); Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); // validate row filtering assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i)); } }
@Test public void testBucketPartitionedIDFilters() { File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( "path", location.toString()) ); IcebergSource source = new IcebergSource(); DataSourceReader unfiltered = source.createReader(options); Assert.assertEquals("Unfiltered table should created 4 read tasks", 4, planTasks(unfiltered).size()); for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); pushFilters(reader, Expressions.equal("id", i)); List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader); // validate predicate push-down Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size()); // validate row filtering assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i)); } }
/**
 * Builds a table with {@code buildPartitionedTable("trunc", PARTITION_BY_FIRST_LETTER,
 * "trunc1", "data")}, asserts that an unfiltered reader plans 9 tasks, and then asserts that
 * a {@code data = "goldfish"} filter plans exactly one task, both when built with
 * {@code Expressions.equal} and when built with {@code col("data").$eq$eq$eq(...)}. Finally
 * checks that reading with {@code data = 'goldfish'} matches {@code expected(9)}.
 */
@Test
public void testTrunctateDataPartitionedFilters() {
  File location = buildPartitionedTable("trunc", PARTITION_BY_FIRST_LETTER, "trunc1", "data");
  String tablePath = location.toString();

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", tablePath));
  IcebergSource source = new IcebergSource();

  int unfilteredTaskCount = planTasks(source.createReader(options)).size();
  Assert.assertEquals("Unfiltered table should have created 9 read tasks",
      9, unfilteredTaskCount);

  // filter built with Expressions.equal
  DataSourceReader icebergFilterReader = source.createReader(options);
  pushFilters(icebergFilterReader, Expressions.equal("data", "goldfish"));
  List<DataReaderFactory<UnsafeRow>> icebergFilterTasks = planTasks(icebergFilterReader);
  Assert.assertEquals("Should create 1 task for 'goldfish' (g)",
      1, icebergFilterTasks.size());

  // same predicate built from a col(...) expression
  DataSourceReader columnFilterReader = source.createReader(options);
  pushFilters(columnFilterReader, col("data").$eq$eq$eq("goldfish").expr());
  List<DataReaderFactory<UnsafeRow>> columnFilterTasks = planTasks(columnFilterReader);
  Assert.assertEquals("Should create 1 task for 'goldfish' (g)",
      1, columnFilterTasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(9), read(tablePath, "data = 'goldfish'"));
}
/**
 * Builds a table with {@code buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR,
 * "ts_hour", "ts")}, asserts that an unfiltered reader plans 9 tasks, and then checks two
 * timestamp filters: {@code ts < 2017-12-22T00:00:00+00:00} (4 tasks expected) and
 * {@code 2017-12-22T06:00:00+00:00 < ts < 2017-12-22T08:00:00+00:00} (2 tasks expected).
 * For each filter, the rows read with the equivalent SQL predicate are compared against the
 * expected records.
 */
@Test
public void testHourPartitionedTimestampFilters() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  // message grammar fixed ("should have created") to match the sibling tests
  Assert.assertEquals("Unfiltered table should have created 9 read tasks",
      9, planTasks(unfiltered).size());

  {
    // upper-bound-only filter
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.size());

    assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5),
        read(location.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
  }

  {
    // filter with both a lower and an upper bound
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.and(
        Expressions.greaterThan("ts", "2017-12-22T06:00:00+00:00"),
        Expressions.lessThan("ts", "2017-12-22T08:00:00+00:00")));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.size());

    assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1),
        read(location.toString(), "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " +
            "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
  }
}
// Four pushFilters calls on `reader`: a "ts" less-than filter, a "ts" cast to date compared
// with lit(day), to_date("ts") compared with lit(day), and a "ts" greater-than/less-than pair.
// NOTE(review): these statements are not inside any method visible here, and `day` is not
// defined in this view — confirm which test(s) they belong to.
pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00")); pushFilters(reader, col("ts").cast(DateType$.MODULE$).$eq$eq$eq(lit(day)).expr()); pushFilters(reader, to_date(col("ts")).$eq$eq$eq(lit(day)).expr()); pushFilters(reader, Expressions.and( Expressions.greaterThan("ts", "2017-12-22T06:00:00+00:00"), Expressions.lessThan("ts", "2017-12-22T08:00:00+00:00")));