@Test public void testFilterByNonProjectedColumn() { { Schema actualProjection = SCHEMA.select("id", "data"); List<Record> expected = Lists.newArrayList(); for (Record rec : expected(5, 6 ,7, 8, 9)) { expected.add(projectFlat(actualProjection, rec)); } assertEqualsSafe(actualProjection.asStruct(), expected, read( unpartitioned.toString(), "cast('2017-12-22 00:00:00+00:00' as timestamp) > ts", "id", "data")); } { // only project id: ts will be projected because of the filter, but data will not be included Schema actualProjection = SCHEMA.select("id"); List<Record> expected = Lists.newArrayList(); for (Record rec : expected(1, 2)) { expected.add(projectFlat(actualProjection, rec)); } assertEqualsSafe(actualProjection.asStruct(), expected, read( unpartitioned.toString(), "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "cast('2017-12-22 08:00:00+00:00' as timestamp) > ts", "id")); } }
/**
 * Pushes a {@code ts < 2017-12-22T00:00:00+00:00} filter into a reader over the
 * unpartitioned table, asserts that a single read task is planned, and then
 * verifies the rows returned by the equivalent SQL filter.
 */
@Test
public void testUnpartitionedTimestampFilter() {
  String tablePath = unpartitioned.toString();
  DataSourceOptions readOptions = new DataSourceOptions(ImmutableMap.of("path", tablePath));

  DataSourceReader filteredReader = new IcebergSource().createReader(readOptions);
  pushFilters(filteredReader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> plannedTasks = planTasks(filteredReader);
  int taskCount = plannedTasks.size();
  Assert.assertEquals("Should only create one task for a small file", 1, taskCount);

  // Same predicate expressed as SQL; checks the rows that actually come back.
  String sqlFilter = "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)";
  assertEqualsSafe(SCHEMA.asStruct(), expected(5,6,7,8,9), read(tablePath, sqlFilter));
}
@Test public void testUnpartitionedIDFilters() { DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( "path", unpartitioned.toString()) ); IcebergSource source = new IcebergSource(); for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); pushFilters(reader, Expressions.equal("id", i)); List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader); Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); // validate row filtering assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i)); } }
@Test public void testBucketPartitionedIDFilters() { File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( "path", location.toString()) ); IcebergSource source = new IcebergSource(); DataSourceReader unfiltered = source.createReader(options); Assert.assertEquals("Unfiltered table should created 4 read tasks", 4, planTasks(unfiltered).size()); for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); pushFilters(reader, Expressions.equal("id", i)); List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader); // validate predicate push-down Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size()); // validate row filtering assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i)); } }
/**
 * Builds a table partitioned with {@code PARTITION_BY_HOUR} and checks filter
 * push-down on {@code ts}: an unfiltered reader plans 9 tasks, a "before
 * 2017-12-22" filter plans 4 tasks, and an open (06:00, 08:00) range on
 * 2017-12-22 plans 2 tasks. Each case also verifies the rows returned by the
 * equivalent SQL filter.
 */
@Test
public void testHourPartitionedTimestampFilters() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();

  // Baseline: with no filter pushed, all hour partitions are planned.
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should have created 9 read tasks",
      9, planTasks(unfiltered).size());

  {
    // Everything strictly before midnight on 2017-12-22.
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.size());

    // NOTE(review): the expected order (8, 9, 7, 6, 5) presumably follows the
    // order in which the hour partitions are read; confirm before reordering.
    assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5),
        read(location.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
  }

  {
    // Open range (06:00, 08:00) on 2017-12-22, expressed as an AND of two bounds.
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.and(
        Expressions.greaterThan("ts", "2017-12-22T06:00:00+00:00"),
        Expressions.lessThan("ts", "2017-12-22T08:00:00+00:00")));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.size());

    assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1),
        read(location.toString(), "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " +
            "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
  }
}
/**
 * Builds a table partitioned with {@code PARTITION_BY_FIRST_LETTER} and checks
 * that an equality filter on {@code data} plans a single task, both when pushed
 * via {@code Expressions.equal} and when pushed via a {@code col(...)} expression.
 * Finally verifies the rows returned by the equivalent SQL filter.
 */
@Test
public void testTrunctateDataPartitionedFilters() {
  File tableDir = buildPartitionedTable("trunc", PARTITION_BY_FIRST_LETTER, "trunc1", "data");
  DataSourceOptions readOptions = new DataSourceOptions(
      ImmutableMap.of("path", tableDir.toString()));
  IcebergSource icebergSource = new IcebergSource();

  // Baseline: no filter pushed.
  DataSourceReader unfilteredReader = icebergSource.createReader(readOptions);
  int unfilteredTaskCount = planTasks(unfilteredReader).size();
  Assert.assertEquals("Unfiltered table should have created 9 read tasks",
      9, unfilteredTaskCount);

  // Filter built with Expressions.equal.
  DataSourceReader equalFilterReader = icebergSource.createReader(readOptions);
  pushFilters(equalFilterReader, Expressions.equal("data", "goldfish"));
  List<DataReaderFactory<UnsafeRow>> equalFilterTasks = planTasks(equalFilterReader);
  Assert.assertEquals("Should create 1 task for 'goldfish' (g)",
      1, equalFilterTasks.size());

  // Same predicate built from a column expression (=== on col("data")).
  DataSourceReader columnExprReader = icebergSource.createReader(readOptions);
  pushFilters(columnExprReader, col("data").$eq$eq$eq("goldfish").expr());
  List<DataReaderFactory<UnsafeRow>> columnExprTasks = planTasks(columnExprReader);
  Assert.assertEquals("Should create 1 task for 'goldfish' (g)",
      1, columnExprTasks.size());

  // Row-level check through the SQL read path.
  assertEqualsSafe(SCHEMA.asStruct(), expected(9),
      read(tableDir.toString(), "data = 'goldfish'"));
}
Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size()); assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(location.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size()); assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(location.toString(), "cast(ts as date) = date '2017-12-21'")); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size()); assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(location.toString(), "to_date(ts) = date '2017-12-21'")); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.size()); assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(location.toString(), "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));