/**
 * Tells whether the min/max of the given integer statistics fall outside the
 * value range of the given type.
 * <p>
 * Only TINYINT, SMALLINT and INTEGER are range-checked (against the Java
 * {@code byte}, {@code short} and {@code int} limits respectively); every
 * other type yields {@code false}.
 *
 * @param type the type whose value range is checked
 * @param parquetIntegerStatistics the statistics providing the min and max
 * @return {@code true} if min or max lies outside the range of {@code type}
 */
public static boolean isStatisticsOverflow(Type type, ParquetIntegerStatistics parquetIntegerStatistics)
{
    long lowest = parquetIntegerStatistics.getMin();
    long highest = parquetIntegerStatistics.getMax();

    if (type.equals(TINYINT) && (lowest < Byte.MIN_VALUE || highest > Byte.MAX_VALUE)) {
        return true;
    }
    if (type.equals(SMALLINT) && (lowest < Short.MIN_VALUE || highest > Short.MAX_VALUE)) {
        return true;
    }
    return type.equals(INTEGER) && (lowest < Integer.MIN_VALUE || highest > Integer.MAX_VALUE);
}
/**
 * Evaluates the predicate against a block in two stages: first against the
 * block's column statistics, and only if that matches, against the
 * dictionaries loaded for the block.
 *
 * @return {@code false} as soon as the statistics check fails; otherwise the
 *         result of the dictionary check
 * @throws ParquetCorruptionException propagated from the statistics check
 */
public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    Map<ColumnDescriptor, Statistics<?>> statisticsByColumn = getStatistics(block, descriptorsByPath);
    boolean statisticsMatch = parquetPredicate.matches(
            block.getRowCount(),
            statisticsByColumn,
            dataSource.getId(),
            failOnCorruptedParquetStatistics);

    if (statisticsMatch) {
        // Dictionaries are only fetched once the statistics check has passed.
        Map<ColumnDescriptor, DictionaryDescriptor> dictionariesByColumn = getDictionaries(block, dataSource, descriptorsByPath, parquetTupleDomain);
        return parquetPredicate.matches(dictionariesByColumn);
    }
    return false;
}
/**
 * Builds a {@link TupleDomainParquetPredicate} over the columns of the
 * requested schema that have an entry in {@code descriptorsByPath}.
 * Schema paths without a descriptor are skipped.
 *
 * @param requestedSchema schema whose column paths are looked up
 * @param parquetTupleDomain the tuple domain handed to the predicate
 * @param descriptorsByPath column descriptors keyed by path
 * @return the predicate over the resolved columns
 */
public static Predicate buildPredicate(MessageType requestedSchema, TupleDomain<ColumnDescriptor> parquetTupleDomain, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableList.Builder<RichColumnDescriptor> resolvedColumns = ImmutableList.builder();
    for (String[] pathParts : requestedSchema.getPaths()) {
        List<String> pathKey = Arrays.asList(pathParts);
        RichColumnDescriptor resolved = descriptorsByPath.get(pathKey);
        if (resolved == null) {
            continue;
        }
        resolvedColumns.add(resolved);
    }
    return new TupleDomainParquetPredicate(parquetTupleDomain, resolvedColumns.build());
}
/**
 * Loads the dictionary page for a column chunk of the block that has a known
 * descriptor and for which both {@code isOnlyDictionaryEncodingPages} and
 * {@code isColumnPredicate} hold.
 * <p>
 * The scan stops after the first such column, so the returned map holds at
 * most one entry.
 * NOTE(review): presumably this caps the extra reads per block - confirm the
 * early exit is intentional.
 */
private static Map<ColumnDescriptor, DictionaryDescriptor> getDictionaries(BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    ImmutableMap.Builder<ColumnDescriptor, DictionaryDescriptor> result = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
        if (descriptor == null) {
            continue;
        }
        if (!isOnlyDictionaryEncodingPages(chunk.getEncodings()) || !isColumnPredicate(descriptor, parquetTupleDomain)) {
            continue;
        }

        // Read the whole column chunk into memory and extract its dictionary page.
        byte[] chunkBytes = new byte[toIntExact(chunk.getTotalSize())];
        dataSource.readFully(chunk.getStartingPos(), chunkBytes);
        Optional<DictionaryPage> dictionaryPage = readDictionaryPage(chunkBytes, chunk.getCodec());
        result.put(descriptor, new DictionaryDescriptor(descriptor, dictionaryPage));
        break;
    }
    return result.build();
}
/**
 * Checks whether the effective predicate overlaps the domains derived from
 * the supplied dictionaries.
 * <p>
 * A domain is computed for every predicate column; columns for which
 * {@code getDomain} returns {@code null} are left out of the tuple domain.
 *
 * @param dictionaries dictionary descriptors keyed by column
 * @return {@code true} if the effective predicate overlaps the dictionary domains
 */
@Override
public boolean matches(Map<ColumnDescriptor, DictionaryDescriptor> dictionaries)
{
    ImmutableMap.Builder<ColumnDescriptor, Domain> dictionaryDomains = ImmutableMap.builder();
    for (RichColumnDescriptor column : columns) {
        DictionaryDescriptor descriptor = dictionaries.get(column);
        Domain columnDomain = getDomain(getPrestoType(effectivePredicate, column), descriptor);
        if (columnDomain == null) {
            continue;
        }
        dictionaryDomains.put(column, columnDomain);
    }

    TupleDomain<ColumnDescriptor> blockDomain = TupleDomain.withColumnDomains(dictionaryDomains.build());
    return effectivePredicate.overlaps(blockDomain);
}
LongStatistics longStatistics = (LongStatistics) statistics; if (longStatistics.genericGetMin() > longStatistics.genericGetMax()) { failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, longStatistics); return Domain.create(ValueSet.all(type), hasNullValue); parquetIntegerStatistics = new ParquetIntegerStatistics(longStatistics.genericGetMin(), longStatistics.genericGetMax()); failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, intStatistics); return Domain.create(ValueSet.all(type), hasNullValue); parquetIntegerStatistics = new ParquetIntegerStatistics((long) intStatistics.getMin(), (long) intStatistics.getMax()); if (isStatisticsOverflow(type, parquetIntegerStatistics)) { return Domain.create(ValueSet.all(type), hasNullValue); return createDomain(type, hasNullValue, parquetIntegerStatistics); failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, floatStatistics); return Domain.create(ValueSet.all(type), hasNullValue); ParquetIntegerStatistics parquetStatistics = new ParquetIntegerStatistics( (long) floatToRawIntBits(floatStatistics.getMin()), (long) floatToRawIntBits(floatStatistics.getMax())); return createDomain(type, hasNullValue, parquetStatistics); failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, doubleStatistics); return Domain.create(ValueSet.all(type), hasNullValue);
/**
 * A predicate on the empty VARCHAR value must match a dictionary descriptor
 * built from a single-entry dictionary page wrapping four zero bytes.
 */
@Test
public void testMatchesWithDescriptors()
        throws ParquetCorruptionException
{
    // Column under test: an optional BINARY column at path "path".
    String[] path = {"path"};
    RichColumnDescriptor column = new RichColumnDescriptor(
            new ColumnDescriptor(path, BINARY, 0, 0),
            new PrimitiveType(OPTIONAL, BINARY, "Test column"));

    // Predicate: column equals the empty slice.
    TupleDomain<ColumnDescriptor> predicateDomain = getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE);
    TupleDomainParquetPredicate predicate = new TupleDomainParquetPredicate(predicateDomain, singletonList(column));

    // Dictionary page with one entry, backed by four zero bytes.
    byte[] pageBytes = {0, 0, 0, 0};
    DictionaryPage dictionaryPage = new DictionaryPage(Slices.wrappedBuffer(pageBytes), 1, PLAIN_DICTIONARY);
    DictionaryDescriptor dictionary = new DictionaryDescriptor(column, Optional.of(dictionaryPage));

    assertTrue(predicate.matches(singletonMap(column, dictionary)));
}
/**
 * Builds a domain from range statistics, converting the bounds with
 * {@code function}.
 * <ul>
 * <li>both bounds present: the closed range [min, max]</li>
 * <li>only max present: everything less than or equal to max</li>
 * <li>only min present: everything greater than or equal to min</li>
 * <li>neither present: all values of the type</li>
 * </ul>
 *
 * @param hasNullValue whether the resulting domain allows null
 */
private static <F, T extends Comparable<T>> Domain createDomain(Type type, boolean hasNullValue, ParquetRangeStatistics<F> rangeStatistics, Function<F, T> function)
{
    F lower = rangeStatistics.getMin();
    F upper = rangeStatistics.getMax();

    if (lower == null && upper == null) {
        return Domain.create(ValueSet.all(type), hasNullValue);
    }

    Range bounds;
    if (lower == null) {
        bounds = Range.lessThanOrEqual(type, function.apply(upper));
    }
    else if (upper == null) {
        bounds = Range.greaterThanOrEqual(type, function.apply(lower));
    }
    else {
        bounds = Range.range(type, function.apply(lower), true, function.apply(upper), true);
    }
    return Domain.create(ValueSet.ofRanges(bounds), hasNullValue);
}
}
/**
 * A predicate on the VARCHAR value "Test" must match column statistics whose
 * min and max are both "Test" (with one null among two rows).
 */
@Test
public void testMatchesWithStatistics()
        throws ParquetCorruptionException
{
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

    // Encode explicitly as UTF-8 so the statistics bytes agree with the
    // utf8Slice(value) predicate regardless of the platform default charset.
    byte[] minBytes = value.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    byte[] maxBytes = value.getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Wildcard instead of a raw type keeps the declaration free of raw-type warnings.
    Statistics<?> stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    stats.setMinMaxFromBytes(minBytes, maxBytes);

    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID, true));
}
/**
 * Exercises {@code isOnlyDictionaryEncodingPages} with each combination of a
 * level-encoding set (required / optional / repeated) and a data-encoding set
 * (plain, mixed plain + dictionary, dictionary only). Only the dictionary-only
 * combinations are expected to return {@code true}.
 */
@Test
@SuppressWarnings("deprecation")
public void testDictionaryEncodingCasesV1()
{
    // Level-encoding sets
    Set<Encoding> requiredLevels = ImmutableSet.of(BIT_PACKED);
    Set<Encoding> optionalLevels = ImmutableSet.of(BIT_PACKED, RLE);
    Set<Encoding> repeatedLevels = ImmutableSet.of(RLE);

    // Data-encoding sets
    Set<Encoding> plainOnly = ImmutableSet.of(PLAIN);
    Set<Encoding> plainAndDictionary = ImmutableSet.of(PLAIN_DICTIONARY, PLAIN);
    Set<Encoding> dictionaryOnly = ImmutableSet.of(PLAIN_DICTIONARY);

    // Plain data pages: never dictionary-only
    assertFalse(isOnlyDictionaryEncodingPages(union(requiredLevels, plainOnly)), "required notDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(union(optionalLevels, plainOnly)), "optional notDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(union(repeatedLevels, plainOnly)), "repeated notDictionary");

    // Mixed plain + dictionary data pages: not dictionary-only either
    assertFalse(isOnlyDictionaryEncodingPages(union(requiredLevels, plainAndDictionary)), "required mixedDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(union(optionalLevels, plainAndDictionary)), "optional mixedDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(union(repeatedLevels, plainAndDictionary)), "repeated mixedDictionary");

    // Dictionary data pages only
    assertTrue(isOnlyDictionaryEncodingPages(union(requiredLevels, dictionaryOnly)), "required dictionary");
    assertTrue(isOnlyDictionaryEncodingPages(union(optionalLevels, dictionaryOnly)), "optional dictionary");
    assertTrue(isOnlyDictionaryEncodingPages(union(repeatedLevels, dictionaryOnly)), "repeated dictionary");
}
/**
 * Checks the domains derived from BOOLEAN column statistics.
 */
@Test
public void testBoolean()
        throws ParquetCorruptionException
{
    final String columnName = "BooleanColumn";

    // No statistics: every value is possible
    assertEquals(
            getDomain(BOOLEAN, 0, null, ID, columnName, true),
            all(BOOLEAN));

    // min == max: a single value
    assertEquals(
            getDomain(BOOLEAN, 10, booleanColumnStats(true, true), ID, columnName, true),
            singleValue(BOOLEAN, true));
    assertEquals(
            getDomain(BOOLEAN, 10, booleanColumnStats(false, false), ID, columnName, true),
            singleValue(BOOLEAN, false));

    // min = false, max = true: expected to cover all of BOOLEAN
    assertEquals(
            getDomain(BOOLEAN, 20, booleanColumnStats(false, true), ID, columnName, true),
            all(BOOLEAN));
}
@Override public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) throws ParquetCorruptionException { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder(); for (RichColumnDescriptor column : columns) { Statistics<?> columnStatistics = statistics.get(column); Domain domain; Type type = getPrestoType(effectivePredicate, column); if (columnStatistics == null || columnStatistics.isEmpty()) { // no stats for column domain = Domain.all(type); } else { domain = getDomain(type, numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); } domains.put(column, domain); } TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
@Test public void testSmallint() throws ParquetCorruptionException { String column = "SmallintColumn"; assertEquals(getDomain(SMALLINT, 0, null, ID, column, true), all(SMALLINT)); assertEquals(getDomain(SMALLINT, 10, longColumnStats(100, 100), ID, column, true), singleValue(SMALLINT, 100L)); assertEquals(getDomain(SMALLINT, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(SMALLINT, 0L, true, 100L, true)), false)); assertEquals(getDomain(SMALLINT, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(SMALLINT)); // ignore corrupted statistics assertEquals(getDomain(SMALLINT, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(SMALLINT), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(SMALLINT, 10, longColumnStats(2147483648L, 10), ID, column, true)) .withMessage("Corrupted statistics for column \"SmallintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); }
@Test public void testDate() throws ParquetCorruptionException { String column = "DateColumn"; assertEquals(getDomain(DATE, 0, null, ID, column, true), all(DATE)); assertEquals(getDomain(DATE, 10, intColumnStats(100, 100), ID, column, true), singleValue(DATE, 100L)); assertEquals(getDomain(DATE, 10, intColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(DATE, 0L, true, 100L, true)), false)); // ignore corrupted statistics assertEquals(getDomain(DATE, 10, intColumnStats(200, 100), ID, column, false), create(ValueSet.all(DATE), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(DATE, 10, intColumnStats(200, 100), ID, column, true)) .withMessage("Corrupted statistics for column \"DateColumn\" in Parquet file \"testFile\": [min: 200, max: 100, num_nulls: 0]"); }
@Test public void testBigint() throws ParquetCorruptionException { String column = "BigintColumn"; assertEquals(getDomain(BIGINT, 0, null, ID, column, true), all(BIGINT)); assertEquals(getDomain(BIGINT, 10, longColumnStats(100L, 100L), ID, column, true), singleValue(BIGINT, 100L)); assertEquals(getDomain(BIGINT, 10, longColumnStats(0L, 100L), ID, column, true), create(ValueSet.ofRanges(range(BIGINT, 0L, true, 100L, true)), false)); // ignore corrupted statistics assertEquals(getDomain(BIGINT, 10, longColumnStats(100L, 0L), ID, column, false), create(ValueSet.all(BIGINT), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(BIGINT, 10, longColumnStats(100L, 10L), ID, column, true)) .withMessage("Corrupted statistics for column \"BigintColumn\" in Parquet file \"testFile\": [min: 100, max: 10, num_nulls: 0]"); }
@Test public void testDouble() throws ParquetCorruptionException { String column = "DoubleColumn"; assertEquals(getDomain(DOUBLE, 0, null, ID, column, true), all(DOUBLE)); assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(42.24, 42.24), ID, column, true), singleValue(DOUBLE, 42.24)); assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(3.3, 42.24), ID, column, true), create(ValueSet.ofRanges(range(DOUBLE, 3.3, true, 42.24, true)), false)); // ignore corrupted statistics assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID, column, false), create(ValueSet.all(DOUBLE), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID, column, true)) .withMessage("Corrupted statistics for column \"DoubleColumn\" in Parquet file \"testFile\": [min: 42.24000, max: 3.30000, num_nulls: 0]"); }
@Test public void testInteger() throws ParquetCorruptionException { String column = "IntegerColumn"; assertEquals(getDomain(INTEGER, 0, null, ID, column, true), all(INTEGER)); assertEquals(getDomain(INTEGER, 10, longColumnStats(100, 100), ID, column, true), singleValue(INTEGER, 100L)); assertEquals(getDomain(INTEGER, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(INTEGER, 0L, true, 100L, true)), false)); assertEquals(getDomain(INTEGER, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(INTEGER)); // ignore corrupted statistics assertEquals(getDomain(INTEGER, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(INTEGER), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(INTEGER, 10, longColumnStats(2147483648L, 10), ID, column, true)) .withMessage("Corrupted statistics for column \"IntegerColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); }
@Test public void testTinyint() throws ParquetCorruptionException { String column = "TinyintColumn"; assertEquals(getDomain(TINYINT, 0, null, ID, column, true), all(TINYINT)); assertEquals(getDomain(TINYINT, 10, longColumnStats(100, 100), ID, column, true), singleValue(TINYINT, 100L)); assertEquals(getDomain(TINYINT, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(TINYINT, 0L, true, 100L, true)), false)); assertEquals(getDomain(TINYINT, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(TINYINT)); // ignore corrupted statistics assertEquals(getDomain(TINYINT, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(TINYINT), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(TINYINT, 10, longColumnStats(2147483648L, 10), ID, column, true)) .withMessage("Corrupted statistics for column \"TinyintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); }
@Test public void testFloat() throws ParquetCorruptionException { String column = "FloatColumn"; assertEquals(getDomain(REAL, 0, null, ID, column, true), all(REAL)); float minimum = 4.3f; float maximum = 40.3f; assertEquals(getDomain(REAL, 10, floatColumnStats(minimum, minimum), ID, column, true), singleValue(REAL, (long) floatToRawIntBits(minimum))); assertEquals( getDomain(REAL, 10, floatColumnStats(minimum, maximum), ID, column, true), create(ValueSet.ofRanges(range(REAL, (long) floatToRawIntBits(minimum), true, (long) floatToRawIntBits(maximum), true)), false)); // ignore corrupted statistics assertEquals(getDomain(REAL, 10, floatColumnStats(maximum, minimum), ID, column, false), create(ValueSet.all(REAL), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(REAL, 10, floatColumnStats(maximum, minimum), ID, column, true)) .withMessage("Corrupted statistics for column \"FloatColumn\" in Parquet file \"testFile\": [min: 40.30000, max: 4.30000, num_nulls: 0]"); }
@Test public void testString() throws ParquetCorruptionException { String column = "StringColumn"; assertEquals(getDomain(createUnboundedVarcharType(), 0, null, ID, column, true), all(createUnboundedVarcharType())); assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "taco"), ID, column, true), singleValue(createUnboundedVarcharType(), utf8Slice("taco"))); assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("apple", "taco"), ID, column, true), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("apple"), true, utf8Slice("taco"), true)), false)); assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("中国", "美利坚"), ID, column, true), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("中国"), true, utf8Slice("美利坚"), true)), false)); // ignore corrupted statistics assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID, column, false), create(ValueSet.all(createUnboundedVarcharType()), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) .isThrownBy(() -> getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID, column, true)) .withMessage("Corrupted statistics for column \"StringColumn\" in Parquet file \"testFile\": [min: taco, max: apple, num_nulls: 0]"); }