/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 * <p>
 * Converts the Avro schema to a struct type, resolves the partition type, and builds the mapping
 * from each projected field position to the position of the field with the same id in the full
 * type returned by {@code DataFile.getType(partitionType)}.
 *
 * @param avroSchema the Avro schema describing the fields that were projected
 * @throws IllegalArgumentException if a projected field id is not present in the full type
 */
public GenericDataFile(org.apache.avro.Schema avroSchema) {
  this.avroSchema = avroSchema;

  Types.StructType projected = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();

  // the "partition" field is absent (null type) when it was not projected
  Type projectedPartition = projected.fieldType("partition");
  this.partitionType = projectedPartition == null ?
      EMPTY_STRUCT_TYPE : projectedPartition.asNestedType().asStructType();

  List<Types.NestedField> projectedFields = projected.fields();
  List<Types.NestedField> fullFields = DataFile.getType(partitionType).fields();

  this.fromProjectionPos = new int[projectedFields.size()];
  for (int projPos = 0; projPos < fromProjectionPos.length; projPos += 1) {
    // scan every candidate without stopping early, so the last field with a matching id wins
    int fullPos = -1;
    for (int candidate = 0; candidate < fullFields.size(); candidate += 1) {
      if (projectedFields.get(projPos).fieldId() == fullFields.get(candidate).fieldId()) {
        fullPos = candidate;
      }
    }

    if (fullPos < 0) {
      throw new IllegalArgumentException("Cannot find projected field: " + projectedFields.get(projPos));
    }

    fromProjectionPos[projPos] = fullPos;
  }

  this.partitionData = new PartitionData(partitionType);
}
/**
 * Rebuilds a struct so that every field takes its id from the field with the same name in the
 * current source struct. Field names, required/optional status, and the already-visited field
 * types are carried over unchanged.
 *
 * @param struct the struct whose field ids are being replaced
 * @param fieldTypes the visited result types for the struct's fields, in field order
 * @return a struct with the same fields as {@code struct} but ids taken from the source struct
 * @throws NullPointerException if no source type has been set
 * @throws IllegalArgumentException if the source type is not a struct, or if a field of
 *         {@code struct} has no field with the same name in the source struct
 */
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldTypes) {
  Preconditions.checkNotNull(sourceType, "Evaluation must start with a schema.");
  Preconditions.checkArgument(sourceType.isStructType(), "Not a struct: " + sourceType);

  Types.StructType sourceStruct = sourceType.asStructType();
  List<Types.NestedField> fields = struct.fields();
  int length = fields.size();

  List<Type> types = Lists.newArrayList(fieldTypes);
  List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(length);
  for (int i = 0; i < length; i += 1) {
    Types.NestedField field = fields.get(i);

    // look the field up by name and fail with a descriptive error instead of a bare
    // NullPointerException when the source struct has no such field
    Types.NestedField sourceField = sourceStruct.field(field.name());
    if (sourceField == null) {
      throw new IllegalArgumentException(
          "Cannot find field in source schema: " + field.name());
    }

    int sourceFieldId = sourceField.fieldId();
    if (field.isRequired()) {
      newFields.add(Types.NestedField.required(sourceFieldId, field.name(), types.get(i)));
    } else {
      newFields.add(Types.NestedField.optional(sourceFieldId, field.name(), types.get(i)));
    }
  }

  return Types.StructType.of(newFields);
}
@Override public Schema.Field field(Schema.Field field, Supplier<Schema> fieldResult) { Types.StructType struct = current.asNestedType().asStructType(); int fieldId = AvroSchemaUtil.getFieldId(field); Types.NestedField expectedField = struct.field(fieldId); // TODO: what if there are no ids? // if the field isn't present, it was not selected if (expectedField == null) { return null; } String expectedName = expectedField.name(); this.current = expectedField.type(); try { Schema schema = fieldResult.get(); if (schema != field.schema() || !expectedName.equals(field.name())) { // add an alias for the field return copyField(field, schema, expectedName); } else { // always copy because fields can't be reused return copyField(field, field.schema(), field.name()); } } finally { this.current = struct; } }
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp <= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { // Rows must match when Min == X == Max Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); if (lowerBounds != null && lowerBounds.containsKey(id) && upperBounds != null && upperBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp != 0) { return ROWS_MIGHT_NOT_MATCH; } T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id)); cmp = lit.comparator().compare(upper, lit.value()); if (cmp != 0) { return ROWS_MIGHT_NOT_MATCH; } return ROWS_MUST_MATCH; } return ROWS_MIGHT_NOT_MATCH; }
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) { // Rows must match when X < Min or Max < X because it is not in the range Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_MUST_MATCH; } } if (upperBounds != null && upperBounds.containsKey(id)) { T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id)); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_MUST_MATCH; } } return ROWS_MIGHT_NOT_MATCH; }
/**
 * Evaluates {@code ref == lit} from the lower and upper bounds.
 * <p>
 * Returns ROWS_CANNOT_MATCH when a lower bound exists and is greater than the literal, or when
 * an upper bound exists and is less than the literal. Otherwise returns ROWS_MIGHT_MATCH.
 *
 * @param ref the bound column reference
 * @param lit the literal to compare against
 * @throws NullPointerException if the referenced id is not a field of {@code struct}
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  Integer id = ref.fieldId();
  Types.NestedField field = struct.field(id);
  Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

  boolean hasLower = lowerBounds != null && lowerBounds.containsKey(id);
  if (hasLower) {
    T lowerValue = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
    // the literal is below the lower bound
    if (lit.comparator().compare(lowerValue, lit.value()) > 0) {
      return ROWS_CANNOT_MATCH;
    }
  }

  boolean hasUpper = upperBounds != null && upperBounds.containsKey(id);
  if (hasUpper) {
    T upperValue = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
    // the literal is above the upper bound
    if (lit.comparator().compare(upperValue, lit.value()) < 0) {
      return ROWS_CANNOT_MATCH;
    }
  }

  return ROWS_MIGHT_MATCH;
}
/**
 * Converts a visited Avro record into a struct type.
 * <p>
 * Each Avro field becomes a nested field with the id returned by {@code getId}, the Avro field
 * name, and the already-converted field type. A field is optional when its Avro schema is an
 * option schema and required otherwise. When the record is the root, {@code nextId} is reset to
 * 0 before ids are obtained.
 *
 * @param record the Avro record schema
 * @param names the names on the path to this record (not used here)
 * @param fieldTypes the converted types of the record's fields, in field order
 * @return the struct type for the record
 */
@Override
public Type record(Schema record, List<String> names, List<Type> fieldTypes) {
  List<Schema.Field> avroFields = record.getFields();
  List<Types.NestedField> structFields = Lists.newArrayListWithExpectedSize(avroFields.size());

  if (root == record) {
    this.nextId = 0;
  }

  int position = 0;
  for (Schema.Field avroField : avroFields) {
    Type convertedType = fieldTypes.get(position);
    int id = getId(avroField);

    Types.NestedField structField;
    if (AvroSchemaUtil.isOptionSchema(avroField.schema())) {
      structField = Types.NestedField.optional(id, avroField.name(), convertedType);
    } else {
      structField = Types.NestedField.required(id, avroField.name(), convertedType);
    }
    structFields.add(structField);

    position += 1;
  }

  return Types.StructType.of(structFields);
}
@Override public <T> Boolean notNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no non-null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && valueCount - colStats.getNumNulls() == 0) { // (num nulls == value count) => all values are null => no non-null values return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Boolean hasNonDictPage = isFallback.get(id); if (hasNonDictPage == null || hasNonDictPage) { return ROWS_MIGHT_MATCH; } Set<T> dictionary = dict(id, lit.comparator()); // if any item in the dictionary matches the predicate, then at least one row does for (T item : dictionary) { int cmp = lit.comparator().compare(item, lit.value()); if (cmp < 0) { return ROWS_MIGHT_MATCH; } } return ROWS_CANNOT_MATCH; }
@Override public <T> Boolean isNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_MIGHT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty() && colStats.getNumNulls() == 0) { // there are stats and no values are null => all values are non-null return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Boolean hasNonDictPage = isFallback.get(id); if (hasNonDictPage == null || hasNonDictPage) { return ROWS_MIGHT_MATCH; } Set<T> dictionary = dict(id, lit.comparator()); // if any item in the dictionary matches the predicate, then at least one row does for (T item : dictionary) { int cmp = lit.comparator().compare(item, lit.value()); if (cmp >= 0) { return ROWS_MIGHT_MATCH; } } return ROWS_CANNOT_MATCH; }
@Override public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Boolean hasNonDictPage = isFallback.get(id); if (hasNonDictPage == null || hasNonDictPage) { return ROWS_MIGHT_MATCH; } Set<T> dictionary = dict(id, lit.comparator()); // if any item in the dictionary matches the predicate, then at least one row does for (T item : dictionary) { int cmp = lit.comparator().compare(item, lit.value()); if (cmp > 0) { return ROWS_MIGHT_MATCH; } } return ROWS_CANNOT_MATCH; }
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Boolean hasNonDictPage = isFallback.get(id); if (hasNonDictPage == null || hasNonDictPage) { return ROWS_MIGHT_MATCH; } Set<T> dictionary = dict(id, lit.comparator()); // if any item in the dictionary matches the predicate, then at least one row does for (T item : dictionary) { int cmp = lit.comparator().compare(item, lit.value()); if (cmp <= 0) { return ROWS_MIGHT_MATCH; } } return ROWS_CANNOT_MATCH; }
/**
 * Checks conversion in both directions between a map type whose key is a list of ints and whose
 * value is a two-field struct, and the equivalent Avro schema built with
 * {@code AvroSchemaUtil.createMap}.
 */
@Test
public void testMapOfListToStructs() {
  // type side: key (id 33) is a required list of ints, value (id 34) is a struct
  Type keyType = Types.ListType.ofRequired(35, Types.IntegerType.get());
  Type valueType = Types.StructType.of(
      required(36, "a", Types.IntegerType.get()),
      optional(37, "b", Types.IntegerType.get()));
  Type expectedType = Types.MapType.ofRequired(33, 34, keyType, valueType);

  // Avro side: the same ids are attached to the key array and the value record
  Schema expectedSchema = AvroSchemaUtil.createMap(
      33,
      addElementId(35, Schema.createArray(Schema.create(Schema.Type.INT))),
      34,
      record("r34",
          requiredField(36, "a", Schema.create(Schema.Type.INT)),
          optionalField(37, "b", Schema.create(Schema.Type.INT))));

  Assert.assertEquals("Avro schema to map",
      expectedType, AvroSchemaUtil.convert(expectedSchema));
  Assert.assertEquals("Map to Avro schema",
      expectedSchema, AvroSchemaUtil.convert(expectedType));
}
/**
 * Converts a schema that nests structs, lists, and maps inside one another and checks it against
 * the expected Pig schema string passed to {@code convertToPigSchema}.
 */
@Test
public void complexNested() throws IOException {
  // t: struct holding a list of (i, s) structs
  StructType listElement = StructType.of(
      optional(4, "i", IntegerType.get()),
      optional(5, "s", StringType.get()));
  StructType tType = StructType.of(
      optional(2, "b", ListType.ofOptional(3, listElement)));

  // m1: map from string to a struct holding a binary list and a string-to-int map
  StructType m1Value = StructType.of(
      optional(9, "b", ListType.ofOptional(10, BinaryType.get())),
      optional(11, "m2", MapType.ofOptional(12, 13, StringType.get(), IntegerType.get())));
  MapType m1Type = MapType.ofOptional(7, 8, StringType.get(), m1Value);

  // b1: list of maps from string to a list of floats
  MapType b1Element = MapType.ofOptional(16, 17,
      StringType.get(), ListType.ofOptional(18, FloatType.get()));
  ListType b1Type = ListType.ofOptional(15, b1Element);

  Schema nestedSchema = new Schema(
      optional(1, "t", tType),
      optional(6, "m1", m1Type),
      optional(14, "b1", b1Type));

  convertToPigSchema(
      nestedSchema,
      "t:(b:{(i:int,s:chararray)}),m1:[(b:{(bytearray)},m2:[int])],b1:{([{(float)}])}",
      "");
}
/**
 * Reassigns the field ids of a schema using the ids of another schema.
 * <p>
 * Fields are matched by name. A field of {@code schema} that cannot be found in the source
 * schema causes an IllegalArgumentException.
 * <p>
 * The schema's structure, nullability, and types are left unchanged.
 *
 * @param schema the schema to have ids reassigned
 * @param idSourceSchema the schema from which field ids will be used
 * @return a structurally identical schema with field ids matching the source schema
 * @throws IllegalArgumentException if a field cannot be found (by name) in the source schema
 */
public static Schema reassignIds(Schema schema, Schema idSourceSchema) {
  ReassignIds reassigner = new ReassignIds(idSourceSchema);
  Types.StructType reassigned = visit(schema, reassigner).asStructType();
  return new Schema(reassigned.fields());
}