// NOTE(review): truncated fragment — the construction of `newField` lost its beginning
// (the `Schema.Field newField = new Schema.Field(` prefix is missing before the argument
// list) and braces are unbalanced. Restore the full method from the original source
// before editing; code left byte-identical.
if (i < fields.size() && !field.name().equals(fields.get(i).name())) { hasChange = true; Schema.Field avroField = updateMap.get(field.name()); Preconditions.checkArgument(field.isOptional(), "Missing required field: %s", field.name()); field.name(), toOption(convert(field.type())), null, JsonProperties.NULL_VALUE); newField.addProp(AvroSchemaUtil.FIELD_ID_PROP, field.fieldId()); updatedFields.add(newField); hasChange = true;
// NOTE(review): truncated fragment — begins mid-branch (`changed = true;` with no opening
// condition) and the trailing `if (!fields.get(i).name().equals(name))` is cut off before
// its body closes. The `required` branch also appears to be missing its `} else {` guard.
// Restore the enclosing method before editing; code left byte-identical.
changed = true; } else if (field.type() == type) { projectedFields.put(field.name(), field); } else if (field.isOptional()) { changed = true; projectedFields.put(field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); projectedFields.put(field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); if (!fields.get(i).name().equals(name)) { reordered = true;
// NOTE(review): truncated fragment of a schema-compatibility visitor — the early-return
// branch has no closing brace, the `try` has no visible catch/finally, and the
// `error`-prefix handling references a loop over `fieldErrors` that is missing.
// Restore the full method before editing; code left byte-identical.
@Override public List<String> field(Types.NestedField readField, Supplier<List<String>> fieldErrors) { Types.StructType struct = currentType.asStructType(); Types.NestedField field = struct.field(readField.fieldId()); List<String> errors = Lists.newArrayList(); if (readField.isRequired()) { return ImmutableList.of(readField.name() + " is required, but is missing"); this.currentType = field.type(); try { if (readField.isRequired() && field.isOptional()) { errors.add(readField.name() + " should be required, but is optional"); if (error.startsWith(":")) { errors.add(readField.name() + error); } else { errors.add(readField.name() + "." + error);
// NOTE(review): truncated fragment of predicate binding — the enclosing method signature
// is missing, and the literal-comparison tail after the `switch` is unreachable as
// written (the `default:` throws). Braces are unbalanced. Restore the full method
// before editing; code left byte-identical.
switch (op()) { case IS_NULL: if (field.isRequired()) { return Expressions.alwaysFalse(); return new BoundPredicate<>(IS_NULL, new BoundReference<>(struct, field.fieldId())); case NOT_NULL: if (field.isRequired()) { return Expressions.alwaysTrue(); return new BoundPredicate<>(NOT_NULL, new BoundReference<>(struct, field.fieldId())); default: throw new ValidationException("Operation must be IS_NULL or NOT_NULL"); Literal<T> lit = literal().to(field.type()); if (lit == null) { throw new ValidationException(String.format( "Invalid value for comparison inclusive type %s: %s (%s)", field.type(), literal().value(), literal().value().getClass().getName())); return new BoundPredicate<>(op(), new BoundReference<>(struct, field.fieldId()), lit);
// NOTE(review): truncated fragment of a schema-update visitor — no enclosing method
// signature, and several `if` bodies never close. `resultType` is referenced but its
// computation is missing. Restore the full method before editing; code left
// byte-identical.
String name = field.name(); Types.NestedField update = updates.get(field.fieldId()); if (update != null && update.name() != null) { name = update.name(); if (!name.equals(field.name()) || field.type() != resultType) { hasChange = true; if (field.isOptional()) { newFields.add(Types.NestedField.optional(field.fieldId(), name, resultType)); } else { newFields.add(Types.NestedField.required(field.fieldId(), name, resultType));
// NOTE(review): truncated fragment of a projection loop — no enclosing method or loop
// header, the `if (field.type() == projectedType)` branch has an empty body with no
// close, and the trailing braces are unbalanced. Restore the full method before
// editing; code left byte-identical.
Types.NestedField field = fields.get(i); Type projectedType = fieldResults.get(i); if (field.type() == projectedType) { } else if (projectedType != null) { sameTypes = false; // signal that some types were altered if (field.isOptional()) { selectedFields.add( Types.NestedField.optional(field.fieldId(), field.name(), projectedType)); } else { selectedFields.add( Types.NestedField.required(field.fieldId(), field.name(), projectedType));
@Override public Type array(Schema array, Type elementType) { if (array.getLogicalType() instanceof LogicalMap) { // map stored as an array Schema keyValueSchema = array.getElementType(); Preconditions.checkArgument(AvroSchemaUtil.isKeyValueSchema(keyValueSchema), "Invalid key-value pair schema: {}", keyValueSchema); Types.StructType keyValueType = elementType.asStructType(); Types.NestedField keyField = keyValueType.field("key"); Types.NestedField valueField = keyValueType.field("value"); if (keyValueType.field("value").isOptional()) { return Types.MapType.ofOptional( keyField.fieldId(), valueField.fieldId(), keyField.type(), valueField.type()); } else { return Types.MapType.ofRequired( keyField.fieldId(), valueField.fieldId(), keyField.type(), valueField.type()); } } else { // normal array Schema elementSchema = array.getElementType(); int id = getElementId(array); if (AvroSchemaUtil.isOptionSchema(elementSchema)) { return Types.ListType.ofOptional(id, elementType); } else { return Types.ListType.ofRequired(id, elementType); } } }
@Override public Type field(Types.NestedField field, Supplier<Type> fieldResult) { Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType struct = (StructType) current; // fields are resolved by name because Spark only sees the current table schema. if (struct.getFieldIndex(field.name()).isEmpty()) { // make sure that filter fields are projected even if they aren't in the requested schema. if (filterRefs.contains(field.fieldId())) { return field.type(); } return null; } int fieldIndex = struct.fieldIndex(field.name()); StructField f = struct.fields()[fieldIndex]; Preconditions.checkArgument(f.nullable() || field.isRequired(), "Cannot project an optional field as non-null: %s", field.name()); this.current = f.dataType(); try { return fieldResult.get(); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Invalid projection for field " + field.name() + ": " + e.getMessage(), e); } finally { this.current = struct; } }
/**
 * Rebuilds a struct from its pruned field results.
 *
 * <p>A {@code null} field result drops the field. The original struct instance is returned
 * unchanged when no field was dropped or retyped, preserving identity for fast comparison.
 *
 * @param struct the struct being pruned
 * @param fieldResults pruned types for each field, in field order ({@code null} = dropped)
 * @return the pruned struct, or {@code struct} itself if nothing changed
 */
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldResults) {
  Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema.");
  Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);

  List<Types.NestedField> fields = struct.fields();
  List<Type> projectedTypes = Lists.newArrayList(fieldResults);
  List<Types.NestedField> prunedFields = Lists.newArrayListWithExpectedSize(projectedTypes.size());
  boolean changed = false;

  for (int pos = 0; pos < fields.size(); pos += 1) {
    Types.NestedField field = fields.get(pos);
    Type projected = projectedTypes.get(pos);

    if (projected == null) {
      changed = true; // field was dropped by the projection
    } else if (projected == field.type()) {
      prunedFields.add(field); // untouched: reuse the existing field
    } else {
      changed = true;
      prunedFields.add(field.isOptional()
          ? Types.NestedField.optional(field.fieldId(), field.name(), projected)
          : Types.NestedField.required(field.fieldId(), field.name(), projected));
    }
  }

  return changed ? Types.StructType.of(prunedFields) : struct;
}
@Override public Type field(Types.NestedField field, Supplier<Type> fieldResult) { Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType struct = (StructType) current; // fields are resolved by name because Spark only sees the current table schema. if (struct.getFieldIndex(field.name()).isEmpty()) { // make sure that filter fields are projected even if they aren't in the requested schema. if (filterRefs.contains(field.fieldId())) { return field.type(); } return null; } int fieldIndex = struct.fieldIndex(field.name()); StructField f = struct.fields()[fieldIndex]; Preconditions.checkArgument(f.nullable() || field.isRequired(), "Cannot project an optional field as non-null: %s", field.name()); this.current = f.dataType(); try { return fieldResult.get(); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Invalid projection for field " + field.name() + ": " + e.getMessage(), e); } finally { this.current = struct; } }
/**
 * Converts an Iceberg schema to a list of field DTOs.
 *
 * @param schema the Iceberg schema to convert
 * @param partitionFields the table's partition fields, used to mark partition keys
 * @return one {@code FieldInfo} per schema column, in schema order
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
  // partition membership is decided by name
  final List<String> partitionNames = partitionFields.stream()
      .map(PartitionField::name)
      .collect(Collectors.toList());

  final List<FieldInfo> fields = Lists.newArrayList();
  for (final Types.NestedField column : schema.columns()) {
    final FieldInfo info = new FieldInfo();
    info.setName(column.name());
    info.setType(toMetacatType(fromIcebergToHiveType(column.type())));
    info.setIsNullable(column.isOptional());
    info.setComment(column.doc());
    info.setPartitionKey(partitionNames.contains(column.name()));
    fields.add(info);
  }
  return fields;
}
/**
 * Rebuilds a struct from its converted field types.
 *
 * <p>Returns the original struct instance when no field type changed, preserving identity.
 *
 * @param struct the struct being rebuilt
 * @param fieldTypes converted type for each field, in field order
 * @return a struct with the converted field types, or {@code struct} if unchanged
 */
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldTypes) {
  // FIX: use Preconditions' %s formatting instead of eager string concatenation so the
  // failure message is only built when the check actually fails.
  Preconditions.checkArgument(sourceType.isStructType(), "Not a struct: %s", sourceType);

  List<Types.NestedField> fields = struct.fields();
  int length = fields.size();
  List<Type> types = Lists.newArrayList(fieldTypes);
  List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(length);
  boolean hasChange = false;

  for (int i = 0; i < length; i += 1) {
    Types.NestedField field = fields.get(i);
    Type resultType = types.get(i);
    if (field.type() == resultType) {
      // unchanged: reuse the existing field
      newFields.add(field);
    } else {
      hasChange = true;
      if (field.isRequired()) {
        newFields.add(Types.NestedField.required(field.fieldId(), field.name(), resultType));
      } else {
        newFields.add(Types.NestedField.optional(field.fieldId(), field.name(), resultType));
      }
    }
  }

  return hasChange ? Types.StructType.of(newFields) : struct;
}
/**
 * Converts an Iceberg struct to an Avro record schema.
 *
 * <p>Conversions are memoized in {@code results} so that a struct appearing multiple times
 * maps to a single Avro record. Record names come from {@code names}, falling back to
 * {@code "r" + <current field id>}.
 *
 * @param struct the struct to convert
 * @param fieldSchemas already-converted Avro schemas for each field, in field order
 * @return the Avro record schema for this struct
 */
@Override
public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
  // return the memoized schema if this struct was already converted
  Schema cached = results.get(struct);
  if (cached != null) {
    return cached;
  }

  String recordName = names.get(struct);
  if (recordName == null) {
    recordName = "r" + fieldIds.peek();
  }

  List<Types.NestedField> structFields = struct.fields();
  List<Schema.Field> avroFields = Lists.newArrayListWithExpectedSize(fieldSchemas.size());
  for (int pos = 0; pos < structFields.size(); pos += 1) {
    Types.NestedField structField = structFields.get(pos);
    // optional fields default to null so missing values can be read
    Schema.Field avroField = new Schema.Field(
        structField.name(), fieldSchemas.get(pos), null,
        structField.isOptional() ? NULL_VALUE : null);
    avroField.addProp(AvroSchemaUtil.FIELD_ID_PROP, structField.fieldId());
    avroFields.add(avroField);
  }

  Schema recordSchema = Schema.createRecord(recordName, null, null, false, avroFields);
  results.put(struct, recordSchema);
  return recordSchema;
}
// Builds a map from Iceberg field id to an accessor that extracts that field's value
// from a Spark InternalRow positioned at this struct.
@Override public Map<Integer, Accessor<InternalRow>> struct( Types.StructType struct, List<Map<Integer, Accessor<InternalRow>>> fieldResults) {
  // For each field: a non-null result means the field has nested accessors that must be
  // wrapped to first navigate into position i; a null result means the field is a leaf
  // and gets a direct positional accessor keyed by its field id.
  // NOTE(review): field.type().asNestedType().asStructType() throws for non-struct nested
  // fields (map/list) — presumably those never yield a non-null result here; confirm
  // against the visitor that produces fieldResults.
  Map<Integer, Accessor<InternalRow>> accessors = Maps.newHashMap(); List<Types.NestedField> fields = struct.fields(); for (int i = 0; i < fieldResults.size(); i += 1) { Types.NestedField field = fields.get(i); Map<Integer, Accessor<InternalRow>> result = fieldResults.get(i); if (result != null) { for (Map.Entry<Integer, Accessor<InternalRow>> entry : result.entrySet()) { accessors.put(entry.getKey(), newAccessor(i, field.isOptional(), field.type().asNestedType().asStructType(), entry.getValue())); } } else { accessors.put(field.fieldId(), newAccessor(i, field.type())); } }
  // an empty map signals "no accessible fields" to the caller
  if (accessors.isEmpty()) { return null; } return accessors; }
/**
 * Converts iceberg schema to field dto.
 *
 * @param schema schema
 * @param partitionFields partitioned fields
 * @return list of field Info
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
  final List<FieldInfo> fields = Lists.newArrayList();
  final List<String> partitionNames = partitionFields.stream()
      .map(PartitionField::name).collect(Collectors.toList());
  // FIX: iterate directly over the columns instead of indexing — this is the idiomatic
  // form and avoids re-evaluating schema.columns() twice on every loop iteration.
  for (final Types.NestedField field : schema.columns()) {
    final FieldInfo fieldInfo = new FieldInfo();
    fieldInfo.setName(field.name());
    fieldInfo.setType(toMetacatType(fromIcebergToHiveType(field.type())));
    fieldInfo.setIsNullable(field.isOptional());
    // NOTE(review): the column doc is not copied here (no setComment call), unlike the
    // other variant of this converter in this codebase — confirm whether that is intended.
    fieldInfo.setPartitionKey(partitionNames.contains(field.name()));
    fields.add(fieldInfo);
  }
  return fields;
}
@Override public Type struct(Types.StructType struct, Iterable<Type> futures) { List<Types.NestedField> fields = struct.fields(); int length = struct.fields().size(); List<Integer> newIds = Lists.newArrayListWithExpectedSize(length); for (int i = 0; i < length; i += 1) { newIds.add(nextId.get()); // assign IDs for this struct's fields first } List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(length); Iterator<Type> types = futures.iterator(); for (int i = 0; i < length; i += 1) { Types.NestedField field = fields.get(i); Type type = types.next(); if (field.isOptional()) { newFields.add(Types.NestedField.optional(newIds.get(i), field.name(), type)); } else { newFields.add(Types.NestedField.required(newIds.get(i), field.name(), type)); } } return Types.StructType.of(newFields); }
/**
 * Rebuilds a struct, replacing each field's ID with the ID of the same-named field in the
 * source schema.
 *
 * @param struct the struct whose field IDs are being reassigned
 * @param fieldTypes re-identified type for each field, in field order
 * @return a new struct carrying the source schema's field IDs
 */
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldTypes) {
  Preconditions.checkNotNull(sourceType, "Evaluation must start with a schema.");
  // FIX: use Preconditions' %s formatting instead of eager string concatenation so the
  // failure message is only built when the check actually fails.
  Preconditions.checkArgument(sourceType.isStructType(), "Not a struct: %s", sourceType);
  Types.StructType sourceStruct = sourceType.asStructType();

  List<Types.NestedField> fields = struct.fields();
  int length = fields.size();
  List<Type> types = Lists.newArrayList(fieldTypes);
  List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(length);
  for (int i = 0; i < length; i += 1) {
    Types.NestedField field = fields.get(i);
    // look up the same-named field in the source schema to reuse its ID
    // NOTE(review): a field missing from the source would make field(...) return null and
    // NPE here — presumably callers guarantee every field exists in the source; confirm.
    int sourceFieldId = sourceStruct.field(field.name()).fieldId();
    if (field.isRequired()) {
      newFields.add(Types.NestedField.required(sourceFieldId, field.name(), types.get(i)));
    } else {
      newFields.add(Types.NestedField.optional(sourceFieldId, field.name(), types.get(i)));
    }
  }
  return Types.StructType.of(newFields);
}
/**
 * Converts an Iceberg field to a Parquet type.
 *
 * <p>Iceberg optionality maps directly onto Parquet repetition; primitives and each kind of
 * nested type are dispatched to their dedicated converters.
 *
 * @param field the Iceberg field to convert
 * @return the equivalent Parquet type
 * @throws UnsupportedOperationException if the field's nested type is not struct/map/list
 */
public Type field(NestedField field) {
  Type.Repetition repetition = field.isOptional()
      ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  int id = field.fieldId();
  String name = field.name();

  if (field.type().isPrimitiveType()) {
    return primitive(field.type().asPrimitiveType(), repetition, id, name);
  }

  // nested types: dispatch by kind
  NestedType nested = field.type().asNestedType();
  if (nested.isStructType()) {
    return struct(nested.asStructType(), repetition, id, name);
  }
  if (nested.isMapType()) {
    return map(nested.asMapType(), repetition, id, name);
  }
  if (nested.isListType()) {
    return list(nested.asListType(), repetition, id, name);
  }
  throw new UnsupportedOperationException("Can't convert unknown type: " + nested);
}
/**
 * Builds an UnsafeProjection that reorders columns read with {@code readSchema} into the
 * column order of {@code finalSchema}.
 *
 * <p>Columns are matched by name: each output column becomes a reference to its position
 * in the read schema.
 *
 * @param finalSchema the schema whose column order the projection produces
 * @param readSchema the schema the rows were read with
 * @return a projection from read-schema order to final-schema order
 */
private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
  StructType struct = convert(readSchema);

  List<AttributeReference> refs = seqAsJavaListConverter(struct.toAttributes()).asJava();
  List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
  for (AttributeReference ref : refs) {
    attrs.add(ref.toAttribute());
  }

  // one expression per output column, referencing its position in the read schema
  List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
      Lists.newArrayListWithExpectedSize(struct.fields().length);
  for (Types.NestedField field : finalSchema.columns()) {
    exprs.add(refs.get(struct.fieldIndex(field.name())));
  }

  return UnsafeProjection.create(
      asScalaBufferConverter(exprs).asScala().toSeq(),
      asScalaBufferConverter(attrs).asScala().toSeq());
}
/**
 * Lists the columns that support predicate pushdown.
 *
 * <p>Only top-level primitive columns are eligible; map, list, and struct columns are
 * skipped.
 *
 * @param location table location to load the schema from
 * @param job the Hadoop job context
 * @return names of pushdown-eligible columns, in schema order
 * @throws IOException if the table cannot be loaded
 */
@Override
public List<String> getPredicateFields(String location, Job job) throws IOException {
  LOG.info(format("[%s]: getPredicateFields() -> %s", signature, location));
  Schema schema = load(location, job).schema();

  List<String> result = Lists.newArrayList();
  for (Types.NestedField nf : schema.columns()) {
    switch (nf.type().typeId()) {
      case MAP:
      case LIST:
      case STRUCT:
        break; // complex types are not filterable
      default:
        result.add(nf.name());
    }
  }
  return result;
}