@Override
public Type schema(Schema schema, Type structResult) {
  // Top-level column additions are keyed by TABLE_ROOT_ID; when none are
  // pending, the visited struct result is returned unchanged.
  Collection<Types.NestedField> rootAdds = adds.get(TABLE_ROOT_ID);
  if (rootAdds == null) {
    return structResult;
  }
  return addFields(structResult.asNestedType().asStructType(), rootAdds);
}
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 *
 * @param schema the Avro schema describing the partition tuple
 */
public PartitionData(Schema schema) {
  // convert once and derive both the struct type and its field count from it
  Types.StructType struct = AvroSchemaUtil.convert(schema).asNestedType().asStructType();
  this.partitionType = struct;
  this.size = struct.fields().size();
  this.data = new Object[size];
  this.stringSchema = schema.toString();
  this.schema = schema;
}
/**
 * Parses a {@link Schema} from its JSON representation.
 *
 * @param json a JSON node produced by the schema serializer
 * @return the parsed schema
 * @throws IllegalArgumentException if the JSON does not describe a struct type
 */
public static Schema fromJson(JsonNode json) {
  Type parsed = typeFromJson(json);
  Preconditions.checkArgument(
      parsed.isNestedType() && parsed.asNestedType().isStructType(),
      "Cannot create schema, not a struct type: %s", parsed);
  return new Schema(parsed.asNestedType().asStructType().fields());
}
@Override
public List<String> list(Types.ListType readList, Supplier<List<String>> elementErrors) {
  if (!currentType.isListType()) {
    return ImmutableList.of(String.format(": %s cannot be read as a list", currentType));
  }

  Types.ListType fileList = currentType.asNestedType().asListType();
  List<String> errors = Lists.newArrayList();

  // descend into the element type so the supplier sees the right context
  this.currentType = fileList.elementType();
  try {
    if (readList.isElementRequired() && fileList.isElementOptional()) {
      errors.add(": elements should be required, but are optional");
    }
    errors.addAll(elementErrors.get());
    return errors;
  } finally {
    // always restore the list type for the caller
    this.currentType = fileList;
  }
}
@Override
public List<String> map(Types.MapType readMap, Supplier<List<String>> keyErrors,
                        Supplier<List<String>> valueErrors) {
  if (!currentType.isMapType()) {
    return ImmutableList.of(String.format(": %s cannot be read as a map", currentType));
  }

  Types.MapType fileMap = currentType.asNestedType().asMapType();
  List<String> errors = Lists.newArrayList();
  try {
    if (readMap.isValueRequired() && fileMap.isValueOptional()) {
      errors.add(": values should be required, but are optional");
    }

    // visit keys, then values, updating the traversal context for each supplier
    this.currentType = fileMap.keyType();
    errors.addAll(keyErrors.get());

    this.currentType = fileMap.valueType();
    errors.addAll(valueErrors.get());

    return errors;
  } finally {
    // always restore the map type for the caller
    this.currentType = fileMap;
  }
}
// Builds a filter converter scoped to the struct type of a single named column.
private ConvertColumnFilterToParquet(Schema schema, String column) {
  super(schema);
  // NOTE(review): findField returns null for an unknown column name, which
  // would NPE here — presumably callers validate the column exists; confirm.
  this.partitionStruct = schema.findField(column).type().asNestedType().asStructType();
}
@Override public Type field(Types.NestedField field, Type fieldResult) { // the API validates deletes, updates, and additions don't conflict int fieldId = field.fieldId(); if (deletes.contains(fieldId)) { return null; } Types.NestedField update = updates.get(field.fieldId()); if (update != null && update.type() != field.type()) { // rename is handled in struct return update.type(); } Collection<Types.NestedField> newFields = adds.get(fieldId); if (newFields != null && !newFields.isEmpty()) { return addFields(fieldResult.asNestedType().asStructType(), newFields); } return fieldResult; }
@Override public Schema map(Schema map, Supplier<Schema> value) { Preconditions.checkArgument(current.isNestedType() && current.asNestedType().isMapType(), "Incompatible projected type: %s", current); Types.MapType m = current.asNestedType().asMapType(); Preconditions.checkArgument(m.keyType() == Types.StringType.get(), "Incompatible projected type: key type %s is not string", m.keyType()); this.current = m.valueType(); try { Schema valueSchema = value.get(); // element was changed, create a new map if (valueSchema != map.getValueType()) { return Schema.createMap(valueSchema); } return map; } finally { this.current = m; } }
/**
 * Selects the columns of a schema identified by the given field ids.
 *
 * @param schema a schema to prune
 * @param fieldIds the ids of fields to keep
 * @return a schema containing only the selected fields, preserving aliases
 */
public static Schema select(Schema schema, Set<Integer> fieldIds) {
  Preconditions.checkNotNull(schema, "Schema cannot be null");
  Preconditions.checkNotNull(fieldIds, "Field ids cannot be null");

  Type result = visit(schema, new PruneColumns(fieldIds));
  if (schema.asStruct() == result) {
    // pruning kept everything; reuse the original schema
    return schema;
  }

  if (result == null) {
    // nothing was selected
    return new Schema(ImmutableList.of(), schema.getAliases());
  }

  if (schema.getAliases() == null) {
    return new Schema(result.asNestedType().fields());
  }
  return new Schema(result.asNestedType().fields(), schema.getAliases());
}
/**
 * Converts a Parquet {@link MessageType} to an equivalent {@link Schema},
 * preserving the name aliases collected during conversion.
 */
public static Schema convert(MessageType parquetSchema) {
  MessageTypeToType toIceberg = new MessageTypeToType(parquetSchema);
  return new Schema(
      ParquetTypeVisitor.visit(parquetSchema, toIceberg).asNestedType().fields(),
      toIceberg.getAliases());
}
// Applies the accumulated deletes, updates, and adds to a schema and
// rebuilds a Schema from the resulting struct.
private static Schema applyChanges(Schema schema, List<Integer> deletes,
                                   Map<Integer, Types.NestedField> updates,
                                   Multimap<Integer, Types.NestedField> adds) {
  Types.StructType applied = TypeUtil
      .visit(schema, new ApplyChanges(deletes, updates, adds))
      .asNestedType()
      .asStructType();
  return new Schema(applied.fields());
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType) {
  // no filters are involved, so no extra column ids need to be retained
  PruneColumnsWithoutReordering visitor =
      new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of());
  Type pruned = visit(schema, visitor);
  return new Schema(pruned.asNestedType().asStructType().fields());
}
/**
 * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
 * <p>
 * This conversion assigns fresh ids.
 * <p>
 * Some data types are represented as the same Spark type. These are converted to a default type.
 * <p>
 * To convert using a reference schema for field ids and ambiguous types, use
 * {@link #convert(Schema, StructType)}.
 *
 * @param sparkType a Spark StructType
 * @return the equivalent Schema
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Schema convert(StructType sparkType) {
  return new Schema(
      visit(sparkType, new SparkTypeToType(sparkType))
          .asNestedType()
          .asStructType()
          .fields());
}
/**
 * Assigns fresh ids from the {@link NextID nextId function} for all fields in a schema.
 *
 * @param schema a schema
 * @param nextId an id assignment function
 * @return a structurally identical schema with new ids assigned by the nextId function
 */
public static Schema assignFreshIds(Schema schema, NextID nextId) {
  Type reassigned = TypeUtil.visit(schema.asStruct(), new AssignFreshIds(nextId));
  return new Schema(reassigned.asNestedType().fields());
}
/**
 * Returns a {@link Schema} for the given table with fresh field ids.
 * <p>
 * This creates a Schema for an existing table by looking up the table's schema with Spark and
 * converting that schema. Spark/Hive partition columns are included in the schema.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
  StructType sparkType = spark.table(name).schema();
  return new Schema(
      visit(sparkType, new SparkTypeToType(sparkType))
          .asNestedType()
          .asStructType()
          .fields());
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 * <p>
 * The filters list of {@link Expression} is used to ensure that columns referenced by filters
 * are projected.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @param filters a list of filters
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType, List<Expression> filters) {
  // columns referenced by filters must survive pruning even if not requested
  Set<Integer> filterRefs = Binder.boundReferences(schema.asStruct(), filters);
  Type pruned = visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs));
  return new Schema(pruned.asNestedType().asStructType().fields());
}
/**
 * Converts an Iceberg {@link NestedField} to the corresponding Parquet {@link Type},
 * dispatching on whether the field is primitive, struct, map, or list.
 */
public Type field(NestedField field) {
  Type.Repetition repetition = field.isOptional()
      ? Type.Repetition.OPTIONAL
      : Type.Repetition.REQUIRED;
  int id = field.fieldId();
  String name = field.name();

  if (field.type().isPrimitiveType()) {
    return primitive(field.type().asPrimitiveType(), repetition, id, name);
  }

  NestedType nested = field.type().asNestedType();
  if (nested.isStructType()) {
    return struct(nested.asStructType(), repetition, id, name);
  } else if (nested.isMapType()) {
    return map(nested.asMapType(), repetition, id, name);
  } else if (nested.isListType()) {
    return list(nested.asListType(), repetition, id, name);
  }

  throw new UnsupportedOperationException("Can't convert unknown type: " + nested);
}
@Test
public void testStructs() throws Exception {
  Types.StructType struct = Types.StructType.of(
      Types.NestedField.required(34, "Name!", Types.StringType.get()),
      Types.NestedField.optional(35, "col", Types.DecimalType.of(38, 2)));

  Type copy = TestHelpers.roundTripSerialize(struct);
  Assert.assertEquals("Struct serialization should be equal to starting type", struct, copy);

  // look up fields on the deserialized copy by name and by id
  Types.StructType copyStruct = copy.asNestedType().asStructType();
  Assert.assertSame("Struct serialization should preserve identity type",
      Types.StringType.get(), copyStruct.fieldType("Name!"));
  Assert.assertEquals("Struct serialization should support id lookup",
      Types.DecimalType.of(38, 2), copyStruct.field(35).type());
}
@Test
public void testLists() throws Exception {
  // fixed: local was misnamed "maps" (copy-paste from testMaps)
  Type[] lists = new Type[] {
      Types.ListType.ofOptional(2, Types.DoubleType.get()),
      Types.ListType.ofRequired(5, Types.DoubleType.get())
  };

  for (Type list : lists) {
    Type copy = TestHelpers.roundTripSerialize(list);
    Assert.assertEquals("List serialization should be equal to starting type", list, copy);
    // fixed: assert on the deserialized copy, not the original, so the check
    // actually exercises the serialization round-trip (matches testStructs)
    Assert.assertSame("List serialization should preserve identity type",
        Types.DoubleType.get(), copy.asNestedType().asListType().elementType());
  }
}
@Test
public void testMaps() throws Exception {
  Type[] maps = new Type[] {
      Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.LongType.get()),
      Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.LongType.get())
  };

  for (Type map : maps) {
    Type copy = TestHelpers.roundTripSerialize(map);
    Assert.assertEquals("Map serialization should be equal to starting type", map, copy);
    // fixed: assert on the deserialized copy, not the original, so the check
    // actually exercises the serialization round-trip (matches testStructs)
    Assert.assertSame("Map serialization should preserve identity type",
        Types.LongType.get(), copy.asNestedType().asMapType().valueType());
  }
}