/**
 * Convert a Spark {@link DataType} to a {@link Type} with new field ids.
 * <p>
 * This conversion assigns fresh ids.
 * <p>
 * Some data types are represented as the same Spark type. These are converted to a default type.
 * <p>
 * To convert using a reference schema for field ids and ambiguous types, use
 * {@link #convert(Schema, StructType)}.
 *
 * @param sparkType a Spark DataType
 * @return the equivalent Type
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Type convert(DataType sparkType) {
  return visit(sparkType, new SparkTypeToType());
}
/**
 * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
 * <p>
 * This conversion assigns fresh ids.
 * <p>
 * Some data types are represented as the same Spark type. These are converted to a default type.
 * <p>
 * To convert using a reference schema for field ids and ambiguous types, use
 * {@link #convert(Schema, StructType)}.
 *
 * @param sparkType a Spark StructType
 * @return the equivalent Schema
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Schema convert(StructType sparkType) {
  // visit with the struct as the root so its top-level fields get ordinal ids
  Types.StructType structType = visit(sparkType, new SparkTypeToType(sparkType))
      .asNestedType()
      .asStructType();
  return new Schema(structType.fields());
}
/** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. * <p> * This conversion does not assign new ids; it uses ids from the base schema. * <p> * Data types, field order, and nullability will match the spark type. This conversion may return * a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted or there are missing ids */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids Types.StructType struct = visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) return FixupTypes.fixup(schema, baseSchema); }
@Override public Type struct(StructType struct, List<Type> types) { StructField[] fields = struct.fields(); List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { StructField field = fields[i]; Type type = types.get(i); int id; if (isRoot) { // for new conversions, use ordinals for ids in the root struct id = i; } else { id = getNextId(); } if (field.nullable()) { newFields.add(Types.NestedField.optional(id, field.name(), type)); } else { newFields.add(Types.NestedField.required(id, field.name(), type)); } } return Types.StructType.of(newFields); }
/**
 * Returns a {@link Schema} for the given table with fresh field ids.
 * <p>
 * This creates a Schema for an existing table by looking up the table's schema with Spark and
 * converting that schema. Spark/Hive partition columns are included in the schema.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 * @throws IllegalArgumentException if the table's schema cannot be converted
 */
public static Schema schemaForTable(SparkSession spark, String name) {
  // delegate to convert(StructType) rather than duplicating its conversion logic,
  // so the two code paths cannot drift apart
  return convert(spark.table(name).schema());
}