/** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. * <p> * This conversion assigns fresh ids. * <p> * Some data types are represented as the same Spark type. These are converted to a default type. * <p> * To convert using a reference schema for field ids and ambiguous types, use * {@link #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type * @throws IllegalArgumentException if the type cannot be converted */ public static Type convert(DataType sparkType) { return visit(sparkType, new SparkTypeToType()); }
static <T> T visit(DataType type, SparkTypeVisitor<T> visitor) { if (type instanceof StructType) { StructField[] fields = ((StructType) type).fields(); List<T> fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { fieldResults.add(visitor.field( field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { return visitor.map((MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { return visitor.array( (ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType){ throw new UnsupportedOperationException( "User-defined types are not supported"); } else { return visitor.atomic(type); } }
/** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. * <p> * This conversion assigns fresh ids. * <p> * Some data types are represented as the same Spark type. These are converted to a default type. * <p> * To convert using a reference schema for field ids and ambiguous types, use * {@link #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ public static Schema convert(StructType sparkType) { Type converted = visit(sparkType, new SparkTypeToType(sparkType)); return new Schema(converted.asNestedType().asStructType().fields()); }
/** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. * <p> * This conversion does not assign new ids; it uses ids from the base schema. * <p> * Data types, field order, and nullability will match the spark type. This conversion may return * a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted or there are missing ids */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids Types.StructType struct = visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) return FixupTypes.fixup(schema, baseSchema); }
/** * Returns a {@link Schema} for the given table with fresh field ids. * <p> * This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session * @param name a table name and (optional) database * @return a Schema for the table, if found */ public static Schema schemaForTable(SparkSession spark, String name) { StructType sparkType = spark.table(name).schema(); Type converted = visit(sparkType, new SparkTypeToType(sparkType)); return new Schema(converted.asNestedType().asStructType().fields()); }