/**
 * Set the input format used to create the Hive table, given as a class.
 * Note that this can be different than the input format used for the
 * file set itself.
 *
 * @param inputFormat the input format class; its fully-qualified name is recorded
 * @return this builder, for chaining
 */
public Builder setExploreInputFormat(Class<?> inputFormat) {
  // Delegate to the String-based overload using the class's binary name.
  String className = inputFormat.getName();
  return setExploreInputFormat(className);
}
/** * Configure a file set to use ORC file format with a given schema. The schema is parsed * validated and converted into a Hive schema which is compatible with ORC format. The file set is configured to use * ORC input and output format, and also configured for Explore to use Hive. The schema is added * to the file set properties in all the different required ways: * <ul> * <li>As a top-level dataset property;</li> * <li>As the schema for the input and output format;</li> * <li>As the schema to be used by the ORC serde (which is used by Hive).</li> * </ul> * * @param configuredSchema the original schema configured for the table * @param properties a builder for the file set properties */ public static void configureORCFileSet(String configuredSchema, FileSetProperties.Builder properties) { //TODO test if complex cases run with lowercase schema only String lowerCaseSchema = configuredSchema.toLowerCase(); String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema); hiveSchema = hiveSchema.substring(1, hiveSchema.length() - 1); properties.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat") .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde") .setExploreSchema(hiveSchema) .setEnableExploreOnCreate(true) .add(DatasetProperties.SCHEMA, configuredSchema) .build(); }
// NOTE(review): detached fragment of a builder chain (starts mid-expression, no enclosing
// method visible) — looks like a leftover snippet or a partial duplicate of the ORC
// configuration above, with a placeholder schema "record STRING". TODO confirm whether this
// belongs to a method outside this view or should be deleted.
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat") .setExploreSchema("record STRING")
/**
 * Configure a file set to use Avro file format with a given schema. The schema is parsed
 * as an Avro schema, validated and converted into a Hive schema. The file set is configured
 * to use Avro key input and output format, and also configured for Explore to use Avro.
 * The schema is added to the file set properties in all the different required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema of the Hive table;</li>
 *   <li>As the schema to be used by the Avro serde (which is used by Hive).</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureAvroFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  // Hive/Avro integration classes, named here for readability.
  final String avroSerDe = "org.apache.hadoop.hive.serde2.avro.AvroSerDe";
  final String avroInputFormat = "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat";
  final String avroOutputFormat = "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat";
  properties
    .setEnableExploreOnCreate(true)
    .setSerDe(avroSerDe)
    .setExploreInputFormat(avroInputFormat)
    .setExploreOutputFormat(avroOutputFormat)
    // The Avro serde reads the schema from this table property.
    .setTableProperty("avro.schema.literal", configuredSchema)
    .add(DatasetProperties.SCHEMA, configuredSchema);
}