/**
 * Configure a file set to use Parquet file format with a given schema. The schema is lower-cased, parsed
 * as an Avro schema, validated and converted into a Hive schema. The file set is configured to use
 * Parquet input and output format, and also configured for Explore to use Parquet. The schema is added
 * to the file set properties in all the different required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema of the Hive table.</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureParquetFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  // Lower-case the schema before parsing, as the Javadoc above promises and as
  // configureORCFileSet already does — Hive treats column names as lower-case.
  String lowerCaseSchema = configuredSchema.toLowerCase();
  String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema);
  properties
    .setEnableExploreOnCreate(true)
    .setExploreFormat("parquet")
    // parseHiveSchema wraps the column list in enclosing delimiters; strip the
    // first and last character so Explore receives only the column list.
    .setExploreSchema(hiveSchema.substring(1, hiveSchema.length() - 1))
    // Keep the original (non-lower-cased) schema as the top-level dataset property.
    .add(DatasetProperties.SCHEMA, configuredSchema);
}
/** * Configure a file set to use ORC file format with a given schema. The schema is parsed * validated and converted into a Hive schema which is compatible with ORC format. The file set is configured to use * ORC input and output format, and also configured for Explore to use Hive. The schema is added * to the file set properties in all the different required ways: * <ul> * <li>As a top-level dataset property;</li> * <li>As the schema for the input and output format;</li> * <li>As the schema to be used by the ORC serde (which is used by Hive).</li> * </ul> * * @param configuredSchema the original schema configured for the table * @param properties a builder for the file set properties */ public static void configureORCFileSet(String configuredSchema, FileSetProperties.Builder properties) { //TODO test if complex cases run with lowercase schema only String lowerCaseSchema = configuredSchema.toLowerCase(); String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema); hiveSchema = hiveSchema.substring(1, hiveSchema.length() - 1); properties.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat") .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde") .setExploreSchema(hiveSchema) .setEnableExploreOnCreate(true) .add(DatasetProperties.SCHEMA, configuredSchema) .build(); }
PartitionedFileSetProperties.builder() .setExploreFormat("csv") .setExploreSchema("key int, value string") .setEnableExploreOnCreate(true) .build());
.setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",") .setExploreFormat("csv") .setExploreSchema("key string, value string") .build());
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat") .setExploreSchema("record STRING") .build());
.setExploreFormat("text") .setExploreFormatProperty("delimiter", "\n") .setExploreSchema("record STRING") .build());
@Override public void configure() { addService(new PartitionService()); // Create a partitioned file set, configure it to work with MapReduce and with Explore createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder() // Properties for partitioning .setPartitioning(Partitioning.builder().addStringField("partition").addIntField("sub-partition").build()) // Properties for file set .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ",") // Properties for Explore (to create a partitioned Hive table) .setEnableExploreOnCreate(true) .setExploreFormat("csv") .setExploreSchema("f1 STRING, f2 INT") .setDescription("App for testing authorization in partitioned filesets.") .build()); }
@Override public void configure() { createDataset(INPUT, KeyValueTable.class.getName(), DatasetProperties.EMPTY); // create two pfs, identical except for their (table) names for (String name : new String[] { PFS, OTHER }) { createDataset(name, PartitionedFileSet.class.getName(), PartitionedFileSetProperties.builder() .setPartitioning(Partitioning.builder().addIntField("number").build()) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ",") .setEnableExploreOnCreate(true) .setExploreTableName(name) .setExploreSchema("key STRING, value STRING") .setExploreFormat("csv") .build()); } addMapReduce(new PartitionWriterMR()); }
@Override
protected void addFileProperties(FileSetProperties.Builder propertiesBuilder) {
  // Read and write plain text files via the standard Hadoop text formats.
  propertiesBuilder.setInputFormat(TextInputFormat.class);
  propertiesBuilder.setOutputFormat(TextOutputFormat.class);
  // Register the file set with Explore as a single-column text table.
  propertiesBuilder.setEnableExploreOnCreate(true);
  propertiesBuilder.setExploreFormat("text");
  propertiesBuilder.setExploreSchema("text string");
}