@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
  // create a dataset with text input and output formats
  DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
  dsFrameworkUtil.createInstance("fileSet", datasetId, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .build());

  // without passing anything in the arguments, the input/output format classes come from the dataset properties
  FileSet fs = dsFrameworkUtil.getInstance(datasetId);
  Assert.assertEquals(TextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // the input format can be overridden in the dataset runtime arguments
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // both the input and the output format can be overridden in the dataset runtime arguments
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName(),
    FileSetProperties.OUTPUT_FORMAT, NullOutputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(NullOutputFormat.class.getName(), fs.getOutputFormatClassName());
}
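// A minimal sketch of how the same override could be passed to a program run instead of to
// getInstance(): CDAP scopes dataset runtime arguments as "dataset.<name>.<key>". The scoping
// prefix and the argument map below are illustrative assumptions, not taken from the test above.
Map<String, String> programArgs = ImmutableMap.of(
  "dataset.testRuntimeFormats." + FileSetProperties.INPUT_FORMAT,
  CombineTextInputFormat.class.getName());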
.setBasePath(existingPath)
.setPossessExisting(true)
.build());
Assert.assertTrue(someFile.exists());
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder fileProperties = SnapshotFileSet.getBaseProperties(config);
  addFileProperties(fileProperties);
  // take the output format class from the provider and copy each of its
  // configuration entries into the file set's output properties
  fileProperties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty :
       outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    fileProperties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  return fileProperties.build();
}
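// A hypothetical OutputFormatProvider that createProperties() could consume, for illustration
// only: it names an output format class and carries a single configuration entry.
OutputFormatProvider provider = new OutputFormatProvider() {
  @Override
  public String getOutputFormatClassName() {
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    return ImmutableMap.of(TextOutputFormat.SEPERATOR, ",");
  }
};
DatasetProperties properties = createProperties(provider);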
.setBasePath(baseDir.getPath())
.setDataExternal(true)
.build());
spec = framework.getDatasetSpec(MY_DS);
Assert.assertNotNull(spec);
.setBasePath("perm/test/path") .setFilePermissions(fsPermissions) .setFileGroup(group) .build()); FileSet fs = dsFrameworkUtil.getInstance(datasetId);
.setBasePath("rtInput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); dsFramework.addInstance("fileSet", rtOutput1, FileSetProperties.builder() .setBasePath("rtOutput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); .setBasePath("rtInput2") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); runtimeArguments = Maps.newHashMap(); runtimeArguments.put(AppWithMapReduceUsingRuntimeDatasets.INPUT_NAME, "rtInput2");
PartitionedFileSetProperties.builder()
  .setExploreFormat("csv")
  .setExploreSchema("key int, value string")
  .setEnableExploreOnCreate(true)
  .build());
DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
.setEnableExploreOnCreate(true)
.setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
.setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
.setExploreSchema("record STRING")
.build());
.builder().setBasePath("some/path").setInputFormat(TextInputFormat.class).build()); DataSetManager<FileSet> bManager = getDataset("b"); String bFormat = bManager.get().getInputFormatClassName();
.setPartitioning(PARTITIONING)
.setEnableExploreOnCreate(true)
.setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
.setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
.setExploreFormat("csv")
.setExploreSchema("key string, value string")
.build());
DatasetProperties filesetAvroTableProps = FileSetProperties.builder()
  .setTableProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_PROPERTY, "avro-table-schema")
  .build();
assertDatasetSchema("avro-table-schema", filesetAvroTableProps);

filesetAvroTableProps = FileSetProperties.builder()
  .setTableProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_PROPERTY, "avro-table-schema")
  .add(DatasetProperties.SCHEMA, "avro-schema")
  .build();
assertDatasetSchema("avro-schema", filesetAvroTableProps);

DatasetProperties filesetAvroOutputProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_OUTPUT_KEY, "avro-output-schema")
  .build();
assertDatasetSchema("avro-output-schema", filesetAvroOutputProps);

filesetAvroOutputProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_AVRO_SCHEMA_OUTPUT_KEY, "avro-output-schema")
  .add(DatasetProperties.SCHEMA, "avro-schema")
  .build();
assertDatasetSchema("avro-schema", filesetAvroOutputProps);

DatasetProperties filesetParquetProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_PARQUET_SCHEMA_OUTPUT_KEY, "parquet-output-schema")
  .build();
assertDatasetSchema("parquet-output-schema", filesetParquetProps);

filesetParquetProps = FileSetProperties.builder()
  .setOutputProperty(DatasetSystemMetadataProvider.FILESET_PARQUET_SCHEMA_OUTPUT_KEY, "parquet-output-schema")
  .add(DatasetProperties.SCHEMA, "parquet-schema")
  .build();
assertDatasetSchema("parquet-schema", filesetParquetProps);
.setPartitioning(PARTITIONING_1)
.setBasePath(absolutePath.getPath())
.setDataExternal(true)
.build());
final PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsExternalInstance);
DatasetId inputDatasetId = inputNSMeta.getNamespaceId().dataset("input");
addDatasetInstance(FileSet.class.getName(), inputDatasetId,
                   FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
.setBasePath(absolutePath)
.setDataExternal(true)
.build());
.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
.build());

.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
.setEnableExploreOnCreate(true)
.setExploreFormat("text")
.setExploreFormatProperty("delimiter", "\n")
.setExploreSchema("record STRING")
.build());
static void performAdmin(RuntimeContext context) {
  Admin admin = context.getAdmin();
  Map<String, String> args = context.getRuntimeArguments();
  try {
    // if invoked with dropAll=true, clean up all datasets (a, b, c, d)
    if ("true".equals(args.get("dropAll"))) {
      for (String name : new String[]{"a", "b", "c", "d"}) {
        if (admin.datasetExists(name)) {
          admin.dropDataset(name);
        }
      }
    } else {
      // create a, update b with a new base path taken from the arguments, truncate c, drop d
      admin.createDataset("a", Table.class.getName(), DatasetProperties.EMPTY);
      String type = admin.getDatasetType("b");
      Assert.assertEquals(FileSet.class.getName(), type);
      DatasetProperties bProps = admin.getDatasetProperties("b");
      String base = bProps.getProperties().get("base.path");
      Assert.assertNotNull(base);
      String newBase = args.get("new.base.path");
      DatasetProperties newBProps = ((FileSetProperties.Builder) FileSetProperties.builder()
        .addAll(bProps.getProperties()))
        .setDataExternal(true)
        .setBasePath(newBase)
        .build();
      admin.updateDataset("b", newBProps);
      admin.truncateDataset("c");
      admin.dropDataset("d");
    }
  } catch (DatasetManagementException e) {
    Throwables.propagate(e);
  }
}
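// A hedged sketch of how performAdmin() might be driven: a worker that delegates to it with
// its own context (WorkerContext extends RuntimeContext). The worker class itself is an
// assumption for illustration, not part of the code above.
public static class AdminWorker extends AbstractWorker {
  @Override
  public void run() {
    performAdmin(getContext());
  }
}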
/**
 * Configure a file set to use the ORC file format with a given schema. The schema is parsed,
 * validated, and converted into a Hive schema compatible with the ORC format. The file set is
 * configured to use ORC input and output formats, and is also configured for Explore to use
 * Hive. The schema is added to the file set properties in all the required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema to be used by the ORC serde (which is used by Hive).</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureORCFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  // TODO: test whether complex cases work with a lower-cased schema only
  String lowerCaseSchema = configuredSchema.toLowerCase();
  String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema);
  // strip the enclosing delimiters (first and last character) from the parsed Hive schema
  hiveSchema = hiveSchema.substring(1, hiveSchema.length() - 1);
  properties.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
    .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
    .setExploreSchema(hiveSchema)
    .setEnableExploreOnCreate(true)
    .add(DatasetProperties.SCHEMA, configuredSchema)
    .build();
}
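// A minimal usage sketch, assuming this is called from an application's configure() method;
// the dataset name, base path, and schema string are illustrative assumptions.
FileSetProperties.Builder orcProps = FileSetProperties.builder()
  .setBasePath("orc/records");
configureORCFileSet("record STRING", orcProps);
createDataset("orcFiles", FileSet.class, orcProps.build());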
@Override
public void configure() {
  try {
    createDataset("fs", FileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(MyTextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addStringField("x").build())
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("tpfs", TimePartitionedFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    createDataset("myfs", MyFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
    addSpark(new FileCountSparkProgram());
    addSpark(new ScalaFileCountSparkProgram());
  } catch (Throwable t) {
    throw Throwables.propagate(t);
  }
}
@Test
public void testFixProperties() throws DatasetManagementException, UnsupportedTypeException {
  testFix("fileSet",
          FileSetProperties.builder().setBasePath("/tmp/nn").setDataExternal(true).build());
  testFix(FileSet.class.getName(),
          FileSetProperties.builder().setEnableExploreOnCreate(true).setExploreFormat("csv").build());
  testFix("timePartitionedFileSet",
          FileSetProperties.builder().setBasePath("relative").build());
  testFix(TimePartitionedFileSet.class.getName(),
          FileSetProperties.builder().setBasePath("relative").add("custom", "value").build());
  testFix("objectMappedTable",
          ObjectMappedTableProperties.builder().setType(TestObject.class)
            .setRowKeyExploreName("x").setRowKeyExploreType(Schema.Type.STRING)
            .setConflictDetection(ConflictDetection.NONE).build());
  testFix(ObjectMappedTable.class.getName(),
          ObjectMappedTableProperties.builder().setType(TestObject.class)
            .setRowKeyExploreName("x").setRowKeyExploreType(Schema.Type.STRING)
            .setConflictDetection(ConflictDetection.NONE).build());
  testFix("lineageDataset", DatasetProperties.EMPTY);
  testFix(LineageDataset.class.getName(), TableProperties.builder().setTTL(1000).build());
  testFix(UsageDataset.class.getSimpleName(), DatasetProperties.EMPTY);
  testFix("table", TableProperties.builder().setColumnFamily("fam").build());
  testFix("indexedTable",
          DatasetProperties.builder().add(IndexedTable.INDEX_COLUMNS_CONF_KEY, "a,c").build());
}
/**
 * Configure a file set to use the Avro file format with a given schema. The schema is parsed
 * as an Avro schema, validated, and converted into a Hive schema. The file set is configured
 * to use the Avro key input and output format, and is also configured for Explore to use Avro.
 * The schema is added to the file set properties in all the required ways:
 * <ul>
 *   <li>As a top-level dataset property;</li>
 *   <li>As the schema for the input and output format;</li>
 *   <li>As the schema of the Hive table;</li>
 *   <li>As the schema to be used by the Avro serde (which is used by Hive).</li>
 * </ul>
 *
 * @param configuredSchema the original schema configured for the table
 * @param properties a builder for the file set properties
 */
public static void configureAvroFileSet(String configuredSchema, FileSetProperties.Builder properties) {
  properties
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", configuredSchema)
    .add(DatasetProperties.SCHEMA, configuredSchema);
}
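// A minimal usage sketch, assuming this is called from an application's configure() method;
// the dataset name, base path, and Avro record schema are illustrative assumptions.
String schema = "{\"type\":\"record\",\"name\":\"rec\",\"fields\":"
  + "[{\"name\":\"body\",\"type\":\"string\"}]}";
FileSetProperties.Builder avroProps = FileSetProperties.builder()
  .setBasePath("avro/records");
configureAvroFileSet(schema, avroProps);
createDataset("avroFiles", FileSet.class, avroProps.build());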