/**
 * Sets the output format of the file dataset.
 */
public Builder setOutputFormat(Class<?> outputFormatClass) {
  setOutputFormat(outputFormatClass.getName());
  return this;
}
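A minimal usage sketch (hypothetical, not from the original source): because the Class-based overload above delegates to the String-based setOutputFormat, the two builders below record the same property. TextOutputFormat here is Hadoop's org.apache.hadoop.mapreduce.lib.output.TextOutputFormat, as in the snippets that follow.

// Hypothetical sketch: both builders produce the same output-format setting,
// since setOutputFormat(Class<?>) delegates to setOutputFormat(String).
DatasetProperties byClass = FileSetProperties.builder()
  .setOutputFormat(TextOutputFormat.class)            // Class<?> overload
  .build();
DatasetProperties byName = FileSetProperties.builder()
  .setOutputFormat(TextOutputFormat.class.getName())  // String overload
  .build();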
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder properties = FileSetProperties.builder();
  if (!Strings.isNullOrEmpty(tpfsSinkConfig.basePath)) {
    properties.setBasePath(tpfsSinkConfig.basePath);
  }
  properties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty :
       outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    properties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  addFileSetProperties(properties);
  return properties.build();
}
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder fileProperties = SnapshotFileSet.getBaseProperties(config);
  addFileProperties(fileProperties);
  fileProperties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty :
       outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    fileProperties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  return fileProperties.build();
}
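Both createProperties variants above read two methods off an OutputFormatProvider. For context, a minimal hypothetical implementation (the class name and separator value are invented for illustration; the two overridden methods are exactly the ones the snippets call):

// Hypothetical provider: supplies the output format class name and its configuration.
public class TextOutputFormatProvider implements OutputFormatProvider {

  @Override
  public String getOutputFormatClassName() {
    // Passed to FileSetProperties.Builder.setOutputFormat(String)
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    // Each entry is copied into setOutputProperty(key, value)
    return ImmutableMap.of(TextOutputFormat.SEPERATOR, ",");
  }
}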
.setBasePath("rtInput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); .setBasePath("rtOutput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); .setBasePath("rtInput2") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build());
.setPartitioning(PARTITIONING)
.setEnableExploreOnCreate(true)
.setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
.setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
.setExploreFormat("csv")
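This fragment and the surrounding ones omit their enclosing call. A hedged reconstruction of how such a chain is typically completed, following the fuller partitioned-file-set examples later in this listing (the dataset name "records" is invented, and PARTITIONING is assumed to be a Partitioning constant defined elsewhere in the original class):

// Hypothetical completion: the builder chain normally sits inside createDataset(...).
createDataset("records", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
  .setPartitioning(PARTITIONING)  // assumed to be declared elsewhere
  .setEnableExploreOnCreate(true)
  .setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
  .setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
  .setExploreFormat("csv")
  .setExploreSchema("f1 STRING, f2 INT")
  .build());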
.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
.build());

.setOutputFormat(TextOutputFormat.class)
.setOutputProperty(TextOutputFormat.SEPERATOR, ",")
@Override
public void configure() {
  try {
    createDataset("fs", FileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(MyTextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":").build());
    createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addStringField("x").build())
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":").build());
    createDataset("tpfs", TimePartitionedFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":").build());
    createDataset("myfs", MyFileSet.class, FileSetProperties.builder()
      .setInputFormat(MyTextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":").build());
    addSpark(new FileCountSparkProgram());
    addSpark(new ScalaFileCountSparkProgram());
  } catch (Throwable t) {
    throw Throwables.propagate(t);
  }
}
@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
  // create a dataset with text input and output formats
  DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
  dsFrameworkUtil.createInstance("fileSet", datasetId, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .build());

  // without passing anything in arguments, the input/output format classes will come from dataset properties
  FileSet fs = dsFrameworkUtil.getInstance(datasetId);
  Assert.assertEquals(TextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // allow overriding the input format in dataset runtime args
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName());

  // allow overriding both the input and output format in dataset runtime args
  fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(
    FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName(),
    FileSetProperties.OUTPUT_FORMAT, NullOutputFormat.class.getName()));
  Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName());
  Assert.assertEquals(NullOutputFormat.class.getName(), fs.getOutputFormatClassName());
}
@Override public void configure() { setName("AppWithMapReduceUsingFile"); setDescription("Application with MapReduce job using file as dataset"); createDataset(INPUT, "table"); createDataset(OUTPUT, "table"); Class<? extends InputFormat> inputFormatClass = getConfig().isUseCombineFileInputFormat() ? CombineTextInputFormat.class : TextInputFormat.class; createDataset(PARTITIONED, "partitionedFileSet", PartitionedFileSetProperties.builder() .setPartitioning(Partitioning.builder() .addStringField("type") .addLongField("time") .build()) // properties for file set .setBasePath("partitioned") .setInputFormat(inputFormatClass) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, SEPARATOR) // don't configure properties for the Hive table - this is used in a context where explore is disabled .build()); addMapReduce(new PartitionWriter()); addMapReduce(new PartitionReader()); }
@Override public void configure() { setName("AppWithMapReduceUsingFile"); setDescription("Application with MapReduce job using file as dataset"); String inputDataset = getConfig().inputDataset; String outputDataset = getConfig().outputDataset; createDataset(inputDataset, "fileSet", FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); if (!outputDataset.equals(inputDataset)) { createDataset(outputDataset, "fileSet", FileSetProperties.builder() .setBasePath("foo/my-file-output") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .build()); } addMapReduce(new ComputeSum(getConfig())); }
@Override public void configure() { setName("AppWithMapReduceUsingMultipleInputs"); setDescription("Application with MapReduce job using multiple inputs"); createDataset(PURCHASES, "fileSet", FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .build()); createDataset(PURCHASES2, "fileSet", FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .build()); createDataset(CUSTOMERS, "fileSet", FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .build()); createDataset(OUTPUT_DATASET, "fileSet", FileSetProperties.builder() .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, " ") .build()); addMapReduce(new ComputeSum()); addMapReduce(new InvalidMapReduce()); }
@Override
public void configure() {
  addService(new PartitionService());
  // Create a partitioned file set, configure it to work with MapReduce and with Explore
  createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
    // Properties for partitioning
    .setPartitioning(Partitioning.builder().addStringField("partition").addIntField("sub-partition").build())
    // Properties for file set
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .setOutputProperty(TextOutputFormat.SEPERATOR, ",")
    // Properties for Explore (to create a partitioned Hive table)
    .setEnableExploreOnCreate(true)
    .setExploreFormat("csv")
    .setExploreSchema("f1 STRING, f2 INT")
    .setDescription("App for testing authorization in partitioned filesets.")
    .build());
}
@Override public void configure() { setName("AppWithMapReduceUsingFile"); setDescription("Application with MapReduce job using file as dataset"); createDataset(INPUT, "table"); createDataset(OUTPUT, "table"); createDataset(TIME_PARTITIONED, "timePartitionedFileSet", FileSetProperties.builder() // properties for file set .setBasePath("partitioned") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, SEPARATOR) // don't configure properties for the Hive table - this is used in a context where explore is disabled .build()); addMapReduce(new PartitionWriter()); addMapReduce(new PartitionReader()); }
@Override
protected void configure() {
  setName(WORKFLOW_NAME);
  setDescription("Workflow program with local datasets.");
  createLocalDataset(WORDCOUNT_DATASET, KeyValueTable.class);
  createLocalDataset(CSV_FILESET_DATASET, FileSet.class, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .build());
  createLocalDataset(UNIQUE_ID_DATASET, KeyValueTable.class);
  addAction(new LocalDatasetWriter());
  addSpark("JavaSparkCSVToSpaceConverter");
  addMapReduce("WordCount");
  addAction(new LocalDatasetReader("readerAction"));
}
@Override public void configure() { setName("AppWithMapReduceUsingMultipleOutputs"); setDescription("Application with MapReduce job using multiple outputs"); createDataset(PURCHASES, "fileSet", FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .build()); createDataset(SEPARATED_PURCHASES, "fileSet", FileSetProperties.builder() .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, " ") .build()); addMapReduce(new SeparatePurchases()); addMapReduce(new InvalidMapReduce()); }
@Override
protected void addFileProperties(FileSetProperties.Builder propertiesBuilder) {
  propertiesBuilder
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .setEnableExploreOnCreate(true)
    .setExploreFormat("text")
    .setExploreSchema("text string");
}
@Override
protected void configure() {
  setName(SPARK);
  setMainClass(getClass());
  createDataset(SPARK_INPUT, FileSet.class, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class)
    .setOutputProperty(TextOutputFormat.SEPERATOR, ":").build());
  createDataset(SPARK_TABLE, Table.class);
  usePlugin("t1", "n1", "plugin", PluginProperties.builder().add(KEY, TEST).build());
}
@Override public void configure() { createDataset("logs", FileSet.class, FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class).build()); createDataset("logStats", KeyValueTable.class.getName()); addSpark(new SparkLogParser()); addSpark(new ScalaSparkLogParser()); }
@Override
protected void configure() {
  createDataset(INPUT_FILE_SET, FileSet.class, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class).build());
  createDataset(OUTPUT_FILE_SET, FileSet.class, FileSetProperties.builder()
    .setInputFormat(TextInputFormat.class)
    .setOutputFormat(TextOutputFormat.class).build());
}