/**
 * Sets the input format of the file dataset.
 *
 * @param inputFormatClass the input format implementation to use
 * @return this builder, for method chaining
 */
public Builder setInputFormat(Class<?> inputFormatClass) {
  // Delegate to the String overload with the fully qualified class name.
  String className = inputFormatClass.getName();
  setInputFormat(className);
  return this;
}
/**
 * Assembles the dataset properties for this snapshot file set, layering the
 * configured base path and the provider's input format on top of the base properties.
 *
 * @param inputFormatProvider supplies the input format class name and its configuration
 * @return the fully built dataset properties
 */
private DatasetProperties createProperties(InputFormatProvider inputFormatProvider) {
  FileSetProperties.Builder properties = SnapshotFileSet.getBaseProperties(config);

  // Only override the base path when one was explicitly configured.
  String basePath = config.getBasePath();
  if (!Strings.isNullOrEmpty(basePath)) {
    properties.setBasePath(basePath);
  }

  properties.setInputFormat(inputFormatProvider.getInputFormatClassName());
  // Copy every input-format configuration entry onto the fileset properties.
  inputFormatProvider.getInputFormatConfiguration().forEach(properties::setInputProperty);

  addFileProperties(properties);
  return properties.build();
}
/**
 * Assembles the dataset properties for this file set, applying the configured
 * base path (if any) and the provider's input format and configuration.
 *
 * @param inputFormatProvider supplies the input format class name and its configuration
 * @return the fully built dataset properties
 */
private DatasetProperties createProperties(InputFormatProvider inputFormatProvider) {
  FileSetProperties.Builder properties = FileSetProperties.builder();

  // Only override the base path when one was explicitly configured.
  String basePath = config.getBasePath();
  if (!Strings.isNullOrEmpty(basePath)) {
    properties.setBasePath(basePath);
  }

  properties.setInputFormat(inputFormatProvider.getInputFormatClassName());
  // Copy every input-format configuration entry onto the fileset properties.
  inputFormatProvider.getInputFormatConfiguration().forEach(properties::setInputProperty);

  addFileSetProperties(properties);
  return properties.build();
}
.setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") dsFramework.addInstance("fileSet", rtOutput1, FileSetProperties.builder() .setBasePath("rtOutput1") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
.builder().setBasePath("some/path").setInputFormat(TextInputFormat.class).build()); DataSetManager<FileSet> bManager = getDataset("b"); String bFormat = bManager.get().getInputFormatClassName();
// Dataset id "input" resolved within the input namespace.
DatasetId inputDatasetId = inputNSMeta.getNamespaceId().dataset("input");
// Create a FileSet instance that reads files with the Hadoop text input format.
addDatasetInstance(FileSet.class.getName(), inputDatasetId,
                   FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
@Override
public void configure() {
  try {
    // Plain file set: custom text input/output formats, colon-separated output.
    createDataset("fs", FileSet.class, FileSetProperties.builder()
        .setInputFormat(MyTextInputFormat.class)
        .setOutputFormat(MyTextOutputFormat.class)
        .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
        .build());
    // Partitioned file set keyed on a single string field "x".
    createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
        .setPartitioning(Partitioning.builder().addStringField("x").build())
        .setInputFormat(MyTextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
        .build());
    // Time-partitioned file set with the same text formats.
    createDataset("tpfs", TimePartitionedFileSet.class, FileSetProperties.builder()
        .setInputFormat(MyTextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
        .build());
    // Custom file set type with the same text formats.
    createDataset("myfs", MyFileSet.class, FileSetProperties.builder()
        .setInputFormat(MyTextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
        .build());
    addSpark(new FileCountSparkProgram());
    addSpark(new ScalaFileCountSparkProgram());
  } catch (Throwable cause) {
    // Configuration failures are rethrown unchecked.
    throw Throwables.propagate(cause);
  }
}
@Test public void testInputOutputFormatClassAtRuntime() throws Exception { // create a dataset with text input and output formats DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats"); dsFrameworkUtil.createInstance("fileSet", datasetId, FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .build()); // without passing anything in arguments, the input/output format classes will come from dataset properties FileSet fs = dsFrameworkUtil.getInstance(datasetId); Assert.assertEquals(TextInputFormat.class.getName(), fs.getInputFormatClassName()); Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName()); // allow overriding the input format in dataset runtime args fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of( FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName())); Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName()); Assert.assertEquals(TextOutputFormat.class.getName(), fs.getOutputFormatClassName()); // allow overriding both the input and output format in dataset runtime args fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of( FileSetProperties.INPUT_FORMAT, CombineTextInputFormat.class.getName(), FileSetProperties.OUTPUT_FORMAT, NullOutputFormat.class.getName())); Assert.assertEquals(CombineTextInputFormat.class.getName(), fs.getInputFormatClassName()); Assert.assertEquals(NullOutputFormat.class.getName(), fs.getOutputFormatClassName()); }
@Override public void configure() { setName("AppWithMapReduceUsingFile"); setDescription("Application with MapReduce job using file as dataset"); createDataset(INPUT, "table"); createDataset(OUTPUT, "table"); Class<? extends InputFormat> inputFormatClass = getConfig().isUseCombineFileInputFormat() ? CombineTextInputFormat.class : TextInputFormat.class; createDataset(PARTITIONED, "partitionedFileSet", PartitionedFileSetProperties.builder() .setPartitioning(Partitioning.builder() .addStringField("type") .addLongField("time") .build()) // properties for file set .setBasePath("partitioned") .setInputFormat(inputFormatClass) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, SEPARATOR) // don't configure properties for the Hive table - this is used in a context where explore is disabled .build()); addMapReduce(new PartitionWriter()); addMapReduce(new PartitionReader()); }
@Override
public void configure() {
  setName("AppWithMapReduceUsingFile");
  setDescription("Application with MapReduce job using file as dataset");

  String input = getConfig().inputDataset;
  String output = getConfig().outputDataset;

  // Input file set: colon-separated text records.
  createDataset(input, "fileSet", FileSetProperties.builder()
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());

  // Only create a separate output file set when it differs from the input.
  if (!output.equals(input)) {
    createDataset(output, "fileSet", FileSetProperties.builder()
        .setBasePath("foo/my-file-output")
        .setInputFormat(TextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
        .build());
  }

  addMapReduce(new ComputeSum(getConfig()));
}
@Override
public void configure() {
  setName("AppWithMapReduceUsingMultipleInputs");
  setDescription("Application with MapReduce job using multiple inputs");

  // All three input datasets are text file sets with identical properties.
  for (String name : new String[] { PURCHASES, PURCHASES2, CUSTOMERS }) {
    createDataset(name, "fileSet", FileSetProperties.builder()
        .setInputFormat(TextInputFormat.class)
        .build());
  }

  // Output file set: space-separated text records.
  createDataset(OUTPUT_DATASET, "fileSet", FileSetProperties.builder()
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, " ")
      .build());

  addMapReduce(new ComputeSum());
  addMapReduce(new InvalidMapReduce());
}
@Override public void configure() { setName("AppWithMapReduceUsingFile"); setDescription("Application with MapReduce job using file as dataset"); createDataset(INPUT, "table"); createDataset(OUTPUT, "table"); createDataset(TIME_PARTITIONED, "timePartitionedFileSet", FileSetProperties.builder() // properties for file set .setBasePath("partitioned") .setInputFormat(TextInputFormat.class) .setOutputFormat(TextOutputFormat.class) .setOutputProperty(TextOutputFormat.SEPERATOR, SEPARATOR) // don't configure properties for the Hive table - this is used in a context where explore is disabled .build()); addMapReduce(new PartitionWriter()); addMapReduce(new PartitionReader()); }
@Override
protected void configure() {
  setName(WORKFLOW_NAME);
  setDescription("Workflow program with local datasets.");

  createLocalDataset(WORDCOUNT_DATASET, KeyValueTable.class);
  // Local CSV file set read and written as plain text.
  createLocalDataset(CSV_FILESET_DATASET, FileSet.class, FileSetProperties.builder()
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .build());
  createLocalDataset(UNIQUE_ID_DATASET, KeyValueTable.class);

  addAction(new LocalDatasetWriter());
  addSpark("JavaSparkCSVToSpaceConverter");
  addMapReduce("WordCount");
  addAction(new LocalDatasetReader("readerAction"));
}
}
@Override
public void configure() {
  setName("AppWithMapReduceUsingMultipleOutputs");
  setDescription("Application with MapReduce job using multiple outputs");

  // Input: plain text purchases.
  createDataset(PURCHASES, "fileSet", FileSetProperties.builder()
      .setInputFormat(TextInputFormat.class)
      .build());

  // Output: space-separated text records.
  createDataset(SEPARATED_PURCHASES, "fileSet", FileSetProperties.builder()
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, " ")
      .build());

  addMapReduce(new SeparatePurchases());
  addMapReduce(new InvalidMapReduce());
}
@Override
protected void addFileProperties(FileSetProperties.Builder propertiesBuilder) {
  // Read and write plain text files.
  propertiesBuilder.setInputFormat(TextInputFormat.class);
  propertiesBuilder.setOutputFormat(TextOutputFormat.class);
  // Register an explorable text table with a single string column named "text".
  propertiesBuilder.setEnableExploreOnCreate(true);
  propertiesBuilder.setExploreFormat("text");
  propertiesBuilder.setExploreSchema("text string");
}
@Override
protected void configure() {
  setName(SPARK);
  setMainClass(getClass());

  // Spark input: colon-separated text file set.
  createDataset(SPARK_INPUT, FileSet.class, FileSetProperties.builder()
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .setOutputProperty(TextOutputFormat.SEPERATOR, ":")
      .build());
  createDataset(SPARK_TABLE, Table.class);

  usePlugin("t1", "n1", "plugin", PluginProperties.builder().add(KEY, TEST).build());
}
@Override
public void configure() {
  // Raw logs come in and go out as plain text.
  createDataset("logs", FileSet.class, FileSetProperties.builder()
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .build());
  // Aggregated statistics are stored as key/value pairs.
  createDataset("logStats", KeyValueTable.class.getName());

  addSpark(new SparkLogParser());
  addSpark(new ScalaSparkLogParser());
}
@Override
protected void configure() {
  // Input and output file sets share identical text-format properties.
  for (String name : new String[] { INPUT_FILE_SET, OUTPUT_FILE_SET }) {
    createDataset(name, FileSet.class, FileSetProperties.builder()
        .setInputFormat(TextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .build());
  }
}
@Override public void initialize(SparkHttpServiceContext context) throws Exception { super.initialize(context); try { context.getAdmin().createDataset("wordcount", FileSet.class.getName(), FileSetProperties.builder() .setInputFormat(TextInputFormat.class) .build()); } catch (InstanceConflictException e) { // It's ok if the dataset already exists } }