protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) {
  Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments);
  if (time != null) {
    // set the output path according to partition time
    if (FileSetArguments.getOutputPath(arguments) == null) {
      String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments);
      String path;
      if (Strings.isNullOrEmpty(outputPathFormat)) {
        path = String.format("%tF/%tH-%tM.%d", time, time, time, time);
      } else {
        SimpleDateFormat format = new SimpleDateFormat(outputPathFormat);
        String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments);
        if (!Strings.isNullOrEmpty(timeZoneID)) {
          format.setTimeZone(TimeZone.getTimeZone(timeZoneID));
        }
        path = format.format(new Date(time));
      }
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(arguments, path);
    }
    // add the corresponding partition key to the arguments
    PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time);
    PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey);
  }
  // delegate to super class for anything it needs to do
  return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING);
}
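For context, a minimal caller-side sketch of how the runtime arguments consumed by the method above might be populated. This is hedged: it assumes setter counterparts of the TimePartitionedFileSetArguments getters used above, and the timestamp and path format are arbitrary example values.

Map<String, String> runtimeArgs = new HashMap<>();
// Assumed example values: any epoch-millis timestamp and SimpleDateFormat pattern could be used here.
TimePartitionedFileSetArguments.setOutputPartitionTime(runtimeArgs, 1483228800000L);
TimePartitionedFileSetArguments.setOutputPathFormat(runtimeArgs, "yyyy-MM-dd/HH-mm", "UTC");
// updateArgumentsIfNeeded(runtimeArgs) would then derive the output path and partition key from these values.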
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments,
                                                             Partitioning partitioning) {
  if (FileSetArguments.getOutputPath(arguments) == null) {
    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    // we need to copy the map, to avoid modifying the passed-in map
    arguments = Maps.newHashMap(arguments);
    if (key != null) {
      FileSetArguments.setOutputPath(arguments, PartitionedFileSetDataset.getOutputPath(key, partitioning));
    } else if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
      // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location
      FileSetArguments.setBaseOutputPath(arguments);
    }
  }
  return arguments;
}
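For the generic partitioned variant, the caller would typically supply an explicit partition key instead. A small sketch, assuming a hypothetical partitioning with a single string field named "league":

Map<String, String> runtimeArgs = new HashMap<>();
PartitionKey outputKey = PartitionKey.builder()
  .addStringField("league", "nfl")   // hypothetical field; it must match the dataset's Partitioning
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(runtimeArgs, outputKey);
// updateArgumentsIfNeeded(runtimeArgs, partitioning) would then translate the key into an output path.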
@Override
public void run(DatasetContext context) throws Exception {
  Map<String, String> args = sec.getRuntimeArguments();
  String outputPath = args.get("output.path");
  Map<String, String> fileSetArgs = new HashMap<>();
  FileSetArguments.setOutputPath(fileSetArgs, outputPath);
  FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
    for (String line : converted) {
      writer.write(line);
      writer.println();
    }
  }
}
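The "output.path" argument read above is expected to arrive as a runtime argument when the program is started. A hypothetical test-driver sketch; the workflow name is assumed, not taken from the source:

ApplicationManager applicationManager = deployApplication(WorkflowAppWithLocalDatasets.class);
// "WorkflowWithLocalDatasets" is an assumed workflow name for illustration only.
WorkflowManager workflowManager = applicationManager.getWorkflowManager("WorkflowWithLocalDatasets");
workflowManager.start(Collections.singletonMap("output.path", "csv/output"));
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);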
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  // test using a stream with the same name, but aliasing it differently (so mapper gets the alias'd name)
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);
  Map<String, String> output1Args = new HashMap<>();
  FileSetArguments.setOutputPath(output1Args, "small_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output1Args).alias("small_purchases"));
  Map<String, String> output2Args = new HashMap<>();
  FileSetArguments.setOutputPath(output2Args, "large_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output2Args).alias("large_purchases"));
  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setNumReduceTasks(0);
}
FileSetArguments.setOutputPath(args, outputPath);
context.addOutput(Output.ofDataset(outputName, args));
private FileSet createFileset(DatasetId dsid) throws IOException, DatasetManagementException {
  dsFrameworkUtil.createInstance("fileSet", dsid, FileSetProperties.builder()
    .setBasePath("testDir").build());
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some?File1");
  FileSetArguments.setOutputPath(fileArgs, "some?File1");
  return dsFrameworkUtil.getInstance(dsid, fileArgs);
}
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException {
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3, FileSetProperties.builder()
    .setBasePath(absolutePath).build());
  // validate that the base path for the file set was created
  Assert.assertTrue(new File(absolutePath).isDirectory());
  // instantiate the file set with an output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, fileArgs);
  Assert.assertEquals(absolutePath + "/out", fileSet.getOutputLocation().toURI().getPath());
  // write to the output path
  try (OutputStream out = fileSet.getOutputLocation().getOutputStream()) {
    out.write(42);
  }
  // validate that the file was created
  Assert.assertTrue(new File(absolutePath + "/out").isFile());
}
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);
  // a second input, aliased so that the mapper receives the aliased name
  context.addInput(Input.ofDataset(PURCHASES2, inputArgs).alias("secondPurchases"), FileMapper2.class);
  // since we set a Mapper class on the job itself, omitting the mapper in the addInput call will default to that
  context.addInput(Input.ofDataset(CUSTOMERS, inputArgs));
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  context.addOutput(Output.ofDataset(OUTPUT_DATASET, outputArgs));
  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setReducerClass(FileReducer.class);
}
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  // set the output path on the output args (not the input args), so each map carries its own setting
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");
  // cleanup paths after running the test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
FileSetArguments.setOutputPath(fileArgs, "out");
FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
Assert.assertNotNull(fileSet);
FileSetArguments.setInputPath(inputArgs, "input");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, inputPaths);
FileSetArguments.setOutputPath(outputArgs, outputPath);
if (outputSeparator != null) {
  // pass the separator through as an output format property (note: Hadoop's TextOutputFormat spells the constant "SEPERATOR")
  outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, outputSeparator);
}
FileSetArguments.setInputPath(inputArgs, "nn");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "xx");
Map<String, String> args = new HashMap<>();
args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", inputArgs));
@Test
public void testRollbackOnJobFailure() throws Exception {
  // tests the logic of the #onFailure method
  Map<String, String> args = new HashMap<>();
  FileSetArguments.setOutputPath(args, "custom/output/path");
  PartitionedFileSetArguments.setOutputPartitionKey(args, PARTITION_KEY);
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance, args);
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  Location outputLocation = pfs.getEmbeddedFileSet().getOutputLocation();
  Assert.assertFalse(outputLocation.exists());
  outputLocation.mkdirs();
  Assert.assertTrue(outputLocation.exists());
  ((PartitionedFileSetDataset) pfs).onFailure();
  txContext.abort();
  // because the previous transaction aborted, neither the partition nor its directory will exist
  txContext.start();
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertFalse(outputLocation.exists());
  txContext.finish();
}