/**
 * Adds an input path to the runtime arguments for a file dataset. If an input
 * path is already set, the new path is appended, separated by a comma.
 */
public static void addInputPath(Map<String, String> arguments, String path) {
  String existing = arguments.get(INPUT_PATHS);
  if (existing == null) {
    setInputPath(arguments, path);
  } else {
    setInputPath(arguments, existing + "," + path);
  }
}
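A minimal usage sketch (the helper method and paths are hypothetical): because addInputPath appends to any existing value with a comma separator, repeated calls accumulate into a single comma-separated argument.

private Map<String, String> buildInputArguments() {
  Map<String, String> args = new HashMap<>();
  // the first call behaves like setInputPath
  FileSetArguments.addInputPath(args, "2017-01-01/part-0");
  // the second call appends, yielding "2017-01-01/part-0,2017-01-01/part-1"
  FileSetArguments.addInputPath(args, "2017-01-01/part-1");
  return args;
}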
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments,
                                                             Partitioning partitioning) {
  if (FileSetArguments.getOutputPath(arguments) == null) {
    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    // we need to copy the map, to avoid modifying the passed-in map
    arguments = Maps.newHashMap(arguments);
    if (key != null) {
      FileSetArguments.setOutputPath(arguments, PartitionedFileSetDataset.getOutputPath(key, partitioning));
    } else if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
      // when using DynamicPartitioner, use the base location of the file set as the output location
      FileSetArguments.setBaseOutputPath(arguments);
    }
  }
  return arguments;
}
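For illustration, a hedged sketch of the partition-key branch (the partitioning, key, and resulting path are hypothetical): a caller that sets only an output partition key receives a copy of the arguments with the output path derived from that key.

Partitioning partitioning = Partitioning.builder()
  .addStringField("league")
  .addIntField("year")
  .build();
Map<String, String> args = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(
  args, PartitionKey.builder().addStringField("league", "nfl").addIntField("year", 2014).build());
// the returned copy now carries an output path derived from the key, e.g. "nfl/2014"
Map<String, String> updated = updateArgumentsIfNeeded(args, partitioning);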
/**
 * Sets multiple input paths in the runtime arguments for a file dataset,
 * replacing any previously set input paths.
 */
public static void setInputPaths(Map<String, String> arguments, Collection<String> paths) {
  arguments.remove(INPUT_PATHS);
  for (String path : paths) {
    addInputPath(arguments, path);
  }
}
private Location determineOutputLocation() {
  if (FileSetArguments.isBaseOutputPath(runtimeArguments)) {
    return baseLocation;
  }
  String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
  return outputPath == null ? null : createLocation(outputPath);
}
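As a hedged illustration of the three possible outcomes (argument values are hypothetical):

Map<String, String> explicit = new HashMap<>();
FileSetArguments.setOutputPath(explicit, "out/run1");  // resolves to a location under the base location

Map<String, String> base = new HashMap<>();
FileSetArguments.setBaseOutputPath(base);              // resolves to the base location itself

Map<String, String> none = new HashMap<>();            // no output argument: resolves to null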
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) {
  Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments);
  if (time != null) {
    // set the output path according to the partition time
    if (FileSetArguments.getOutputPath(arguments) == null) {
      String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments);
      String path;
      if (Strings.isNullOrEmpty(outputPathFormat)) {
        path = String.format("%tF/%tH-%tM.%d", time, time, time, time);
      } else {
        SimpleDateFormat format = new SimpleDateFormat(outputPathFormat);
        String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments);
        if (!Strings.isNullOrEmpty(timeZoneID)) {
          format.setTimeZone(TimeZone.getTimeZone(timeZoneID));
        }
        path = format.format(new Date(time));
      }
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(arguments, path);
    }
    // add the corresponding partition key to the arguments
    PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time);
    PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey);
  }
  // delegate to the superclass for anything it needs to do
  return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING);
}
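A hedged caller-side sketch (the timestamp is arbitrary, and setOutputPartitionTime is assumed to be the setter counterpart of the getter used above): setting only the partition time lets this method derive both the output path and the partition key.

Map<String, String> args = new HashMap<>();
long time = 1422914400000L;  // partition time in milliseconds since the epoch
TimePartitionedFileSetArguments.setOutputPartitionTime(args, time);
// with no explicit output path or path format, the default layout "%tF/%tH-%tM.%d"
// yields a path like "2015-02-02/22-00.1422914400000" (the date part depends on the local time zone)
Map<String, String> updated = updateArgumentsIfNeeded(args);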
private FileSet createFileset(DatasetId dsid) throws IOException, DatasetManagementException {
  dsFrameworkUtil.createInstance("fileSet", dsid, FileSetProperties.builder()
    .setBasePath("testDir").build());
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some?File1");
  FileSetArguments.setOutputPath(fileArgs, "some?File1");
  return dsFrameworkUtil.getInstance(dsid, fileArgs);
}
FileSetArguments.setInputPaths(args, inputPaths);
context.addInput(Input.ofDataset(inputName, args));
FileSetArguments.setOutputPath(args, outputPath);
context.addOutput(Output.ofDataset(outputName, args));
@Override
public void run(DatasetContext context) throws Exception {
  Map<String, String> args = sec.getRuntimeArguments();
  String outputPath = args.get("output.path");
  Map<String, String> fileSetArgs = new HashMap<>();
  FileSetArguments.setOutputPath(fileSetArgs, outputPath);
  FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
    for (String line : converted) {
      writer.write(line);
      writer.println();
    }
  }
}
private List<Location> determineInputLocations() {
  List<Location> locations = Lists.newLinkedList();
  for (String path : FileSetArguments.getInputPaths(runtimeArguments)) {
    locations.add(createLocation(path));
  }
  return locations;
}
@Override
public void onSuccess() throws DataSetException {
  String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
  // If there is no output path, the job either used DynamicPartitioner or it would have failed.
  // Either way, there is nothing to do here.
  if (outputPath == null) {
    return;
  }
  // it's possible that there is no output key when using DynamicPartitioner, in which case
  // DynamicPartitioningOutputFormat is responsible for registering the partitions and their metadata
  PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
  if (outputKey != null) {
    Map<String, String> metadata = PartitionedFileSetArguments.getOutputPartitionMetadata(runtimeArguments);
    addPartition(outputKey, outputPath, metadata, true, false);
  }
  // currently, FileSetDataset#onSuccess is a no-op, but call it in case it does something in the future
  ((FileSetDataset) files).onSuccess();
}
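A hedged caller-side sketch (assuming setOutputPartitionKey and setOutputPartitionMetadata exist as setter counterparts of the getters used above; the key and metadata are hypothetical):

Map<String, String> args = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(
  args, PartitionKey.builder().addStringField("league", "nfl").build());
PartitionedFileSetArguments.setOutputPartitionMetadata(
  args, ImmutableMap.of("source", "ingest"));
// on successful job completion, onSuccess() registers the partition for this key at the
// output path derived from it, together with the given metadata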
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);
  // add the same output dataset twice, with different aliases and output paths,
  // so that the job can write to each of them by its aliased name
  Map<String, String> output1Args = new HashMap<>();
  FileSetArguments.setOutputPath(output1Args, "small_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output1Args).alias("small_purchases"));
  Map<String, String> output2Args = new HashMap<>();
  FileSetArguments.setOutputPath(output2Args, "large_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output2Args).alias("large_purchases"));
  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setNumReduceTasks(0);
}
Map<String, String> inputArgs = Maps.newHashMap();
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, inputPaths);
FileSetArguments.setOutputPath(outputArgs, outputPath);
if (outputSeparator != null) {
  // pass the separator through to TextOutputFormat as an output property
  outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, outputSeparator);
}
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException {
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3, FileSetProperties.builder()
    .setBasePath(absolutePath).build());
  // validate that the base path for the file set was created
  Assert.assertTrue(new File(absolutePath).isDirectory());
  // instantiate the file set with an output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, fileArgs);
  // validate the output location, then write to it
  Assert.assertEquals(absolutePath + "/out", fileSet.getOutputLocation().toURI().getPath());
  try (OutputStream out = fileSet.getOutputLocation().getOutputStream()) {
    out.write(42);
  }
  // validate that the file was created
  Assert.assertTrue(new File(absolutePath + "/out").isFile());
}
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);
  // a second input, aliased so that the mapper sees the aliased name
  context.addInput(Input.ofDataset(PURCHASES2, inputArgs).alias("secondPurchases"), FileMapper2.class);
  // since a Mapper class is set on the job itself, omitting the mapper in the addInput call defaults to that
  context.addInput(Input.ofDataset(CUSTOMERS, inputArgs));
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  context.addOutput(Output.ofDataset(OUTPUT_DATASET, outputArgs));
  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setReducerClass(FileReducer.class);
}
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager applicationManager = deploy(appClass);

  DataSetManager<FileSet> filesetManager = getDataset("logs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareInputFileSetWithLogData(location);

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
  args.put("input", "logs");
  args.put("output", "logStats");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
  sparkManager.startAndWaitForRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);

  DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
  KeyValueTable logStatsTable = logStatsManager.get();
  validateGetDatasetOutput(logStatsTable);

  // clean up after the run
  location.delete(true);
  logStatsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      logStatsTable.delete(scan.next().getKey());
    }
  }
  logStatsManager.flush();
}
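A hedged sketch of what the scoping above accomplishes (the prefixed key is an assumption based on CDAP's <scope>.<name>.<key> convention for scoped runtime arguments):

Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "nn");
// addScope prefixes each key with "dataset.logs." so that only the "logs" dataset sees it
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs);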