/**
 * Adds an input path to the runtime arguments for a file dataset.
 */
public static void addInputPath(Map<String, String> arguments, String path) {
  String existing = arguments.get(INPUT_PATHS);
  if (existing == null) {
    setInputPath(arguments, path);
  } else {
    setInputPath(arguments, existing + "," + path);
  }
}
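For illustration, a minimal usage sketch of this helper: repeated calls to addInputPath accumulate a comma-separated list under the input path argument, while setOutputPath sets the output path. The import path and the relative file names used here are assumptions (the package is io.cdap.cdap.api.dataset.lib in CDAP 6.x and co.cask.cdap.api.dataset.lib in earlier releases).

import java.util.HashMap;
import java.util.Map;

import io.cdap.cdap.api.dataset.lib.FileSetArguments;

public class FileSetArgumentsSketch {
  public static void main(String[] args) {
    Map<String, String> fileArgs = new HashMap<>();

    // hypothetical relative paths inside the FileSet's base path
    FileSetArguments.addInputPath(fileArgs, "2020-01-01/part-0");
    FileSetArguments.addInputPath(fileArgs, "2020-01-02/part-0");
    FileSetArguments.setOutputPath(fileArgs, "out");

    // the input path argument now holds "2020-01-01/part-0,2020-01-02/part-0"
    System.out.println(fileArgs);
  }
}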
private FileSet createFileset(DatasetId dsid) throws IOException, DatasetManagementException {
  dsFrameworkUtil.createInstance("fileSet", dsid, FileSetProperties.builder()
    .setBasePath("testDir").build());
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some?File1");
  FileSetArguments.setOutputPath(fileArgs, "some?File1");
  return dsFrameworkUtil.getInstance(dsid, fileArgs);
}
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);

  // write to the same output dataset twice, aliased differently, so each output gets its own path arguments
  Map<String, String> output1Args = new HashMap<>();
  FileSetArguments.setOutputPath(output1Args, "small_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output1Args).alias("small_purchases"));

  Map<String, String> output2Args = new HashMap<>();
  FileSetArguments.setOutputPath(output2Args, "large_purchases");
  context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output2Args).alias("large_purchases"));

  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setNumReduceTasks(0);
}
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "inputFile");
  context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);

  // a second input, aliased so the mapper sees the aliased name
  context.addInput(Input.ofDataset(PURCHASES2, inputArgs).alias("secondPurchases"), FileMapper2.class);

  // since a Mapper class is set on the job itself, omitting the mapper in the addInput call defaults to that class
  context.addInput(Input.ofDataset(CUSTOMERS, inputArgs));

  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  context.addOutput(Output.ofDataset(OUTPUT_DATASET, outputArgs));

  Job job = context.getHadoopJob();
  job.setMapperClass(FileMapper.class);
  job.setReducerClass(FileReducer.class);
}
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");

  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");

  // clean up the input and output locations after running the test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
FileSetArguments.setInputPath(fileArgs, "some.file"); FileSetArguments.setOutputPath(fileArgs, "out"); FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager applicationManager = deploy(appClass);

  DataSetManager<FileSet> filesetManager = getDataset("logs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareInputFileSetWithLogData(location);

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
  args.put("input", "logs");
  args.put("output", "logStats");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
  sparkManager.startAndWaitForRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);

  DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
  KeyValueTable logStatsTable = logStatsManager.get();
  validateGetDatasetOutput(logStatsTable);

  // clean up after the run
  location.delete(true);
  logStatsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      logStatsTable.delete(scan.next().getKey());
    }
  }
  logStatsManager.flush();
}
FileSetArguments.setInputPath(inputArgs, "input"); Map<String, String> outputArgs = new HashMap<>(); FileSetArguments.setOutputPath(outputArgs, "output");
args.put(ScalaCrossNSProgram.OUTPUT_NAME(), "count");
FileSetArguments.setInputPath(args, "inputFile");
FileSetArguments.setInputPath(inputArgs, "nn"); Map<String, String> outputArgs = new HashMap<>(); FileSetArguments.setOutputPath(inputArgs, "xx");
FileSetArguments.setInputPath(sparkArgs, "input");