@Override public Plan getPlan(String... args) throws IllegalArgumentException { // parse job parameters final int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); final String input = (args.length > 1 ? args[1] : ""); final String output = (args.length > 2 ? args[2] : ""); // This task will read the input data and generate the key/value pairs final FileDataSource source = new FileDataSource(new TeraInputFormat(), input, "Data Source"); source.setDegreeOfParallelism(numSubTasks); // This task writes the sorted data back to disk final FileDataSink sink = new FileDataSink(new TeraOutputFormat(), output, "Data Sink"); sink.setDegreeOfParallelism(numSubTasks); sink.setGlobalOrder(new Ordering(0, TeraKey.class, Order.ASCENDING), new TeraDistribution()); sink.setInput(source); return new Plan(sink, "TeraSort"); } }
/**
 * Creates a configuration builder that can be used to set the target sink's
 * output format parameters on its config in a fluent fashion.
 *
 * @param target The file data sink whose parameters are to be configured.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureRecordFormat(FileDataSink target) { return new ConfigBuilder(target.getParameters()); }
@Override
protected Plan getTestJob() {
	final int parallelism = config.getInteger("MapTest#NoSubtasks", 1);

	// Source: newline-delimited test records.
	FileDataSource source = new FileDataSource(new ContractITCaseInputFormat(), inPath);
	DelimitedInputFormat.configureDelimitedFormat(source).recordDelimiter('\n');
	source.setDegreeOfParallelism(parallelism);

	// Mapper under test, fed directly from the source.
	MapOperator mapper = MapOperator.builder(new TestMapper()).build();
	mapper.setDegreeOfParallelism(parallelism);
	mapper.setInput(source);

	// Single-writer sink so the result lands in one file.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	sink.setDegreeOfParallelism(1);
	sink.setInput(mapper);

	return new Plan(sink);
}
		// (continuation) finish building the reduce operator above
		.name("Count Words")
		.build();

// Sink: writes the word-count records as newline-delimited CSV.
FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
// Format configuration chain continues past this fragment.
CsvOutputFormat.configureRecordFormat(out)
		.recordDelimiter('\n')
@Override protected JobGraph getJobGraph() throws Exception { // init data source FileDataSource input = new FileDataSource(new ContractITCaseInputFormat(), inputPath); // init (working) map task MapOperator testMapper = MapOperator.builder(TestMapper.class).build(); // init data sink FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath); // compose working program output.setInput(testMapper); testMapper.setInput(input); // generate plan Plan plan = new Plan(output); plan.setDefaultParallelism(4); // optimize and compile plan PactCompiler pc = new PactCompiler(new DataStatistics()); OptimizedPlan op = pc.compile(plan); // return job graph of working job NepheleJobGraphGenerator jgg = new NepheleJobGraphGenerator(); return jgg.compileJobGraph(op); }
// Source: reads comma-separated triples of integers.
FileDataSource source = new FileDataSource(new CsvInputFormat(',', IntValue.class, IntValue.class, IntValue.class), recordsPath);

// Sink: emits only field 2, one value per line.
FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output);
CsvOutputFormat.configureRecordFormat(sink)
	.recordDelimiter('\n')
	.field(IntValue.class, 2);

// Request a global order over all three fields (DESC, ASC, DESC),
// range-partitioned by the matching custom distribution.
sink.setGlobalOrder(
	new Ordering(0, IntValue.class, Order.DESCENDING)
		.appendOrdering(1, IntValue.class, Order.ASCENDING)
		.appendOrdering(2, IntValue.class, Order.DESCENDING),
	new TripleIntDistribution(Order.DESCENDING, Order.ASCENDING, Order.DESCENDING));
sink.setInput(source);
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the given name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation class used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 * @param name The given name for the sink, used in plans, logs and progress messages.
 */
public FileDataSink(Class<? extends FileOutputFormat<Record>> f, String filePath, Operator<Record> input, String name) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath, name); setInput(input);
}
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected JobGraph getJobGraph() throws Exception {
	// Each input path may independently be the real data or the empty file,
	// depending on the configuration flags.
	final String firstPath = config.getBoolean("input1PathHasData", false) ? textInput : emptyInput;
	final String secondPath = config.getBoolean("input2PathHasData", false) ? textInput : emptyInput;

	FileDataSource firstSource = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
	FileDataSource secondSource = new FileDataSource(new ContractITCaseInputFormat(), secondPath);

	MapOperator firstMapper = MapOperator.builder(new TestMapper()).build();
	firstMapper.setInput(firstSource);
	MapOperator secondMapper = MapOperator.builder(new TestMapper()).build();
	secondMapper.setInput(secondSource);

	// The sink unions the outputs of both mappers.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultDir);
	sink.addInput(firstMapper);
	sink.addInput(secondMapper);

	Plan plan = new Plan(sink);
	plan.setDefaultParallelism(4);

	// Optimize the plan and translate it into a Nephele job graph.
	OptimizedPlan optimized = new PactCompiler(new DataStatistics()).compile(plan);
	return new NepheleJobGraphGenerator().compileJobGraph(optimized);
}
.name("Count Words") .build(); FileDataSink out = new FileDataSink(new CsvOutputFormat(), OUT_FILE, reduceNode, "Word Counts"); CsvOutputFormat.configureRecordFormat(out) .recordDelimiter('\n') out.setGlobalOrder(ordering, new SimpleDistribution(new StringValue[] {new StringValue("N")}));
// Sink: writes the records produced by the anti-join operator.
FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, antiJoinVisits, "Result");
result.setDegreeOfParallelism(numSubTasks);
// Format configuration chain continues past this fragment.
CsvOutputFormat.configureRecordFormat(result)
	.recordDelimiter('\n')
// (continuation) right-hand side of the 'result' sink assignment begun above
new FileDataSink(new StringTupleDataOutFormat(), this.outputPath, "Output");
result.setDegreeOfParallelism(this.degreeOfParallelism);
// The sink consumes the output of the group-by-return-flag operator.
result.setInput(groupByReturnFlag);
public Plan getPlan(int numSubTasks, String output) { List<Object> tmp = new ArrayList<Object>(); int pos = 0; for (String s : WordCountData.COUNTS.split("\n")) { List<Object> tmpInner = new ArrayList<Object>(); tmpInner.add(pos++); tmpInner.add(Integer.parseInt(s.split(" ")[1])); tmp.add(tmpInner); } // test serializable iterator input, the input record is {id, word} CollectionDataSource source = new CollectionDataSource(new SerializableIteratorTest(), "test_iterator"); // test collection input, the input record is {id, count} CollectionDataSource source2 = new CollectionDataSource(tmp, "test_collection"); JoinOperator join = JoinOperator.builder(Join.class, IntValue.class, 0, 0) .input1(source).input2(source2).build(); FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, join, "Collection Join"); CsvOutputFormat.configureRecordFormat(out) .recordDelimiter('\n') .fieldDelimiter(' ') .field(StringValue.class, 0) .field(IntValue.class, 1); Plan plan = new Plan(out, "CollectionDataSource"); plan.setDefaultParallelism(numSubTasks); return plan; }
@Override protected JobGraph getFailingJobGraph() throws Exception { // init data source FileDataSource input = new FileDataSource(new ContractITCaseInputFormat(), inputPath); // init failing map task MapOperator testMapper = MapOperator.builder(FailingMapper.class).build(); // init data sink FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath); // compose failing program output.setInput(testMapper); testMapper.setInput(input); // generate plan Plan plan = new Plan(output); plan.setDefaultParallelism(4); // optimize and compile plan PactCompiler pc = new PactCompiler(new DataStatistics()); OptimizedPlan op = pc.compile(plan); // return job graph of failing job NepheleJobGraphGenerator jgg = new NepheleJobGraphGenerator(); return jgg.compileJobGraph(op); }
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the default name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 */
public FileDataSink(FileOutputFormat<Record> f, String filePath, Operator<Record> input) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath); setInput(input);
}
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected Plan getTestJob() {
	final int parallelism = config.getInteger("UnionTest#NoSubtasks", 1);

	// Either input may be swapped for the empty file via configuration.
	String firstPath = config.getString("UnionTest#Input1Path", "").equals("empty") ? emptyInPath : inPath;
	String secondPath = config.getString("UnionTest#Input2Path", "").equals("empty") ? emptyInPath : inPath;

	FileDataSource first = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
	DelimitedInputFormat.configureDelimitedFormat(first).recordDelimiter('\n');
	first.setDegreeOfParallelism(parallelism);

	FileDataSource second = new FileDataSource(new ContractITCaseInputFormat(), secondPath);
	DelimitedInputFormat.configureDelimitedFormat(second).recordDelimiter('\n');
	second.setDegreeOfParallelism(parallelism);

	// The mapper consumes the union of both sources.
	MapOperator mapper = MapOperator.builder(new TestMapper()).build();
	mapper.setDegreeOfParallelism(parallelism);
	mapper.addInput(first);
	mapper.addInput(second);

	// Single-writer sink so the result lands in one file.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	sink.setDegreeOfParallelism(1);
	sink.setInput(mapper);

	return new Plan(sink);
}
@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); FileDataSource source = new FileDataSource(new TextInputFormat(), dataInput, "Input Lines"); MapOperator mapper = MapOperator.builder(new TokenizeLine()) .input(source) .name("Tokenize Lines") .build(); ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0) .input(mapper) .name("Count Words") .build(); @SuppressWarnings("unchecked") FileDataSink out = new FileDataSink(new CsvOutputFormat("\n", " ", StringValue.class, IntValue.class), output, reducer, "Word Counts"); Plan plan = new Plan(out, "WordCount Example"); plan.setDefaultParallelism(numSubTasks); return plan; }
// Strategy hint: build the hash table on the operator's second input.
closeTriads.setParameter("LOCAL_STRATEGY", "LOCAL_STRATEGY_HASH_BUILD_SECOND");

// Sink: writes field 2 of each closed-triad record, one per line.
FileDataSink triangles = new FileDataSink(new CsvOutputFormat(), output, "Output");
CsvOutputFormat.configureRecordFormat(triangles)
	.recordDelimiter('\n')
	.field(StringValue.class, 2);

// Wiring: candidate triads (first input) are closed against the edge
// set (second input); the sink consumes the closed triads.
triangles.setInput(closeTriads);
closeTriads.setSecondInput(edges);
closeTriads.setFirstInput(buildTriads);
@Override public Plan getPlan(String... args) throws IllegalArgumentException { // parse program parameters int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String recordsPath = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); FileDataSource source = new FileDataSource(CsvInputFormat.class, recordsPath); source.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(source) .recordDelimiter('\n') .fieldDelimiter('|') .field(IntValue.class, 0); FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output); sink.setDegreeOfParallelism(numSubtasks); CsvOutputFormat.configureRecordFormat(sink) .recordDelimiter('\n') .fieldDelimiter('|') .lenient(true) .field(IntValue.class, 0); sink.setGlobalOrder(new Ordering(0, IntValue.class, Order.ASCENDING), new UniformIntegerDistribution(Integer.MIN_VALUE, Integer.MAX_VALUE)); sink.setInput(source); return new Plan(sink); }
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the given name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 * @param name The given name for the sink, used in plans, logs and progress messages.
 */
public FileDataSink(FileOutputFormat<Record> f, String filePath, Operator<Record> input, String name) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath, name); setInput(input);
}
/**
 * Creates a configuration builder that can be used to set the target sink's
 * delimited format parameters on its config in a fluent fashion.
 *
 * @param target The file data sink whose parameters are to be configured.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureDelimitedFormat(FileDataSink target) { return new ConfigBuilder(target.getParameters()); }