@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>( new TextInputFormat(), new JobConf(), "Input Lines"); TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput)); MapOperator mapper = MapOperator.builder(new TokenizeLine()) .input(source) .name("Tokenize Lines") .build(); ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0) .input(mapper) .name("Count Words") .build(); HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(),new JobConf(), "Hadoop TextOutputFormat", reducer, Text.class, IntWritable.class); TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output)); Plan plan = new Plan(out, "Hadoop OutputFormat Example"); plan.setDefaultParallelism(numSubTasks); return plan; }
/**
 * The private constructor that only gets invoked from the Builder.
 * Transfers the builder's configuration (UDF, key columns, inputs,
 * secondary order, broadcast variables, semantic annotations) into
 * the new operator instance.
 *
 * @param builder The builder carrying the operator's configuration.
 */
protected ReduceOperator(Builder builder) {
	super(builder.udf, OperatorInfoHelper.unary(), builder.getKeyColumnsArray(), builder.name);
	// Keep the key classes alongside the key column positions set via super(...).
	this.keyTypes = builder.getKeyClassesArray();
	// Multiple inputs are merged into a single input via a union cascade.
	if (builder.inputs != null && !builder.inputs.isEmpty()) {
		setInput(Operator.createUnionCascade(builder.inputs));
	}
	setGroupOrder(builder.secondaryOrder);
	setBroadcastVariables(builder.broadcastInputs);
	// Constant-field semantics are read from annotations on the UDF.
	setSemanticProperties(FunctionAnnotation.readSingleConstantAnnotations(builder.udf));
}
/**
 * Creates and returns a ReduceOperator configured with the values
 * previously given to this builder.
 *
 * @return The created operator.
 */
public ReduceOperator build() {
	if (this.name == null) {
		// No explicit name was set: default to the user function's class name.
		this.name = this.udf.getUserCodeClass().getName();
	}
	return new ReduceOperator(this);
}
}
/**
 * Builds, optimizes and compiles the reduce test plan into a job graph.
 * Parallelism and local/ship strategy hints are taken from the test config.
 *
 * @return the compiled job graph for this test case.
 * @throws Exception if plan compilation fails.
 */
@Override
protected JobGraph getJobGraph() throws Exception {
	FileDataSource input = new FileDataSource(new ContractITCaseInputFormat(), inPath);
	DelimitedInputFormat.configureDelimitedFormat(input)
			.recordDelimiter('\n');
	input.setDegreeOfParallelism(config.getInteger("ReduceTest#NoSubtasks", 1));

	ReduceOperator testReducer = ReduceOperator.builder(new TestReducer(), StringValue.class, 0)
			.build();
	testReducer.setDegreeOfParallelism(config.getInteger("ReduceTest#NoSubtasks", 1));
	// Inject the strategies under test as compiler hints.
	testReducer.getParameters().setString(PactCompiler.HINT_LOCAL_STRATEGY,
			config.getString("ReduceTest#LocalStrategy", ""));
	testReducer.getParameters().setString(PactCompiler.HINT_SHIP_STRATEGY,
			config.getString("ReduceTest#ShipStrategy", ""));

	// Single writer so the result file is deterministic to compare.
	FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	output.setDegreeOfParallelism(1);
	output.setInput(testReducer);
	testReducer.setInput(input);

	Plan plan = new Plan(output);
	PactCompiler compiler = new PactCompiler(new DataStatistics());
	OptimizedPlan optimizedPlan = compiler.compile(plan);
	NepheleJobGraphGenerator generator = new NepheleJobGraphGenerator();
	return generator.compileJobGraph(optimizedPlan);
}
// NOTE(review): excerpt is mid-method — the builder result below is not assigned
// here; presumably `aggregation` is declared from this chain in lines preceding
// this excerpt. Verify against the full source.
ReduceOperator.builder(CountAgg.class, StringValue.class, 0)
	.name("AggregateGroupBy")
	.build();
aggregation.setDegreeOfParallelism(this.degreeOfParallelism);
// Wire the dataflow: the two filters feed the join, the join feeds the
// aggregation, and the aggregation feeds the result sink.
join.setFirstInput(ordersFilter);
join.setSecondInput(lineFilter);
aggregation.setInput(join);
result.setInput(aggregation);
@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); @SuppressWarnings("unchecked") CsvInputFormat format = new CsvInputFormat(' ', IntValue.class, IntValue.class); FileDataSource input = new FileDataSource(format, dataInput, "Input"); // create the reduce contract and sets the key to the first field ReduceOperator sorter = ReduceOperator.builder(new IdentityReducer(), IntValue.class, 0) .input(input) .name("Reducer") .build(); // sets the group sorting to the second field sorter.setGroupOrder(new Ordering(1, IntValue.class, Order.ASCENDING)); // create and configure the output format FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, sorter, "Sorted Output"); CsvOutputFormat.configureRecordFormat(out) .recordDelimiter('\n') .fieldDelimiter(' ') .field(IntValue.class, 0) .field(IntValue.class, 1); Plan plan = new Plan(out, "SecondarySort Example"); plan.setDefaultParallelism(numSubTasks); return plan; }
FileDataSource input1 = new FileDataSource(format1, input1Path, "Input 1");
// NOTE(review): the aggInput1 chain below ends at .name("AggOrders") with no
// .build() or terminating semicolon — lines appear to have been lost from this
// excerpt (compare the complete aggInput2 chain). Confirm against the full source.
ReduceOperator aggInput1 = ReduceOperator.builder(DummyReduce.class, IntValue.class, 0)
	.input(input1)
	.name("AggOrders")
input2.setDegreeOfParallelism(numSubtasksInput2);
// Second aggregation, grouped on IntValue field 0 of the second input.
ReduceOperator aggInput2 = ReduceOperator.builder(DummyReduce.class, IntValue.class, 0)
	.input(input2)
	.name("AggLines")
	.build();
aggInput2.setDegreeOfParallelism(numSubtasksInput2);
/**
 * The operator is combinable when the superclass already reports it as such,
 * or when the user function class carries the {@code @Combinable} annotation.
 *
 * @return {@code true} if a combiner may be applied, {@code false} otherwise.
 */
@Override
public boolean isCombinable() {
	// Preserve short-circuit order: check the superclass flag first.
	if (super.isCombinable()) {
		return true;
	}
	return getUserCodeWrapper().getUserCodeAnnotation(Combinable.class) != null;
}
// Aggregation grouped on the StringValue key at position 1.
ReduceOperator aggCO = ReduceOperator.builder(new AggCO(), StringValue.class, 1)
	.name("AggCo")
	.build();
aggCO.setDegreeOfParallelism(numSubtasks);
// Wire the dataflow: orders and customers feed the join, which feeds the aggregation.
aggCO.setInput(joinCO);
joinCO.setFirstInput(orders);
joinCO.setSecondInput(customers);
/**
 * Builds the group-order test plan: a reducer whose groups must arrive
 * sorted ascending on field 1, verified by {@code CheckingReducer}.
 *
 * @return the test plan, rooted at the CSV sink.
 */
@Override
protected Plan getTestJob() {
	int parallelism = this.config.getInteger("GroupOrderTest#NumSubtasks", 1);

	@SuppressWarnings("unchecked")
	CsvInputFormat sourceFormat = new CsvInputFormat(',', IntValue.class, IntValue.class);
	FileDataSource source = new FileDataSource(sourceFormat, this.textPath, "Source");

	ReduceOperator reducer = ReduceOperator.builder(CheckingReducer.class)
			.keyField(IntValue.class, 0)
			.input(source)
			.name("Ordered Reducer")
			.build();
	// Request ascending order on field 1 within each group.
	reducer.setGroupOrder(new Ordering(1, IntValue.class, Order.ASCENDING));

	FileDataSink sink = new FileDataSink(CsvOutputFormat.class, this.resultPath, reducer, "Sink");
	CsvOutputFormat.configureRecordFormat(sink)
			.recordDelimiter('\n')
			.fieldDelimiter(',')
			.field(IntValue.class, 0)
			.field(IntValue.class, 1);

	Plan plan = new Plan(sink);
	plan.setDefaultParallelism(parallelism);
	return plan;
}
	.name("Tokenize Lines")
	.build();
// Reduce grouped on the StringValue key at position 0.
ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
	.input(mapper)
	.name("Count Words")
/**
 * Builds and locally executes a small example plan: a generic source feeding
 * a map, a reduce grouped on IntValue field 1, and a printing sink.
 *
 * @param args unused.
 * @throws Exception if local execution fails.
 */
public static void main(String[] args) throws Exception {
	// Source backed by UserGeneratingInputFormat.
	GenericDataSource<UserGeneratingInputFormat> source =
			new GenericDataSource<UserGeneratingInputFormat>(UserGeneratingInputFormat.class);

	MapOperator extractor = MapOperator.builder(new NumberExtractingMapper())
			.input(source)
			.name("le mapper")
			.build();

	// Reduce grouped on the IntValue key at position 1.
	ReduceOperator concatenator = ReduceOperator.builder(new ConcatenatingReducer(), IntValue.class, 1)
			.input(extractor)
			.name("le reducer")
			.build();

	GenericDataSink sink = new GenericDataSink(PrintingOutputFormat.class, concatenator);

	Plan plan = new Plan(sink);
	plan.setDefaultParallelism(4);
	LocalExecutor.execute(plan);
}
	.input(edges).name("Project Edge").build();
// Reduce grouped on the IntValue key at position 0 (named "Count Edges for Vertex").
ReduceOperator edgeCounter = ReduceOperator.builder(new CountEdges(), IntValue.class, 0)
	.input(projectEdge).name("Count Edges for Vertex").build();
// Composite grouping key over fields 0 and 1; chain continues past this excerpt.
ReduceOperator countJoiner = ReduceOperator.builder(new JoinCountsAndUniquify())
	.keyField(IntValue.class, 0)
	.keyField(IntValue.class, 1)
@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); FileDataSource source = new FileDataSource(new TextInputFormat(), dataInput, "Input Lines"); MapOperator mapper = MapOperator.builder(new TokenizeLine()) .input(source) .name("Tokenize Lines") .build(); ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0) .input(mapper) .name("Count Words") .build(); @SuppressWarnings("unchecked") FileDataSink out = new FileDataSink(new CsvOutputFormat("\n", " ", StringValue.class, IntValue.class), output, reducer, "Word Counts"); Plan plan = new Plan(out, "WordCount Example"); plan.setDefaultParallelism(numSubTasks); return plan; }
// NOTE(review): none of the three builder chains below ends in .build(); lines
// appear to have been lost from this excerpt — confirm against the full source.
// Each reducer is grouped on the IntValue key at position 0.
ReduceOperator findNearestClusterCenters =
	ReduceOperator.builder(new FindNearestCenter(), IntValue.class, 0)
		.input(computeDistance)
		.name("Find Nearest Centers")
ReduceOperator recomputeClusterCenter =
	ReduceOperator.builder(new RecomputeClusterCenter(), IntValue.class, 0)
		.input(findNearestClusterCenters)
		.name("Recompute Center Positions")
ReduceOperator findNearestFinalCluster =
	ReduceOperator.builder(new FindNearestCenter(), IntValue.class, 0)
		.input(computeFinalDistance)
		.name("Find Nearest Final Centers")
/**
 * Assembles the WordCount example plan with a CSV sink configured through
 * {@code configureRecordFormat}.
 * Expected arguments: [parallelism] [input path] [output path].
 *
 * @param args optional job parameters; missing entries fall back to defaults.
 * @return the assembled plan, rooted at the CSV sink.
 */
@Override
public Plan getPlan(String... args) {
	// Parse job parameters, defaulting to parallelism 1 and empty paths.
	final int parallelism = args.length > 0 ? Integer.parseInt(args[0]) : 1;
	final String inputPath = args.length > 1 ? args[1] : "";
	final String outputPath = args.length > 2 ? args[2] : "";

	FileDataSource lines = new FileDataSource(new TextInputFormat(), inputPath, "Input Lines");

	MapOperator tokenizer = MapOperator.builder(new TokenizeLine())
			.input(lines)
			.name("Tokenize Lines")
			.build();

	// Group on the StringValue key at position 0.
	ReduceOperator counter = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
			.input(tokenizer)
			.name("Count Words")
			.build();

	// Newline-delimited, space-separated (word, count) records.
	FileDataSink sink = new FileDataSink(new CsvOutputFormat(), outputPath, counter, "Word Counts");
	CsvOutputFormat.configureRecordFormat(sink)
			.recordDelimiter('\n')
			.fieldDelimiter(' ')
			.field(StringValue.class, 0)
			.field(IntValue.class, 1);

	Plan plan = new Plan(sink, "WordCount Example");
	plan.setDefaultParallelism(parallelism);
	return plan;
}