@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.combiner.configure(jobConf); this.reporter = new HadoopDummyReporter(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<>(keySerializer); this.combineCollector = new HadoopOutputCollector<>(); this.reduceCollector = new HadoopOutputCollector<>(); }
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.reporter = new HadoopDummyReporter(); this.reduceCollector = new HadoopOutputCollector<KEYOUT, VALUEOUT>(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<KEYIN, VALUEIN>(keySerializer); }
@Test
public void testConfigurableMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    JobConf conf = new JobConf();
    conf.set("my.filterPrefix", "Hello");

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> hellos = ds.
            flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new ConfigurableMapper(), conf));

    String resultPath = tempFolder.newFile().toURI().toString();

    hellos.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();

    String expected = "(2,Hello)\n" +
            "(3,Hello world)\n" +
            "(4,Hello world, how are you?)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
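// For context: a minimal sketch of the ConfigurableMapper the test above assumes.
// This is an illustrative reconstruction, not necessarily the exact test fixture:
// it picks up the "my.filterPrefix" key from the JobConf in configure() and
// forwards only the values that start with that prefix.
public static class ConfigurableMapper implements Mapper<IntWritable, Text, IntWritable, Text> {
    private String filterPrefix;

    @Override
    public void map(IntWritable k, Text v, OutputCollector<IntWritable, Text> out, Reporter rep)
            throws IOException {
        // emit only records whose value starts with the configured prefix
        if (v.toString().startsWith(filterPrefix)) {
            out.collect(k, v);
        }
    }

    @Override
    public void configure(JobConf c) {
        // read the prefix that the test sets via conf.set("my.filterPrefix", "Hello")
        filterPrefix = c.get("my.filterPrefix");
    }

    @Override
    public void close() throws IOException {
        // nothing to release
    }
}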
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
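// For context: minimal sketches of the Tokenizer and Counter referenced above. These are
// illustrative reconstructions of what the example assumes: a classic word-count
// mapper/reducer pair implementing Hadoop's mapred API. Counter is usable as both
// combiner and reducer because summation is associative and commutative.
public static final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> out, Reporter rep)
            throws IOException {
        // normalize the line and emit a (word, 1) pair per token
        String[] tokens = value.toString().toLowerCase().split("\\W+");
        for (String token : tokens) {
            if (token.length() > 0) {
                out.collect(new Text(token), new LongWritable(1L));
            }
        }
    }

    @Override
    public void configure(JobConf conf) {}

    @Override
    public void close() throws IOException {}
}

public static final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> out, Reporter rep)
            throws IOException {
        // sum the partial counts for one word
        long sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        out.collect(key, new LongWritable(sum));
    }

    @Override
    public void configure(JobConf conf) {}

    @Override
    public void close() throws IOException {}
}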
@Test
public void testUngroupedHadoopReducer() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, IntWritable>> commentCnts = ds.
            reduceGroup(new HadoopReduceFunction<IntWritable, Text, IntWritable, IntWritable>(new AllCommentCntReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    commentCnts.writeAsText(resultPath);
    env.execute();

    String expected = "(42,15)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
@Test
public void testUngroupedHadoopReducer() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper2());

    DataSet<Tuple2<IntWritable, IntWritable>> sum = ds.
            reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
                    new SumReducer(), new SumReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    sum.writeAsText(resultPath);
    env.execute();

    String expected = "(0,231)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.reporter = new HadoopDummyReporter(); this.reduceCollector = new HadoopOutputCollector<KEYOUT, VALUEOUT>(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<KEYIN, VALUEIN>(keySerializer); }
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.combiner.configure(jobConf); this.reporter = new HadoopDummyReporter(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<>(keySerializer); this.combineCollector = new HadoopOutputCollector<>(); this.reduceCollector = new HadoopOutputCollector<>(); }
@Test
public void testConfigurationViaJobConf() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    JobConf conf = new JobConf();
    conf.set("my.cntPrefix", "Hello");

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper2());

    DataSet<Tuple2<IntWritable, IntWritable>> helloCnts = ds.
            groupBy(0).
            reduceGroup(new HadoopReduceFunction<IntWritable, Text, IntWritable, IntWritable>(
                    new ConfigurableCntReducer(), conf));

    String resultPath = tempFolder.newFile().toURI().toString();

    helloCnts.writeAsText(resultPath);
    env.execute();

    String expected = "(0,0)\n" + "(1,0)\n" + "(2,1)\n" + "(3,1)\n" + "(4,1)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
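// For context: a minimal sketch of the ConfigurableCntReducer assumed by the two
// testConfigurationViaJobConf tests. Illustrative reconstruction: it reads the
// "my.cntPrefix" key from the JobConf in configure() and counts, per key, how
// many values start with that prefix.
public static class ConfigurableCntReducer implements Reducer<IntWritable, Text, IntWritable, IntWritable> {
    private String countPrefix;

    @Override
    public void reduce(IntWritable k, Iterator<Text> vs, OutputCollector<IntWritable, IntWritable> out, Reporter rep)
            throws IOException {
        // count the values in this group that carry the configured prefix
        int cnt = 0;
        while (vs.hasNext()) {
            if (vs.next().toString().startsWith(countPrefix)) {
                cnt++;
            }
        }
        out.collect(k, new IntWritable(cnt));
    }

    @Override
    public void configure(JobConf c) {
        // the test sets conf.set("my.cntPrefix", "Hello")
        countPrefix = c.get("my.cntPrefix");
    }

    @Override
    public void close() throws IOException {}
}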
@Test
public void testNonPassingMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> nonPassingFlatMapDs = ds.
            flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new NonPassingMapper()));

    String resultPath = tempFolder.newFile().toURI().toString();

    nonPassingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();

    compareResultsByLinesInMemory("\n", resultPath);
}
@Test
public void testStandardCountingWithCombiner() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper1());

    DataSet<Tuple2<IntWritable, IntWritable>> counts = ds.
            groupBy(0).
            reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
                    new SumReducer(), new SumReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    counts.writeAsText(resultPath);
    env.execute();

    String expected = "(0,5)\n" + "(1,6)\n" + "(2,6)\n" + "(3,4)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
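// For context: a minimal sketch of the SumReducer used as both combiner and reducer
// above (illustrative reconstruction). The same implementation is safe to run on
// partial groups (combine) and on full groups (reduce) because summation is
// associative and commutative.
public static class SumReducer implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    public void reduce(IntWritable k, Iterator<IntWritable> vs, OutputCollector<IntWritable, IntWritable> out, Reporter rep)
            throws IOException {
        // sum all values of the group and emit one result per key
        int sum = 0;
        while (vs.hasNext()) {
            sum += vs.next().get();
        }
        out.collect(k, new IntWritable(sum));
    }

    @Override
    public void configure(JobConf c) {}

    @Override
    public void close() throws IOException {}
}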
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.reporter = new HadoopDummyReporter(); this.reduceCollector = new HadoopOutputCollector<KEYOUT, VALUEOUT>(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<KEYIN, VALUEIN>(keySerializer); }
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.combiner.configure(jobConf); this.reporter = new HadoopDummyReporter(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<>(keySerializer); this.combineCollector = new HadoopOutputCollector<>(); this.reduceCollector = new HadoopOutputCollector<>(); }
@Test
public void testConfigurationViaJobConf() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    JobConf conf = new JobConf();
    conf.set("my.cntPrefix", "Hello");

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper4());

    DataSet<Tuple2<IntWritable, IntWritable>> hellos = ds.
            groupBy(0).
            reduceGroup(new HadoopReduceFunction<IntWritable, Text, IntWritable, IntWritable>(
                    new ConfigurableCntReducer(), conf));

    String resultPath = tempFolder.newFile().toURI().toString();

    hellos.writeAsText(resultPath);
    env.execute();

    String expected = "(0,0)\n" + "(1,0)\n" + "(2,1)\n" + "(3,1)\n" + "(4,1)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
// Fragment: wraps a Hadoop Mapper that emits each input record twice. The surrounding
// DataSet declaration is assumed; the variable names here are illustrative.
DataSet<Tuple2<IntWritable, Text>> duplicatedDs = ds.
        flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new DuplicatingMapper()));
@Test
public void testCombiner() throws Exception {
    org.junit.Assume.assumeThat(mode, new IsEqual<TestExecutionMode>(TestExecutionMode.CLUSTER));
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper3());

    DataSet<Tuple2<IntWritable, IntWritable>> counts = ds.
            groupBy(0).
            reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
                    new SumReducer(), new KeyChangingReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    counts.writeAsText(resultPath);
    env.execute();

    String expected = "(0,5)\n" + "(1,6)\n" + "(2,5)\n" + "(3,5)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
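// For context: a plausible sketch of the KeyChangingReducer used as the combiner above.
// This is an illustrative reconstruction (the exact remapping is an assumption): a
// combiner that rewrites keys exercises the case where the combine phase changes the
// grouping key before the final reduce runs.
public static class KeyChangingReducer implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    public void reduce(IntWritable k, Iterator<IntWritable> vs, OutputCollector<IntWritable, IntWritable> out, Reporter rep)
            throws IOException {
        // forward each value unchanged, but under a remapped key
        while (vs.hasNext()) {
            out.collect(new IntWritable(k.get() % 4), vs.next());
        }
    }

    @Override
    public void configure(JobConf c) {}

    @Override
    public void close() throws IOException {}
}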
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.reporter = new HadoopDummyReporter(); this.reduceCollector = new HadoopOutputCollector<KEYOUT, VALUEOUT>(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<KEYIN, VALUEIN>(keySerializer); }
@SuppressWarnings("unchecked") @Override public void open(Configuration parameters) throws Exception { super.open(parameters); this.reducer.configure(jobConf); this.combiner.configure(jobConf); this.reporter = new HadoopDummyReporter(); Class<KEYIN> inKeyClass = (Class<KEYIN>) TypeExtractor.getParameterType(Reducer.class, reducer.getClass(), 0); TypeSerializer<KEYIN> keySerializer = TypeExtractor.getForClass(inKeyClass).createSerializer(getRuntimeContext().getExecutionConfig()); this.valueIterator = new HadoopTupleUnwrappingIterator<>(keySerializer); this.combineCollector = new HadoopOutputCollector<>(); this.reduceCollector = new HadoopOutputCollector<>(); }
@Test
public void testStandardGrouping() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env).
            map(new Mapper1());

    DataSet<Tuple2<IntWritable, IntWritable>> commentCnts = ds.
            groupBy(0).
            reduceGroup(new HadoopReduceFunction<IntWritable, Text, IntWritable, IntWritable>(new CommentCntReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    commentCnts.writeAsText(resultPath);
    env.execute();

    String expected = "(0,0)\n" + "(1,3)\n" + "(2,5)\n" + "(3,5)\n" + "(4,2)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
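// For context: a minimal sketch of the CommentCntReducer assumed above (illustrative
// reconstruction). It counts, per key, the values that start with "Comment".
public static class CommentCntReducer implements Reducer<IntWritable, Text, IntWritable, IntWritable> {
    @Override
    public void reduce(IntWritable k, Iterator<Text> vs, OutputCollector<IntWritable, IntWritable> out, Reporter rep)
            throws IOException {
        // count the comment entries in this group
        int commentCnt = 0;
        while (vs.hasNext()) {
            if (vs.next().toString().startsWith("Comment")) {
                commentCnt++;
            }
        }
        out.collect(k, new IntWritable(commentCnt));
    }

    @Override
    public void configure(JobConf c) {}

    @Override
    public void close() throws IOException {}
}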