/**
 * Returns a new set containing the first n elements in this grouped and sorted {@link DataSet}.
 *
 * <p>The argument is validated first; only then is a {@code FirstReducer} for {@code n}
 * handed to {@code reduceGroup}.
 *
 * @param n The desired number of elements for each group.
 * @return A GroupReduceOperator that represents the DataSet containing the elements.
 * @throws InvalidProgramException if {@code n} is smaller than 1.
 */
public GroupReduceOperator<T, T> first(int n) {
    final boolean validCount = n >= 1;
    if (!validCount) {
        throw new InvalidProgramException("Parameter n of first(n) must be at least 1.");
    }

    final FirstReducer<T> firstN = new FirstReducer<T>(n);
    return reduceGroup(firstN);
}
/**
 * Builds the two-stage plan for a group-reduce operation on a sorted grouping.
 *
 * <p>Stage one runs a non-combinable {@code IdentityGroupReduce} named
 * "PythonGroupReducePreStep"; stage two feeds its output into a {@code PythonMapPartition}
 * that carries the operation's name. Both stages use the parallelism from {@code info}.
 *
 * @param op1 the sorted grouping to reduce
 * @param info operation info supplying parallelism, environment id, set id and name
 * @param type type information of the produced elements
 * @return the data set produced by the map-partition stage
 */
private <IN, OUT> DataSet<OUT> applyGroupReduceOperation(SortedGrouping<IN> op1, PythonOperationInfo info, TypeInformation<OUT> type) {
    // Stage one: identity reduce over the sorted groups, explicitly not combinable.
    final DataSet<IN> preStep = op1
        .reduceGroup(new IdentityGroupReduce<IN>())
        .setCombinable(false)
        .setParallelism(info.parallelism)
        .name("PythonGroupReducePreStep");

    // Stage two: the actual operation, applied per partition.
    return preStep
        .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type))
        .setParallelism(info.parallelism)
        .name(info.name);
}
@Override public DataSet<Tuple3<K, K, K>> run(Graph<K, VV, EV> input) throws Exception { DataSet<Edge<K, EV>> edges = input.getEdges(); // annotate edges with degrees DataSet<EdgeWithDegrees<K>> edgesWithDegrees = edges.flatMap(new EdgeDuplicator<>()) .groupBy(0).sortGroup(1, Order.ASCENDING).reduceGroup(new DegreeCounter<>()) .groupBy(EdgeWithDegrees.V1, EdgeWithDegrees.V2).reduce(new DegreeJoiner<>()); // project edges by degrees DataSet<Edge<K, NullValue>> edgesByDegree = edgesWithDegrees.map(new EdgeByDegreeProjector<>()); // project edges by vertex id DataSet<Edge<K, NullValue>> edgesById = edgesByDegree.map(new EdgeByIdProjector<>()); DataSet<Tuple3<K, K, K>> triangles = edgesByDegree // build triads .groupBy(EdgeWithDegrees.V1).sortGroup(EdgeWithDegrees.V2, Order.ASCENDING) .reduceGroup(new TriadBuilder<>()) // filter triads .join(edgesById, JoinHint.REPARTITION_HASH_SECOND).where(Triad.V2, Triad.V3).equalTo(0, 1).with(new TriadFilter<>()); return triangles; }
/**
 * Int-based group sort definition on the (full) nested tuple: groups by {@code "f1"} and
 * sorts each group on field position 0 in descending order.
 */
@Test
public void testIntBasedDefinitionOnGroupSortForFullNestedTuple() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple2<Tuple2<Integer, Integer>, String>> input =
        CollectionDataSets.getGroupSortedNestedTupleDataSet(env);

    DataSet<String> reduced = input
        .groupBy("f1")
        .sortGroup(0, Order.DESCENDING)
        .reduceGroup(new NestedTupleReducer());

    List<String> actual = reduced.collect();

    String expected =
        "a--(2,1)-(1,3)-(1,2)-\n" +
        "b--(2,2)-\n" +
        "c--(4,9)-(3,6)-(3,3)-\n";

    compareResultAsText(actual, expected);
}
@Test public void testStringBasedDefinitionOnGroupSortForPartialNestedTuple() throws Exception { /* * Test string-based definition on group sort, for (partial) nested Tuple DESC */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<Tuple2<Tuple2<Integer, Integer>, String>> ds = CollectionDataSets.getGroupSortedNestedTupleDataSet(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("f1").sortGroup("f0.f0", Order.DESCENDING).reduceGroup(new NestedTupleReducer()); List<String> result = reduceDs.collect(); String expected = "a--(2,1)-(1,3)-(1,2)-\n" + "b--(2,2)-\n" + "c--(4,9)-(3,3)-(3,6)-\n"; compareResultAsText(result, expected); }
/**
 * Checks that the input of the combiner is also sorted for a combinable groupReduce with
 * group sorting. Groups by field 1, sorts on field 0 ascending and reduces with
 * {@code OrderCheckingCombinableReduce}.
 */
@Test
public void testInputOfCombinerIsSortedForCombinableGroupReduceWithGroupSorting() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> reduced = input
        .groupBy(1)
        .sortGroup(0, Order.ASCENDING)
        .reduceGroup(new OrderCheckingCombinableReduce());

    List<Tuple3<Integer, Long, String>> actual = reduced.collect();

    String expected =
        "1,1,Hi\n" +
        "2,2,Hello\n" +
        "4,3,Hello world, how are you?\n" +
        "7,4,Comment#1\n" +
        "11,5,Comment#5\n" +
        "16,6,Comment#10\n";

    compareResultAsTuples(actual, expected);
}
/**
 * Checks correctness of groupReduce on tuples with a key field selector and group sorting:
 * groups by field 1, sorts on field 2 ascending and reduces with
 * {@code Tuple3SortedGroupReduce}.
 */
@Test
public void testCorrectnessOfGroupReduceOnTuplesWithKeyFieldSelectorAndGroupSorting() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> reduced = input
        .groupBy(1)
        .sortGroup(2, Order.ASCENDING)
        .reduceGroup(new Tuple3SortedGroupReduce());

    List<Tuple3<Integer, Long, String>> actual = reduced.collect();

    String expected =
        "1,1,Hi\n" +
        "5,2,Hello-Hello world\n" +
        "15,3,Hello world, how are you?-I am fine.-Luke Skywalker\n" +
        "34,4,Comment#1-Comment#2-Comment#3-Comment#4\n" +
        "65,5,Comment#5-Comment#6-Comment#7-Comment#8-Comment#9\n" +
        "111,6,Comment#10-Comment#11-Comment#12-Comment#13-Comment#14-Comment#15\n";

    compareResultAsTuples(actual, expected);
}
/**
 * String-based group sort definition, modelled on the descending group sort correctness
 * test: groups by field 1 and sorts on the field expression {@code "f2"} descending.
 */
@Test
public void testStringBasedDefinitionOnGroupSort() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> reduced = input
        .groupBy(1)
        .sortGroup("f2", Order.DESCENDING)
        .reduceGroup(new Tuple3SortedGroupReduce());

    List<Tuple3<Integer, Long, String>> actual = reduced.collect();

    String expected =
        "1,1,Hi\n" +
        "5,2,Hello world-Hello\n" +
        "15,3,Luke Skywalker-I am fine.-Hello world, how are you?\n" +
        "34,4,Comment#4-Comment#3-Comment#2-Comment#1\n" +
        "65,5,Comment#9-Comment#8-Comment#7-Comment#6-Comment#5\n" +
        "111,6,Comment#15-Comment#14-Comment#13-Comment#12-Comment#11-Comment#10\n";

    compareResultAsTuples(actual, expected);
}
@Test public void testStringBasedDefinitionOnGroupSortForTwoGroupingKeysWithPojos() throws Exception { /* * Test string-based definition on group sort, for two grouping keys with Pojos */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<PojoContainingTupleAndWritable> ds = CollectionDataSets.getGroupSortedPojoContainingTupleAndWritable(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("hadoopFan").sortGroup("theTuple.f0", Order.DESCENDING).sortGroup("theTuple.f1", Order.DESCENDING) .reduceGroup(new GroupReducer5()); List<String> result = reduceDs.collect(); String expected = "1---(10,100)-\n" + "2---(30,600)-(30,400)-(30,200)-(20,201)-(20,200)-\n"; compareResultAsText(result, expected); }
/**
 * Checks correctness of groupReduce with a descending group sort: groups by field 1, sorts
 * on field 2 descending and reduces with {@code Tuple3SortedGroupReduce}.
 */
@Test
public void testCorrectnessOfGroupreduceWithDescendingGroupSort() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> reduced = input
        .groupBy(1)
        .sortGroup(2, Order.DESCENDING)
        .reduceGroup(new Tuple3SortedGroupReduce());

    List<Tuple3<Integer, Long, String>> actual = reduced.collect();

    String expected =
        "1,1,Hi\n" +
        "5,2,Hello world-Hello\n" +
        "15,3,Luke Skywalker-I am fine.-Hello world, how are you?\n" +
        "34,4,Comment#4-Comment#3-Comment#2-Comment#1\n" +
        "65,5,Comment#9-Comment#8-Comment#7-Comment#6-Comment#5\n" +
        "111,6,Comment#15-Comment#14-Comment#13-Comment#12-Comment#11-Comment#10\n";

    compareResultAsTuples(actual, expected);
}
@Test public void testStringBasedDefinitionOnGroupSortForTwoGroupingKeys() throws Exception { /* * Test string-based definition on group sort, for two grouping keys */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<Tuple2<Tuple2<Integer, Integer>, String>> ds = CollectionDataSets.getGroupSortedNestedTupleDataSet(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("f1").sortGroup("f0.f0", Order.DESCENDING).sortGroup("f0.f1", Order.DESCENDING).reduceGroup(new NestedTupleReducer()); List<String> result = reduceDs.collect(); String expected = "a--(2,1)-(1,3)-(1,2)-\n" + "b--(2,2)-\n" + "c--(4,9)-(3,6)-(3,3)-\n"; compareResultAsText(result, expected); }
@Test public void testIntBasedDefinitionOnGroupSortForPartialNestedTuple() throws Exception { /* * Test int-based definition on group sort, for (partial) nested Tuple ASC */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<Tuple2<Tuple2<Integer, Integer>, String>> ds = CollectionDataSets.getGroupSortedNestedTupleDataSet(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("f1") .sortGroup("f0.f0", Order.ASCENDING) .sortGroup("f0.f1", Order.ASCENDING) .reduceGroup(new NestedTupleReducer()); List<String> result = reduceDs.collect(); String expected = "a--(1,2)-(1,3)-(2,1)-\n" + "b--(2,2)-\n" + "c--(3,3)-(3,6)-(4,9)-\n"; compareResultAsText(result, expected); }
/**
 * Sorted group reduce whose result type is declared through
 * {@code returns(BasicTypeInfo.INT_TYPE_INFO)}: groups by field 0, sorts on field 0
 * ascending and reduces with a generic {@code GroupReducer}.
 */
@Test
public void testSortedGroupReduceWithTypeInformationTypeHint() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.getSmall3TupleDataSet(env);

    DataSet<Integer> reduced = input
        .groupBy(0)
        .sortGroup(0, Order.ASCENDING)
        .reduceGroup(new GroupReducer<Tuple3<Integer, Long, String>, Integer>())
        .returns(BasicTypeInfo.INT_TYPE_INFO);

    List<Integer> actual = reduced.collect();

    String expectedResult =
        "2\n" +
        "3\n" +
        "1\n";

    compareResultAsText(actual, expectedResult);
}
/**
 * Checks correctness of sorted groupReduce on a custom type with key selector sorting:
 * groups via {@code TwoTuplePojoExtractor}, sorts descending via
 * {@code StringPojoExtractor} and reduces with {@code CustomTypeSortedGroupReduce}.
 */
@Test
public void testPojoKeySelectorGroupSort() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<CustomType> input = CollectionDataSets.getCustomTypeDataSet(env);

    DataSet<CustomType> reduced = input
        .groupBy(new TwoTuplePojoExtractor())
        .sortGroup(new StringPojoExtractor(), Order.DESCENDING)
        .reduceGroup(new CustomTypeSortedGroupReduce());

    List<CustomType> actual = reduced.collect();

    String expected =
        "1,0,Hi\n" +
        "2,3,Hello world-Hello\n" +
        "3,12,Luke Skywalker-I am fine.-Hello world, how are you?\n" +
        "4,30,Comment#4-Comment#3-Comment#2-Comment#1\n" +
        "5,60,Comment#9-Comment#8-Comment#7-Comment#6-Comment#5\n" +
        "6,105,Comment#15-Comment#14-Comment#13-Comment#12-Comment#11-Comment#10\n";

    compareResultAsText(actual, expected);
}
/**
 * Checks correctness of sorted groupReduce with a Tuple2 key selector for sorting: groups
 * via an {@code IntFieldExtractor} on field 0, sorts descending via
 * {@code FiveToTwoTupleExtractor} and reduces with {@code Tuple5SortedGroupReduce}.
 */
@Test
public void testTupleKeySelectorSortCombineOnTuple() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> input =
        CollectionDataSets.get5TupleDataSet(env);

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> reduced = input
        .groupBy(new IntFieldExtractor<Tuple5<Integer, Long, Integer, String, Long>>(0))
        .sortGroup(new FiveToTwoTupleExtractor(), Order.DESCENDING)
        .reduceGroup(new Tuple5SortedGroupReduce());

    List<Tuple5<Integer, Long, Integer, String, Long>> actual = reduced.collect();

    String expected =
        "1,1,0,Hallo,1\n" +
        "2,5,0,Hallo Welt-Hallo Welt wie,1\n" +
        "3,15,0,BCD-ABC-Hallo Welt wie gehts?,2\n" +
        "4,34,0,FGH-CDE-EFG-DEF,1\n" +
        "5,65,0,IJK-HIJ-KLM-JKL-GHI,1\n";

    compareResultAsTuples(actual, expected);
}
@Test public void testIdentityWithGroupByAndSort() throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env); DataSet<Tuple3<Integer, Long, String>> reduceDs = ds .groupBy(1) .sortGroup(1, Order.DESCENDING) // reduce partially .combineGroup(new IdentityFunction()) .groupBy(1) .sortGroup(1, Order.DESCENDING) // fully reduce .reduceGroup(new IdentityFunction()); List<Tuple3<Integer, Long, String>> result = reduceDs.collect(); compareResultAsTuples(result, identityResult); }
/**
 * Compiles a plan with a custom partitioner on a tuple grouping (key 0, sorted on field 1
 * ascending) and checks the ship strategies: FORWARD into the sink, PARTITION_CUSTOM into
 * the reducer and FORWARD into the combiner.
 */
@Test
public void testCustomPartitioningTupleGroupReduceSorted() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple3<Integer, Integer, Integer>> source = env
            .fromElements(new Tuple3<Integer, Integer, Integer>(0, 0, 0))
            .rebalance()
            .setParallelism(4);

        source
            .groupBy(0)
            .withPartitioner(new TestPartitionerInt())
            .sortGroup(1, Order.ASCENDING)
            .reduceGroup(new IdentityGroupReducerCombinable<Tuple3<Integer,Integer,Integer>>())
            .output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

        Plan plan = env.createProgramPlan();
        OptimizedPlan optimized = compileNoStats(plan);

        // Walk the plan backwards from the sink: sink <- reducer <- combiner.
        SinkPlanNode sinkNode = optimized.getDataSinks().iterator().next();
        SingleInputPlanNode reducerNode = (SingleInputPlanNode) sinkNode.getInput().getSource();
        SingleInputPlanNode combinerNode = (SingleInputPlanNode) reducerNode.getInput().getSource();

        assertEquals(ShipStrategyType.FORWARD, sinkNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_CUSTOM, reducerNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.FORWARD, combinerNode.getInput().getShipStrategy());
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Checks correctness of sorted groupReduce on tuples with key selector sorting: groups via
 * a {@code LongFieldExtractor} on field 1, sorts descending via a
 * {@code StringFieldExtractor} on field 2 and reduces with {@code Tuple3SortedGroupReduce}.
 */
@Test
public void testTupleKeySelectorGroupSort() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> reduced = input
        .groupBy(new LongFieldExtractor<Tuple3<Integer, Long, String>>(1))
        .sortGroup(new StringFieldExtractor<Tuple3<Integer, Long, String>>(2), Order.DESCENDING)
        .reduceGroup(new Tuple3SortedGroupReduce());

    List<Tuple3<Integer, Long, String>> actual = reduced.collect();

    String expected =
        "1,1,Hi\n" +
        "5,2,Hello world-Hello\n" +
        "15,3,Luke Skywalker-I am fine.-Hello world, how are you?\n" +
        "34,4,Comment#4-Comment#3-Comment#2-Comment#1\n" +
        "65,5,Comment#9-Comment#8-Comment#7-Comment#6-Comment#5\n" +
        "111,6,Comment#15-Comment#14-Comment#13-Comment#12-Comment#11-Comment#10\n";

    compareResultAsTuples(actual, expected);
}
/**
 * Compiles a plan with a custom partitioner on a {@code Pojo3} grouping (key "a", sorted on
 * "b" ascending) and checks the ship strategies: FORWARD into the sink, PARTITION_CUSTOM
 * into the reducer and FORWARD into the combiner.
 */
@Test
public void testCustomPartitioningTupleGroupReduceSorted() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Pojo3> source = env
            .fromElements(new Pojo3())
            .rebalance()
            .setParallelism(4);

        source
            .groupBy("a")
            .withPartitioner(new TestPartitionerInt())
            .sortGroup("b", Order.ASCENDING)
            .reduceGroup(new IdentityGroupReducerCombinable<Pojo3>())
            .output(new DiscardingOutputFormat<Pojo3>());

        Plan plan = env.createProgramPlan();
        OptimizedPlan optimized = compileNoStats(plan);

        // Walk the plan backwards from the sink: sink <- reducer <- combiner.
        SinkPlanNode sinkNode = optimized.getDataSinks().iterator().next();
        SingleInputPlanNode reducerNode = (SingleInputPlanNode) sinkNode.getInput().getSource();
        SingleInputPlanNode combinerNode = (SingleInputPlanNode) reducerNode.getInput().getSource();

        assertEquals(ShipStrategyType.FORWARD, sinkNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_CUSTOM, reducerNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.FORWARD, combinerNode.getInput().getShipStrategy());
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Compiles a plan with a custom partitioner on a {@code Pojo4} grouping (key "a", sorted on
 * "b" ascending and then "c" descending) and checks the ship strategies: FORWARD into the
 * sink, PARTITION_CUSTOM into the reducer and FORWARD into the combiner.
 */
@Test
public void testCustomPartitioningTupleGroupReduceSorted2() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Pojo4> source = env
            .fromElements(new Pojo4())
            .rebalance()
            .setParallelism(4);

        source
            .groupBy("a")
            .withPartitioner(new TestPartitionerInt())
            .sortGroup("b", Order.ASCENDING)
            .sortGroup("c", Order.DESCENDING)
            .reduceGroup(new IdentityGroupReducerCombinable<Pojo4>())
            .output(new DiscardingOutputFormat<Pojo4>());

        Plan plan = env.createProgramPlan();
        OptimizedPlan optimized = compileNoStats(plan);

        // Walk the plan backwards from the sink: sink <- reducer <- combiner.
        SinkPlanNode sinkNode = optimized.getDataSinks().iterator().next();
        SingleInputPlanNode reducerNode = (SingleInputPlanNode) sinkNode.getInput().getSource();
        SingleInputPlanNode combinerNode = (SingleInputPlanNode) reducerNode.getInput().getSource();

        assertEquals(ShipStrategyType.FORWARD, sinkNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_CUSTOM, reducerNode.getInput().getShipStrategy());
        assertEquals(ShipStrategyType.FORWARD, combinerNode.getInput().getShipStrategy());
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}