@Test public void testRejectWhenSolutionSetKeysDontMatchCoGroup() { try { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); @SuppressWarnings("unchecked") DataSet<Tuple3<Double, Long, String>> initialSolutionSet = env.fromElements(new Tuple3<Double, Long, String>(3.44, 5L, "abc")); @SuppressWarnings("unchecked") DataSet<Tuple2<Double, String>> initialWorkSet = env.fromElements(new Tuple2<Double, String>(1.23, "abc")); DeltaIteration<Tuple3<Double, Long, String>, Tuple2<Double, String>> iteration = initialSolutionSet.iterateDelta(initialWorkSet, 10, 1); try { iteration.getWorkset().coGroup(iteration.getSolutionSet()).where(1).equalTo(2).with(new SolutionWorksetCoGroup1()); fail("Accepted invalid program."); } catch (InvalidProgramException e) { // all good! } try { iteration.getSolutionSet().coGroup(iteration.getWorkset()).where(2).equalTo(1).with(new SolutionWorksetCoGroup2()); fail("Accepted invalid program."); } catch (InvalidProgramException e) { // all good! } } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail(e.getMessage()); } }
/**
 * Runs a coGroup whose function is a Java lambda: for every key, the key
 * field (f0) of all records from both inputs is summed and emitted.
 * The output type is declared explicitly through {@code returns(Integer.class)}.
 */
@Test
public void testCoGroupLambda() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<Integer, String>> leftInput = env.fromElements(
            new Tuple2<>(1, "hello"),
            new Tuple2<>(2, "what's"),
            new Tuple2<>(2, "up")
    );
    DataSet<Tuple2<Integer, String>> rightInput = env.fromElements(
            new Tuple2<>(1, "not"),
            new Tuple2<>(1, "much"),
            new Tuple2<>(2, "really")
    );

    DataSet<Integer> keySums = leftInput
            .coGroup(rightInput)
            .where(0)
            .equalTo(0)
            .with((Iterable<Tuple2<Integer, String>> leftGroup,
                    Iterable<Tuple2<Integer, String>> rightGroup,
                    Collector<Integer> collector) -> {
                int total = 0;
                for (Tuple2<Integer, String> record : leftGroup) {
                    total += record.f0;
                }
                for (Tuple2<Integer, String> record : rightGroup) {
                    total += record.f0;
                }
                collector.collect(total);
            })
            .returns(Integer.class);

    // Key 1: 1 + 1 + 1 = 3; key 2: 2 + 2 + 2 = 6.
    compareResultAsText(keySums.collect(), "6\n3\n");
}
@Test public void testCoGroupKeySelectors1() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<CustomType> ds1 = env.fromCollection(customTypeData); DataSet<CustomType> ds2 = env.fromCollection(customTypeData); // should work try { ds1.coGroup(ds2) .where( new KeySelector<CustomType, Long>() { @Override public Long getKey(CustomType value) { return value.myLong; } } ) .equalTo( new KeySelector<CustomType, Long>() { @Override public Long getKey(CustomType value) { return value.myLong; } } ); } catch (Exception e) { Assert.fail(); } }
/**
 * Test coGroup on tuples with multiple key field positions where both inputs
 * are range-partitioned beforehand using the same customized distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> fiveTuples =
            CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> threeTuples =
            CollectionDataSets.get3TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution distribution = new TestDistribution();

    // Range-partition each input on the same fields that are used as coGroup keys below.
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> partitionedFiveTuples =
            DataSetUtils.partitionByRange(fiveTuples, distribution, 0, 4);
    DataSet<Tuple3<Integer, Long, String>> partitionedThreeTuples =
            DataSetUtils.partitionByRange(threeTuples, distribution, 0, 1);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = partitionedFiveTuples
            .coGroup(partitionedThreeTuples)
            .where(0, 4)
            .equalTo(0, 1)
            .with(new Tuple5Tuple3CoGroup());

    List<Tuple3<Integer, Long, String>> actual = coGrouped.collect();

    String expected = "1,1,Hallo\n"
            + "2,2,Hallo Welt\n"
            + "3,2,Hallo Welt wie gehts?\n"
            + "3,2,ABC\n"
            + "5,3,HIJ\n"
            + "5,3,IJK\n";

    compareResultAsTuples(actual, expected);
}
cfg.setString(Optimizer.HINT_SHIP_STRATEGY_SECOND_INPUT, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_RANGE); input.coGroup(input).where(0).equalTo(0) .with(new DummyCoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>>()) .withParameters(cfg)
.where(new Pojo2KeySelector()).equalTo(new Pojo3KeySelector()) .withPartitioner(partitioner) .with(new DummyCoGroupFunction<Pojo2, Pojo3>())
.where(1).equalTo(0) .withPartitioner(partitioner) .with(new DummyCoGroupFunction<Tuple2<Long, Long>, Tuple3<Long, Long, Long>>())
.where("b").equalTo("a") .withPartitioner(partitioner) .with(new DummyCoGroupFunction<Pojo2, Pojo3>())
@Override protected void testProgram() throws Exception { // set up execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // read vertex and edge data DataSet<Long> vertices = env.fromElements(ConnectedComponentsData.getEnumeratingVertices(NUM_VERTICES).split("\n")) .map(new VertexParser()); DataSet<Tuple2<Long, Long>> edges = env.fromElements(ConnectedComponentsData.getRandomOddEvenEdges(NUM_EDGES, NUM_VERTICES, SEED).split("\n")) .flatMap(new EdgeParser()); // assign the initial components (equal to the vertex id) DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>()); // open a delta iteration DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithInitialId.iterateDelta(verticesWithInitialId, 100, 0); // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller DataSet<Tuple2<Long, Long>> changes = iteration .getWorkset().join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin()) .coGroup(iteration.getSolutionSet()).where(0).equalTo(0) .with(new MinIdAndUpdate()); // close the delta iteration (delta and new workset are identical) DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes); // emit result List<Tuple2<Long, Long>> resutTuples = new ArrayList<>(); result.output(new LocalCollectionOutputFormat<>(resutTuples)); env.execute(); }
@Test public void testCoGroupWithTuplesWrongType() { try { final Partitioner<Integer> partitioner = new TestPartitionerInt(); ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple2<Long, Long>> input1 = env.fromElements(new Tuple2<Long, Long>(0L, 0L)); DataSet<Tuple3<Long, Long, Long>> input2 = env.fromElements(new Tuple3<Long, Long, Long>(0L, 0L, 0L)); try { input1 .coGroup(input2) .where(1).equalTo(0) .withPartitioner(partitioner); fail("should throw an exception"); } catch (InvalidProgramException e) { // expected } } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } }
/**
 * Combines this graph's vertices with a {@code Tuple2} DataSet and rewrites the
 * vertex values through a user-defined function.
 *
 * <p>The vertex ID (field 0) and the first field of the input tuples serve as the
 * keys; the operation is implemented as a coGroup on those keys.
 *
 * @param inputDataSet the Tuple2 DataSet to join with. Its first field is the
 *        join key; its second field is handed to the transformation function.
 * @param vertexJoinFunction the transformation function to apply. It receives
 *        the current vertex value as first parameter and the value of the matched
 *        Tuple2 from the input DataSet as second parameter.
 * @param <T> the type of the second field of the input Tuple2 DataSet.
 * @return a new Graph whose vertex values have been updated according to the
 *         result of the vertexJoinFunction; edges and context are carried over.
 */
public <T> Graph<K, VV, EV> joinWithVertices(DataSet<Tuple2<K, T>> inputDataSet,
        final VertexJoinFunction<VV, T> vertexJoinFunction) {

    DataSet<Vertex<K, VV>> updatedVertices = this.getVertices()
            .coGroup(inputDataSet)
            .where(0)
            .equalTo(0)
            .with(new ApplyCoGroupToVertexValues<>(vertexJoinFunction))
            .name("Join with vertices");

    return new Graph<>(updatedVertices, this.edges, this.context);
}
@Test public void testCoGroupWithKeySelectorsWrongType() { try { final Partitioner<Long> partitioner = new TestPartitionerLong(); ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Pojo2> input1 = env.fromElements(new Pojo2()); DataSet<Pojo3> input2 = env.fromElements(new Pojo3()); try { input1 .coGroup(input2) .where(new Pojo2KeySelector()).equalTo(new Pojo3KeySelector()) .withPartitioner(partitioner); fail("should throw an exception"); } catch (InvalidProgramException e) { // expected } } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } }
/**
 * Combines this graph's edges with a {@code Tuple2} DataSet, keyed on the edge
 * target ID, and rewrites the edge values through a user-defined function.
 *
 * <p>The target ID of the edges (field 1) and the first field of the input tuples
 * serve as the keys; the operation is implemented as a coGroup on those keys.
 *
 * @param inputDataSet the DataSet to join with. The first field of the Tuple2 is
 *        the join key; the second field is handed to the transformation function.
 * @param edgeJoinFunction the transformation function to apply. It receives the
 *        current edge value as first parameter and the value of the matched Tuple2
 *        from the input DataSet as second parameter.
 * @param <T> the type of the second field of the input Tuple2 DataSet.
 * @return a new Graph whose edge values have been updated according to the
 *         result of the edgeJoinFunction; vertices and context are carried over.
 */
public <T> Graph<K, VV, EV> joinWithEdgesOnTarget(DataSet<Tuple2<K, T>> inputDataSet,
        final EdgeJoinFunction<EV, T> edgeJoinFunction) {

    DataSet<Edge<K, EV>> updatedEdges = this.getEdges()
            .coGroup(inputDataSet)
            .where(1)
            .equalTo(0)
            .with(new ApplyCoGroupToEdgeValuesOnEitherSourceOrTarget<>(edgeJoinFunction))
            .name("Join with edges on target");

    return new Graph<>(this.vertices, updatedEdges, this.context);
}
/**
 * Combines this graph's edges with a {@code Tuple2} DataSet, keyed on the edge
 * source ID, and rewrites the edge values through a user-defined function.
 *
 * <p>The source ID of the edges (field 0) and the first field of the input tuples
 * serve as the keys; the operation is implemented as a coGroup on those keys.
 *
 * @param inputDataSet the DataSet to join with. The first field of the Tuple2 is
 *        the join key; the second field is handed to the transformation function.
 * @param edgeJoinFunction the transformation function to apply. It receives the
 *        current edge value as first parameter and the value of the matched Tuple2
 *        from the input DataSet as second parameter.
 * @param <T> the type of the second field of the input Tuple2 DataSet.
 * @return a new Graph whose edge values have been updated according to the
 *         result of the edgeJoinFunction; vertices and context are carried over.
 */
public <T> Graph<K, VV, EV> joinWithEdgesOnSource(DataSet<Tuple2<K, T>> inputDataSet,
        final EdgeJoinFunction<EV, T> edgeJoinFunction) {

    DataSet<Edge<K, EV>> updatedEdges = this.getEdges()
            .coGroup(inputDataSet)
            .where(0)
            .equalTo(0)
            .with(new ApplyCoGroupToEdgeValuesOnEitherSourceOrTarget<>(edgeJoinFunction))
            .name("Join with edges on source");

    return new Graph<>(this.vertices, updatedEdges, this.context);
}
/**
 * Combines this graph's edges with a {@code Tuple3} DataSet on the composite key
 * of source and target ID, and rewrites the edge values through a user-defined
 * function.
 *
 * <p>Fields 0 and 1 of the edges and the first two fields of the input tuples
 * serve as the keys; the operation is implemented as a coGroup on those keys.
 *
 * @param inputDataSet the DataSet to join with. The first two fields of the
 *        Tuple3 form the composite join key; the third field is handed to the
 *        transformation function.
 * @param edgeJoinFunction the transformation function to apply. It receives the
 *        current edge value as first parameter and the value of the matched Tuple3
 *        from the input DataSet as second parameter.
 * @param <T> the type of the third field of the input Tuple3 DataSet.
 * @return a new Graph whose edge values have been updated according to the
 *         result of the edgeJoinFunction; vertices and context are carried over.
 */
public <T> Graph<K, VV, EV> joinWithEdges(DataSet<Tuple3<K, K, T>> inputDataSet,
        final EdgeJoinFunction<EV, T> edgeJoinFunction) {

    DataSet<Edge<K, EV>> updatedEdges = this.getEdges()
            .coGroup(inputDataSet)
            .where(0, 1)
            .equalTo(0, 1)
            .with(new ApplyCoGroupToEdgeValues<>(edgeJoinFunction))
            .name("Join with edges");

    return new Graph<>(this.vertices, updatedEdges, this.context);
}
@Test public void testCoGroupWithPojosWrongType() { try { final Partitioner<Long> partitioner = new TestPartitionerLong(); ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Pojo2> input1 = env.fromElements(new Pojo2()); DataSet<Pojo3> input2 = env.fromElements(new Pojo3()); try { input1 .coGroup(input2) .where("a").equalTo("b") .withPartitioner(partitioner); fail("should throw an exception"); } catch (InvalidProgramException e) { // expected } } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } }
/**
 * Compiles a coGroup whose two inputs are range-partitioned on field 0 with two
 * separately constructed {@code TestDistribution(3)} instances, and asserts
 * that the optimized plan ships both coGroup inputs with the FORWARD strategy.
 */
@Test
public void CoGroupWithSameDistributionTest() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> firstInput = env.readCsvFile(IN_FILE)
            .types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> secondInput = env.readCsvFile(IN_FILE)
            .types(Integer.class, Integer.class, Integer.class);

    TestDistribution firstDistribution = new TestDistribution(3);
    TestDistribution secondDistribution = new TestDistribution(3);

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped =
            DataSetUtils.partitionByRange(firstInput, firstDistribution, 0)
                    .coGroup(DataSetUtils.partitionByRange(secondInput, secondDistribution, 0))
                    .where(0)
                    .equalTo(0)
                    .with(new CoGroupFunc());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    // The coGroup is the direct predecessor of the single data sink.
    SinkPlanNode sinkNode = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sinkNode.getInput().getSource();

    Channel firstChannel = coGroupNode.getInput1();
    Channel secondChannel = coGroupNode.getInput2();

    assertEquals(ShipStrategyType.FORWARD, firstChannel.getShipStrategy());
    assertEquals(ShipStrategyType.FORWARD, secondChannel.getShipStrategy());
}
@Test public void testCoGroupKeyMixing2() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<CustomType> ds2 = env.fromCollection(customTypeData); // should work try { ds1.coGroup(ds2) .where(3) .equalTo( new KeySelector<CustomType, Long>() { @Override public Long getKey(CustomType value) { return value.myLong; } } ); } catch (Exception e) { Assert.fail(); } }
@Test public void testCoGroupKeyMixing1() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<CustomType> ds1 = env.fromCollection(customTypeData); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.coGroup(ds2) .where( new KeySelector<CustomType, Long>() { @Override public Long getKey(CustomType value) { return value.myLong; } } ) .equalTo(3); } catch (Exception e) { Assert.fail(); } }
/**
 * Compiles a coGroup on keys (0, 2) / (2, 1) whose first input is
 * hash-partitioned on fields (0, 2) and whose second input is hash-partitioned
 * on field 1, each followed by a mapper that declares the partitioning fields
 * as forwarded. The resulting coGroup plan node is handed to
 * {@code checkValidCoGroupInputProperties} for validation.
 */
@Test
public void reuseBothPartitioningCoGroup4() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> firstInput = env.readCsvFile(IN_FILE)
            .types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> secondInput = env.readCsvFile(IN_FILE)
            .types(Integer.class, Integer.class, Integer.class);

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped = firstInput
            .partitionByHash(0, 2)
            .map(new MockMapper())
            .withForwardedFields("0;2")
            .coGroup(
                    secondInput
                            .partitionByHash(1)
                            .map(new MockMapper())
                            .withForwardedFields("1"))
            .where(0, 2)
            .equalTo(2, 1)
            .with(new MockCoGroup());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    // The coGroup is the direct predecessor of the single data sink.
    SinkPlanNode sinkNode = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sinkNode.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}