/**
 * Maps every element of the vertex DataSet through a {@code VertexToTuple2Map}.
 *
 * @return the vertex DataSet as Tuple2.
 */
public DataSet<Tuple2<K, VV>> getVerticesAsTuple2() {
    final DataSet<Tuple2<K, VV>> vertexTuples = vertices.map(new VertexToTuple2Map<>());
    return vertexTuples;
}
/**
 * Runs a word count over {@code WordCountData.TEXT} and emits the aggregated
 * (word, count) tuples into {@code resultsCollected} via a local collection output format.
 */
@Override
protected void testProgram() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Tokenize the text, group on tuple field 0 and sum up tuple field 1.
    DataSet<Tuple2<String, Integer>> wordCounts = env.fromElements(WordCountData.TEXT)
            .flatMap(new WordCount.Tokenizer())
            .groupBy(0)
            .aggregate(Aggregations.SUM, 1);

    wordCounts.output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(resultsCollected));

    env.execute("Word Count Collection");
}
}
@Test public void testUnionWithEmptyDataSet() throws Exception { /* * Test on union with empty dataset */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // Don't know how to make an empty result in an other way than filtering it DataSet<Tuple3<Integer, Long, String>> empty = CollectionDataSets.get3TupleDataSet(env). filter(new RichFilter1()); DataSet<Tuple3<Integer, Long, String>> unionDs = CollectionDataSets.get3TupleDataSet(env) .union(empty); List<Tuple3<Integer, Long, String>> result = unionDs.collect(); String expected = FULL_TUPLE_3_STRING; compareResultAsTuples(result, expected); }
/**
 * Convenience method to get the count (number of elements) of a DataSet.
 *
 * <p>Attaches a {@code Utils.CountHelper} output named {@code "count()"} under a freshly
 * generated id, executes the program through the execution environment, and reads the
 * accumulator registered under that id from the execution result.
 *
 * @return A long integer that represents the number of elements in the data set.
 * @throws Exception if attaching the output or executing the program fails.
 */
public long count() throws Exception {
    final String accumulatorId = new AbstractID().toString();

    output(new Utils.CountHelper<T>(accumulatorId)).name("count()");

    final JobExecutionResult executionResult = getExecutionEnvironment().execute();
    final Long elementCount = executionResult.getAccumulatorResult(accumulatorId);
    return elementCount;
}
/**
 * Checks that the edge set input contains valid vertex Ids, i.e. that they
 * also exist in the vertex input set.
 *
 * @param graph the graph whose edge and vertex sets are compared.
 * @return a boolean stating whether a graph is valid
 *         with respect to its vertex ids.
 * @throws Exception if counting the invalid ids fails.
 */
@Override
public boolean validate(Graph<K, VV, EV> graph) throws Exception {
    // Distinct ids produced from the edges by MapEdgeIds.
    DataSet<Tuple1<K>> distinctEdgeIds = graph.getEdges()
            .flatMap(new MapEdgeIds<>()).distinct();

    // Co-group vertices with the edge ids on field 0; at most one result element is kept.
    DataSet<K> firstInvalidId = graph.getVertices()
            .coGroup(distinctEdgeIds)
            .where(0)
            .equalTo(0)
            .with(new GroupInvalidIds<>())
            .first(1);

    final long invalidCount = firstInvalidId.map(new KToTupleMap<>()).count();
    return invalidCount == 0;
}
/**
 * Builds and executes a program with two bulk iterations, where the result of the first
 * iteration is joined inside the step function of the second one. The final result is discarded.
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Long> firstSource = env.generateSequence(1, 100);
    DataSet<Long> secondSource = env.generateSequence(1, 100);

    // First iteration: the step function is a single IdMapper.
    IterativeDataSet<Long> firstLoop = firstSource.iterate(100);
    DataSet<Long> firstLoopStep = firstLoop.map(new IdMapper());
    DataSet<Long> firstLoopResult = firstLoop.closeWith(firstLoopStep);

    // Main iteration: its partial solution is joined with the closed first iteration.
    IterativeDataSet<Long> mainLoop = secondSource.map(new IdMapper()).iterate(100);
    DataSet<Long> joinedWithFirst = mainLoop
            .join(firstLoopResult)
            .where(new IdKeyExtractor())
            .equalTo(new IdKeyExtractor())
            .with(new Joiner());

    mainLoop.closeWith(joinedWithFirst).output(new DiscardingOutputFormat<Long>());

    env.execute();
}
@Test(expected = IndexOutOfBoundsException.class) public void testGroupByKeyFields4() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> tupleDs = env.fromCollection(emptyTupleData, tupleTypeInfo); // should not work, key out of tuple bounds tupleDs.groupBy(5); }
/**
 * Union of 2 same data sets: the collected result must match
 * {@code FULL_TUPLE_3_STRING} concatenated with itself.
 */
@Test
public void testUnion2IdenticalDataSets() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> firstCopy = CollectionDataSets.get3TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> secondCopy = CollectionDataSets.get3TupleDataSet(env);

    List<Tuple3<Integer, Long, String>> actual = firstCopy.union(secondCopy).collect();

    compareResultAsTuples(actual, FULL_TUPLE_3_STRING + FULL_TUPLE_3_STRING);
}
/**
 * Executes a bulk iteration whose step function is a union of a static input (unioned with
 * itself) and the partial solution (unioned with itself). The output is collected into the
 * {@code result} field of this instance.
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Long> inputStatic = env.generateSequence(1, 4);
    DataSet<Long> inputIteration = env.generateSequence(1, 4);

    IterativeDataSet<Long> loop = inputIteration.iterate(3);

    DataSet<Long> doubledStatic = inputStatic.union(inputStatic);
    DataSet<Long> doubledPartialSolution = loop.union(loop);
    DataSet<Long> loopResult = loop.closeWith(doubledStatic.union(doubledPartialSolution));

    loopResult.output(new LocalCollectionOutputFormat<Long>(this.result));

    env.execute();
}
/**
 * Wraps a generated sequence into Tuple1 records, then runs two consecutive joins on
 * field 0 (each projecting field 0 of its first input) and writes the result as text
 * to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Long> sequence = env.generateSequence(0, 100000);

    DataSet<Tuple1<Long>> wrapped = sequence.map(new TupleWrapper());
    DataSet<Tuple1<Long>> projected = wrapped.project(0);
    DataSet<Tuple1<Long>> wrappedAgain = sequence.map(new TupleWrapper());

    // The join chain is kept as a single expression so the projection types are inferred as before.
    projected
            .join(wrappedAgain).where(0).equalTo(0).projectFirst(0)
            .join(wrapped).where(0).equalTo(0).projectFirst(0)
            .writeAsText(resultPath);

    env.execute();
}
/**
 * Runs a bulk iteration in which the partial solution is only consumed as the broadcast
 * set {@code "bc"} of a group-reduce over the static input, and asserts that the first
 * collected element equals 8.
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> numbers = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);
    IterativeDataSet<Integer> loop = numbers.iterate(10);

    // The step function reads the static input and receives the partial solution via broadcast.
    DataSet<Integer> stepResult = numbers
            .reduceGroup(new PickOneAllReduce())
            .withBroadcastSet(loop, "bc");

    final List<Integer> collected = new ArrayList<Integer>();
    DataSet<Integer> loopResult = loop.closeWith(stepResult);
    loopResult.output(new LocalCollectionOutputFormat<Integer>(collected));

    env.execute();

    Assert.assertEquals(8, collected.get(0).intValue());
}
/**
 * Test support for Date and enum serialization.
 *
 * <p>Unions mapped sequence elements with the {@code PojoWithDateAndEnum} collection data set,
 * groups by the {@code "group"} field, and expects the group-reduce to emit {@code "ok"} twice.
 */
@Test
public void testSupportForDataAndEnumSerialization() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<PojoWithDateAndEnum> generated = env.generateSequence(0, 2).map(new Mapper1());
    DataSet<PojoWithDateAndEnum> combined =
            generated.union(CollectionDataSets.getPojoWithDateAndEnum(env));

    List<String> actual = combined
            .groupBy("group")
            .reduceGroup(new GroupReducer1())
            .collect();

    compareResultAsText(actual, "ok\nok");
}
/**
 * Creates a program plan with a bulk iteration over the "Big" input whose step function
 * joins the static "Small" input (first join input) with the partial solution (second join input).
 *
 * @param strategy value set for {@code Optimizer.HINT_LOCAL_STRATEGY} on the join's parameters.
 * @return the program plan created by the execution environment.
 */
private Plan getTestPlanLeftStatic(String strategy) {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    // Parameters handed to the join operator below.
    Configuration localStrategyHint = new Configuration();
    localStrategyHint.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> bigSource = env.fromElements(
            new Tuple3<Long, Long, Long>(1L, 2L, 3L),
            new Tuple3<Long, Long, Long>(1L, 2L, 3L),
            new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Big");

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> smallSource = env.fromElements(
            new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Small");

    IterativeDataSet<Tuple3<Long, Long, Long>> loop = bigSource.iterate(10);

    DataSet<Tuple3<Long, Long, Long>> joinedInLoop = smallSource
            .join(loop)
            .where(0)
            .equalTo(0)
            .with(new DummyJoiner())
            .name("DummyJoiner")
            .withParameters(localStrategyHint);

    loop.closeWith(joinedInLoop)
            .output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
/**
 * Test all-rejecting filter: the 3-tuple data set filtered through {@code Filter1}
 * is compared against the expected string {@code "\n"}.
 */
@Test
public void testAllRejectingFilter() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    List<Tuple3<Integer, Long, String>> actual = CollectionDataSets.get3TupleDataSet(env)
            .filter(new Filter1())
            .collect();

    compareResultAsTuples(actual, "\n");
}
/**
 * Joining an {@code Integer} data set with an {@code ArrayList<Integer>} data set using the
 * {@code "*"} key expression on both sides is expected to raise an {@code InvalidProgramException}.
 */
@Test(expected = InvalidProgramException.class)
public void testJoinKeyInvalidAtomic6() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> integers = env.fromElements(0, 0, 0);
    DataSet<ArrayList<Integer>> lists = env.fromElements(new ArrayList<Integer>());

    integers.join(lists)
            .where("*")
            .equalTo("*");
}
/**
 * Runs a bulk iteration (at most 5 iterations, parallelism 4) whose step function is a
 * {@code SumReducer} group-reduce. A {@code TerminationFilter} over the step result is
 * passed to {@code closeWith} as its second argument. The collected output is checked
 * against {@code EXPECTED}.
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<String> source = env.fromElements("1", "2", "3", "4", "5").name("input");
    IterativeDataSet<String> loop = source.iterate(5).name("Loop");

    DataSet<String> summed = loop
            .reduceGroup(new SumReducer())
            .name("Compute sum (GroupReduce");

    DataSet<String> terminationCriterion = summed
            .filter(new TerminationFilter())
            .name("Compute termination criterion (Map)");

    List<String> collected = loop.closeWith(summed, terminationCriterion).collect();

    containsResultAsText(collected, EXPECTED);
}
/**
 * Test non-passing flatmap: the string data set flat-mapped through {@code FlatMapper1}
 * is compared against the expected text {@code "\n"}.
 */
@Test
public void testNonPassingFlatMap() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    List<String> actual = CollectionDataSets.getStringDataSet(env)
            .flatMap(new FlatMapper1())
            .collect();

    compareResultAsText(actual, "\n");
}
/**
 * Attaches a {@code Utils.CollectHelper} output, built from this sink's accumulator name
 * and serializer, to the given data set and names it "SQL Client Batch Collect Sink".
 *
 * @param dataSet the rows to be emitted.
 */
@Override
public void emitDataSet(DataSet<Row> dataSet) {
    final Utils.CollectHelper<Row> collectFormat =
            new Utils.CollectHelper<>(accumulatorName, serializer);

    dataSet.output(collectFormat).name("SQL Client Batch Collect Sink");
}
/**
 * Builds a flat-map over the 3-tuple collection data set that emits, for every tuple,
 * its {@code f2} (String) field.
 *
 * @param env the execution environment used to create the source data set.
 * @return the flat-map operator producing the {@code f2} values.
 */
private FlatMapOperator<Tuple3<Integer, Long, String>, String> getSourceDataSet(ExecutionEnvironment env) {
    final FlatMapFunction<Tuple3<Integer, Long, String>, String> emitThirdField =
            new FlatMapFunction<Tuple3<Integer, Long, String>, String>() {
                @Override
                public void flatMap(Tuple3<Integer, Long, String> value, Collector<String> out) throws Exception {
                    out.collect(value.f2);
                }
            };

    return CollectionDataSets.get3TupleDataSet(env).flatMap(emitThirdField);
}
/**
 * Maps the sequence 1..10 with a {@code Mapper} that receives the union of the sequences
 * 1..5 and 6..10 as broadcast set {@code BC_NAME}, reduces the mapped values with a
 * {@code Reducer}, and asserts that the first collected element equals 3025.
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Long> numbers = env.generateSequence(1, 10);
    DataSet<Long> lowerBroadcast = env.generateSequence(1, 5);
    DataSet<Long> upperBroadcast = env.generateSequence(6, 10);

    List<Long> collected = numbers
            .map(new Mapper())
            .withBroadcastSet(lowerBroadcast.union(upperBroadcast), BC_NAME)
            .reduce(new Reducer())
            .collect();

    final Long expectedValue = Long.valueOf(3025);
    Assert.assertEquals(expectedValue, collected.get(0));
}