/**
 * Creates a Graph from CSV input with edge values, but without vertex values.
 *
 * @param vertexKey the type of the vertex IDs
 * @param edgeValue the type of the edge values
 * @return a Graph where the edges are read from an edges CSV file (with values).
 */
public <K, EV> Graph<K, NullValue, EV> edgeTypes(Class<K> vertexKey, Class<EV> edgeValue) {
    if (edgeReader == null) {
        throw new RuntimeException("The edge input file cannot be null!");
    }

    DataSet<Tuple3<K, K, EV>> edges = edgeReader
        .types(vertexKey, vertexKey, edgeValue)
        .name(GraphCsvReader.class.getName());

    return Graph.fromTupleDataSet(edges, executionContext);
}
/**
 * Creates a Graph from CSV input without vertex values or edge values.
 *
 * @param vertexKey the type of the vertex IDs
 * @return a Graph where the vertex IDs are read from the edges input file.
 */
public <K> Graph<K, NullValue, NullValue> keyType(Class<K> vertexKey) {
    if (edgeReader == null) {
        throw new RuntimeException("The edge input file cannot be null!");
    }

    DataSet<Edge<K, NullValue>> edges = edgeReader
        .types(vertexKey, vertexKey)
        .name(GraphCsvReader.class.getName())
        .map(new Tuple2ToEdgeMap<>())
        .name("Type conversion");

    return Graph.fromDataSet(edges, executionContext);
}
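// Usage sketch (not from the original sources; the CSV path is illustrative):
// both readers above are reached through Graph.fromCsvReader(...).
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// Vertex IDs only, no vertex or edge values.
Graph<Long, NullValue, NullValue> plain = Graph
    .fromCsvReader("/path/to/edges.csv", env)
    .keyType(Long.class);

// Edge values (here Double), but no vertex values.
Graph<Long, NullValue, Double> weighted = Graph
    .fromCsvReader("/path/to/edges.csv", env)
    .edgeTypes(Long.class, Double.class);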
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
    return execEnv
        .createInput(new HBaseRowInputFormat(conf, tableName, hBaseSchema), getReturnType())
        .name(explainSource());
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
    Preconditions.checkState(vertexCount >= 0);

    // Vertices
    DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

    // Edges
    DataSource<Edge<LongValue, NullValue>> edges = env
        .fromCollection(Collections.<Edge<LongValue, NullValue>>emptyList(),
            TypeInformation.of(new TypeHint<Edge<LongValue, NullValue>>(){}))
        .setParallelism(parallelism)
        .name("Empty edge set");

    // Graph
    return Graph.fromDataSet(vertices, edges, env);
}
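// Usage sketch (assumes the method above lives in Gelly's EmptyGraph generator):
// an empty graph with five vertices and no edges.
Graph<LongValue, NullValue, NullValue> empty = new EmptyGraph(env, 5).generate();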
@SuppressWarnings("unchecked") private <T extends Tuple> void createCsvSource(ExecutionEnvironment env, PythonOperationInfo info) { if (!(info.types instanceof TupleTypeInfo)) { throw new RuntimeException("The output type of a csv source has to be a tuple. The derived type is " + info); } Path path = new Path(info.path); String lineD = info.lineDelimiter; String fieldD = info.fieldDelimiter; TupleTypeInfo<T> types = (TupleTypeInfo<T>) info.types; sets.add(info.setID, env.createInput(new TupleCsvInputFormat<>(path, lineD, fieldD, types), types).setParallelism(info.parallelism).name("CsvSource") .map(new SerializerMap<T>()).setParallelism(info.parallelism).name("CsvSourcePostStep")); }
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
    OrcRowInputFormat orcIF = buildOrcInputFormat();
    orcIF.setNestedFileEnumeration(recursiveEnumeration);
    if (selectedFields != null) {
        orcIF.selectFields(selectedFields);
    }
    if (predicates != null) {
        for (OrcRowInputFormat.Predicate pred : predicates) {
            orcIF.addPredicate(pred);
        }
    }
    return execEnv.createInput(orcIF).name(explainSource());
}
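// Usage sketch (assumes the flink-orc OrcTableSource builder; the path and
// schema strings are illustrative): fields selected and predicates added on
// the source are pushed into the OrcRowInputFormat by the method above.
OrcTableSource orcSource = OrcTableSource.builder()
    .path("/path/to/data.orc")
    .forOrcSchema("struct<name:string,age:int>")
    .build();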
private void createValueSource(ExecutionEnvironment env, PythonOperationInfo info) {
    sets.add(info.setID, env.fromCollection(info.values)
        .setParallelism(info.parallelism).name("ValueSource")
        .map(new SerializerMap<>())
        .setParallelism(info.parallelism).name("ValueSourcePostStep"));
}
private void createTextSource(ExecutionEnvironment env, PythonOperationInfo info) {
    sets.add(info.setID, env.readTextFile(info.path)
        .setParallelism(info.parallelism).name("TextSource")
        .map(new SerializerMap<String>())
        .setParallelism(info.parallelism).name("TextSourcePostStep"));
}
private void createSequenceSource(ExecutionEnvironment env, PythonOperationInfo info) {
    sets.add(info.setID, env.generateSequence(info.frm, info.to)
        .setParallelism(info.parallelism).name("SequenceSource")
        .map(new SerializerMap<Long>())
        .setParallelism(info.parallelism).name("SequenceSourcePostStep"));
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
    Preconditions.checkState(!dimensions.isEmpty(), "No dimensions added to GridGraph");

    // Vertices
    DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

    // Edges
    LongValueSequenceIterator iterator = new LongValueSequenceIterator(0, this.vertexCount - 1);

    DataSet<Edge<LongValue, NullValue>> edges = env
        .fromParallelCollection(iterator, LongValue.class)
        .setParallelism(parallelism)
        .name("Edge iterators")
        .flatMap(new LinkVertexToNeighbors(vertexCount, dimensions))
        .setParallelism(parallelism)
        .name("Grid graph edges");

    // Graph
    return Graph.fromDataSet(vertices, edges, env);
}
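// Usage sketch (Gelly GridGraph API): a 2 x 4 grid; each addDimension(size,
// wrapEndpoints) call adds one axis, and wrapping the endpoints of an axis
// closes it into a cycle.
Graph<LongValue, NullValue, NullValue> grid = new GridGraph(env)
    .addDimension(2, false)
    .addDimension(4, false)
    .generate();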
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
    Preconditions.checkState(vertexCount >= 2);

    // Vertices
    DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

    // Edges
    LongValueSequenceIterator iterator = new LongValueSequenceIterator(1, this.vertexCount - 1);

    DataSet<Edge<LongValue, NullValue>> edges = env
        .fromParallelCollection(iterator, LongValue.class)
        .setParallelism(parallelism)
        .name("Edge iterators")
        .flatMap(new LinkVertexToCenter())
        .setParallelism(parallelism)
        .name("Star graph edges");

    // Graph
    return Graph.fromDataSet(vertices, edges, env);
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
    Preconditions.checkState(vertexPairCount > 0);

    // Vertices
    long vertexCount = 2 * vertexPairCount;

    DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

    // Edges
    LongValueSequenceIterator iterator = new LongValueSequenceIterator(0, vertexCount - 1);

    DataSet<Edge<LongValue, NullValue>> edges = env
        .fromParallelCollection(iterator, LongValue.class)
        .setParallelism(parallelism)
        .name("Edge iterators")
        .map(new LinkVertexToSingletonNeighbor())
        .setParallelism(parallelism)
        .name("Singleton edge graph edges");

    // Graph
    return Graph.fromDataSet(vertices, edges, env);
}
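// Usage sketch for the two generators above (Gelly StarGraph and
// SingletonEdgeGraph): the star links every vertex to vertex 0, while the
// singleton-edge graph pairs its vertices into disjoint two-vertex components.
Graph<LongValue, NullValue, NullValue> star = new StarGraph(env, 6).generate();
Graph<LongValue, NullValue, NullValue> pairs = new SingletonEdgeGraph(env, 3).generate();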
private Plan getTestPlanRightStatic(String strategy) {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSet<Tuple3<Long, Long, Long>> bigInput = env.readCsvFile("file://bigFile")
        .types(Long.class, Long.class, Long.class).name("bigFile");
    DataSet<Tuple3<Long, Long, Long>> smallInput = env.readCsvFile("file://smallFile")
        .types(Long.class, Long.class, Long.class).name("smallFile");

    IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    Configuration joinStrategy = new Configuration();
    joinStrategy.setString(Optimizer.HINT_SHIP_STRATEGY, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_HASH);

    if (!strategy.equals("")) {
        joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);
    }

    DataSet<Tuple3<Long, Long, Long>> inner = iteration.join(smallInput)
        .where(0).equalTo(0)
        .with(new DummyJoiner()).name("DummyJoiner")
        .withParameters(joinStrategy);

    DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);
    output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
public static void connectedComponentsWithCoGroup(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(Integer.parseInt(args[0]));

    DataSet<Tuple1<Long>> initialVertices = env.readCsvFile(args[1]).types(Long.class).name(VERTEX_SOURCE);
    DataSet<Tuple2<Long, Long>> edges = env.readCsvFile(args[2]).types(Long.class, Long.class).name(EDGES_SOURCE);

    DataSet<Tuple2<Long, Long>> verticesWithId = initialVertices.flatMap(new DummyMapFunction());

    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithId
        .iterateDelta(verticesWithId, Integer.parseInt(args[4]), 0).name(ITERATION_NAME);

    DataSet<Tuple2<Long, Long>> joinWithNeighbors = iteration.getWorkset().join(edges)
        .where(0).equalTo(0)
        .with(new DummyJoinFunction()).name(JOIN_NEIGHBORS_MATCH);

    DataSet<Tuple2<Long, Long>> minAndUpdate = joinWithNeighbors.coGroup(iteration.getSolutionSet())
        .where(0).equalTo(0)
        .with(new DummyCoGroupFunction()).name(MIN_ID_AND_UPDATE);

    iteration.closeWith(minAndUpdate, minAndUpdate).writeAsCsv(args[3]).name(SINK);

    env.execute();
}
@Test
public void testBCVariableClosure() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.readTextFile(IN_FILE).name("source1");

    DataSet<String> reduced = input
        .map(new IdentityMapper<String>())
        .reduceGroup(new Top1GroupReducer<String>());

    DataSet<String> initialSolution = input.map(new IdentityMapper<String>()).withBroadcastSet(reduced, "bc");

    IterativeDataSet<String> iteration = initialSolution.iterate(100);

    iteration.closeWith(iteration.map(new IdentityMapper<String>()).withBroadcastSet(reduced, "red"))
        .output(new DiscardingOutputFormat<String>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
private Plan getTestPlanLeftStatic(String strategy) {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> bigInput = env.fromElements(
        new Tuple3<Long, Long, Long>(1L, 2L, 3L),
        new Tuple3<Long, Long, Long>(1L, 2L, 3L),
        new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Big");

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> smallInput = env.fromElements(
        new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Small");

    IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    Configuration joinStrategy = new Configuration();
    joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);

    DataSet<Tuple3<Long, Long, Long>> inner = smallInput.join(iteration)
        .where(0).equalTo(0)
        .with(new DummyJoiner()).name("DummyJoiner")
        .withParameters(joinStrategy);

    DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);
    output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<String> initialInput = env.fromElements("1", "2", "3", "4", "5").name("input");

    IterativeDataSet<String> iteration = initialInput.iterate(5).name("Loop");

    DataSet<String> sumReduce = iteration.reduceGroup(new SumReducer()).name("Compute sum (GroupReduce)");
    DataSet<String> terminationFilter = iteration.filter(new TerminationFilter()).name("Compute termination criterion (Filter)");

    List<String> result = iteration.closeWith(sumReduce, terminationFilter).collect();

    containsResultAsText(result, EXPECTED);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<String> initialInput = env.fromElements("1", "2", "3", "4", "5").name("input");

    IterativeDataSet<String> iteration = initialInput.iterate(5).name("Loop");

    DataSet<String> sumReduce = iteration.reduceGroup(new SumReducer()).name("Compute sum (GroupReduce)");
    DataSet<String> terminationFilter = sumReduce.filter(new TerminationFilter()).name("Compute termination criterion (Filter)");

    List<String> result = iteration.closeWith(sumReduce, terminationFilter).collect();

    containsResultAsText(result, EXPECTED);
}
public static void tcph3(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(Integer.parseInt(args[0]));

    // order id, order status, order date, order prio, ship prio
    DataSet<Tuple5<Long, String, String, String, Integer>> orders = env.readCsvFile(args[1])
        .fieldDelimiter("|").lineDelimiter("\n")
        .includeFields("101011001").types(Long.class, String.class, String.class, String.class, Integer.class)
        .name(ORDERS);

    // order id, extended price
    DataSet<Tuple2<Long, Double>> lineItems = env.readCsvFile(args[2])
        .fieldDelimiter("|").lineDelimiter("\n")
        .includeFields("100001").types(Long.class, Double.class)
        .name(LINEITEM);

    DataSet<Tuple2<Long, Integer>> filterO = orders.flatMap(new FilterO()).name(MAPPER_NAME);

    DataSet<Tuple3<Long, Integer, Double>> joinLiO = filterO.join(lineItems)
        .where(0).equalTo(0).with(new JoinLiO()).name(JOIN_NAME);

    DataSet<Tuple3<Long, Integer, Double>> aggLiO = joinLiO.groupBy(0, 1).reduceGroup(new AggLiO()).name(REDUCE_NAME);

    aggLiO.writeAsCsv(args[3], "\n", "|").name(SINK);

    env.execute();
}
@Test
public void testMultipleIterationsWithClosureBCVars() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(100);

    DataSet<String> input = env.readTextFile(IN_FILE).name("source1");

    IterativeDataSet<String> iteration1 = input.iterate(100);
    IterativeDataSet<String> iteration2 = input.iterate(20);
    IterativeDataSet<String> iteration3 = input.iterate(17);

    iteration1.closeWith(iteration1.map(new IdentityMapper<String>()))
        .output(new DiscardingOutputFormat<String>());
    iteration2.closeWith(iteration2.reduceGroup(new Top1GroupReducer<String>()))
        .output(new DiscardingOutputFormat<String>());
    iteration3.closeWith(iteration3.reduceGroup(new IdentityGroupReducer<String>()))
        .output(new DiscardingOutputFormat<String>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}