private void createTextSource(ExecutionEnvironment env, PythonOperationInfo info) {
	// Read the text file, then serialize each line for hand-off to the Python side.
	// Both stages run at the parallelism requested by the Python plan.
	sets.add(info.setID,
		env.readTextFile(info.path)
			.setParallelism(info.parallelism)
			.name("TextSource")
			.map(new SerializerMap<String>())
			.setParallelism(info.parallelism)
			.name("TextSourcePostStep"));
}
/**
 * Generic method to create an input DataSet with the given {@link InputFormat}. The {@link DataSet} will not be
 * immediately created - instead, this method returns a {@link DataSet} that will be lazily created from
 * the input format once the program is executed.
 *
 * <p>The {@link DataSet} is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link ResultTypeQueryable} interface.
 *
 * @param inputFormat The input format used to create the data set. Must not be null.
 * @param producedType The type information characterizing the type produced by the input format. Must not be null.
 * @return A {@link DataSet} that represents the data created by the input format.
 *
 * @see #createInput(InputFormat)
 */
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (producedType == null) {
		throw new IllegalArgumentException("Produced type information must not be null.");
	}
	return new DataSource<>(this, inputFormat, producedType, Utils.getCallLocationName());
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
	// Vertices: the id sequence [0, vertexCount).
	DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

	// Edges: one iterator element per source vertex id.
	LongValueSequenceIterator iterator = new LongValueSequenceIterator(0, this.vertexCount - 1);

	// Validate ranges: sort, then verify that adjacent ranges do not overlap.
	// NOTE(review): Collections.sort mutates the offsetRanges field in place, and
	// iter.next() below assumes at least one range was configured -- an empty list
	// would throw NoSuchElementException here; confirm callers guarantee non-empty.
	Collections.sort(offsetRanges);
	Iterator<OffsetRange> iter = offsetRanges.iterator();
	OffsetRange lastRange = iter.next();
	while (iter.hasNext()) {
		OffsetRange nextRange = iter.next();
		if (lastRange.overlaps(nextRange)) {
			throw new IllegalArgumentException("Overlapping ranges " + lastRange + " and " + nextRange);
		}
		lastRange = nextRange;
	}

	// For each source vertex, LinkVertexToOffsets emits the edges implied by the
	// configured offset ranges.
	DataSet<Edge<LongValue, NullValue>> edges = env
		.fromParallelCollection(iterator, LongValue.class)
			.setParallelism(parallelism)
			.name("Edge iterators")
		.flatMap(new LinkVertexToOffsets(vertexCount, offsetRanges))
			.setParallelism(parallelism)
			.name("Circulant graph edges");

	// Graph
	return Graph.fromDataSet(vertices, edges, env);
}
/**
 * Creates a Graph from CSV input without vertex values or edge values.
 *
 * @param vertexKey the type of the vertex IDs
 * @param <K> the vertex ID type
 * @return a Graph where the vertex IDs are read from the edges input file.
 * @throws IllegalStateException if no edge input file was configured
 */
public <K> Graph<K, NullValue, NullValue> keyType(Class<K> vertexKey) {
	if (edgeReader == null) {
		// IllegalStateException is more precise than a raw RuntimeException for a
		// missing-configuration failure; it is a RuntimeException subclass, so
		// existing callers catching RuntimeException are unaffected.
		throw new IllegalStateException("The edge input file cannot be null!");
	}

	// Read (source, target) pairs and convert each Tuple2 into an Edge with no value.
	DataSet<Edge<K, NullValue>> edges = edgeReader
		.types(vertexKey, vertexKey)
			.name(GraphCsvReader.class.getName())
		.map(new Tuple2ToEdgeMap<>())
			.name("Type conversion");

	return Graph.fromDataSet(edges, executionContext);
}
@Override public Graph<LongValue, NullValue, NullValue> generate() { Preconditions.checkState(vertexCount >= 0); // Vertices DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount); // Edges DataSource<Edge<LongValue, NullValue>> edges = env .fromCollection(Collections.<Edge<LongValue, NullValue>>emptyList(), TypeInformation.of(new TypeHint<Edge<LongValue, NullValue>>(){})) .setParallelism(parallelism) .name("Empty edge set"); // Graph return Graph.fromDataSet(vertices, edges, env); } }
public static void main(String[] args) throws Exception { final long numSamples = args.length > 0 ? Long.parseLong(args[0]) : 1000000; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // count how many of the samples would randomly fall into // the unit circle DataSet<Long> count = env.generateSequence(1, numSamples) .map(new Sampler()) .reduce(new SumReducer()); long theCount = count.collect().get(0); System.out.println("We estimate Pi to be: " + (theCount * 4.0 / numSamples)); }
/**
 * Creates a Graph from CSV input with edge values, but without vertex values.
 *
 * @param vertexKey the type of the vertex IDs
 * @param edgeValue the type of the edge values
 * @param <K> the vertex ID type
 * @param <EV> the edge value type
 * @return a Graph where the edges are read from an edges CSV file (with values).
 * @throws IllegalStateException if no edge input file was configured
 */
public <K, EV> Graph<K, NullValue, EV> edgeTypes(Class<K> vertexKey, Class<EV> edgeValue) {
	if (edgeReader == null) {
		// IllegalStateException is more precise than a raw RuntimeException for a
		// missing-configuration failure; it is a RuntimeException subclass, so
		// existing callers catching RuntimeException are unaffected.
		throw new IllegalStateException("The edge input file cannot be null!");
	}

	// Read (source, target, value) triples directly; Graph.fromTupleDataSet
	// performs the conversion to edges.
	DataSet<Tuple3<K, K, EV>> edges = edgeReader
		.types(vertexKey, vertexKey, edgeValue)
			.name(GraphCsvReader.class.getName());

	return Graph.fromTupleDataSet(edges, executionContext);
}
.generateSequence(0, 1).setParallelism(p * 2) .map(new IdentityMapper<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Map1") .groupBy("*").reduceGroup(new IdentityGroupReducer<Long>())
@Override protected void testProgram() throws Exception { // set up execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // read vertex and edge data DataSet<Long> vertices = env.fromElements(ConnectedComponentsData.getEnumeratingVertices(NUM_VERTICES).split("\n")) .map(new VertexParser()); DataSet<Tuple2<Long, Long>> edges = env.fromElements(ConnectedComponentsData.getRandomOddEvenEdges(NUM_EDGES, NUM_VERTICES, SEED).split("\n")) .flatMap(new EdgeParser()); // assign the initial components (equal to the vertex id) DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>()); // open a delta iteration DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithInitialId.iterateDelta(verticesWithInitialId, 100, 0); // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller DataSet<Tuple2<Long, Long>> changes = iteration .getWorkset().join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin()) .coGroup(iteration.getSolutionSet()).where(0).equalTo(0) .with(new MinIdAndUpdate()); // close the delta iteration (delta and new workset are identical) DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes); // emit result List<Tuple2<Long, Long>> resutTuples = new ArrayList<>(); result.output(new LocalCollectionOutputFormat<>(resutTuples)); env.execute(); }
@SuppressWarnings("unchecked")
private static DataSet<Tuple3<Double, StringValue, LongValue>> getSourceDataSet(ExecutionEnvironment env) {
	// Single fixed record, produced by a single source task.
	Tuple3<Double, StringValue, LongValue> record =
		new Tuple3<Double, StringValue, LongValue>(3.141592, new StringValue("foobar"), new LongValue(77));
	return env.fromElements(record).setParallelism(1);
}
private Plan getWordCountPlan(File inFile, File outFile, int parallelism) {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);

	// Classic word count: tokenize lines, group by the word (field 0),
	// sum the counts (field 1), and write the result as CSV.
	env.readTextFile(inFile.getAbsolutePath())
		.flatMap(new Tokenizer())
		.groupBy(0)
		.sum(1)
		.writeAsCsv(outFile.getAbsolutePath());

	return env.createProgramPlan();
}
}
@Test
public void testCustomPartitioningTupleInvalidType() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Pojo2> data = env.fromElements(new Pojo2())
			.rebalance().setParallelism(4);

		try {
			// TestPartitionerLong partitions on Long; field "a" presumably has an
			// incompatible key type -- TODO confirm against Pojo2's declaration.
			data.groupBy("a").withPartitioner(new TestPartitionerLong());
			fail("Should throw an exception");
		}
		catch (InvalidProgramException e) {} // expected: incompatible partitioner is rejected at graph-build time
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
	// Number of bits needed to address the vertex id space, i.e.
	// ceil(log2(vertexCount)) for vertexCount > 1.
	int scale = Long.SIZE - Long.numberOfLeadingZeros(vertexCount - 1);

	// Edges
	// Budget of random cycles consumed per generated edge; with noise enabled each
	// edge requires additional random draws, hence the 5x factor -- NOTE(review):
	// confirm the factor matches GenerateEdges' per-edge consumption.
	int cyclesPerEdge = noiseEnabled ? 5 * scale : scale;

	// Pre-partitioned random generator blocks, one stream of randomness per block.
	List<BlockInfo<T>> generatorBlocks = randomGenerableFactory
		.getRandomGenerables(edgeCount, cyclesPerEdge);

	// Distribute the generator blocks across the cluster, then expand each block
	// into its share of the RMat edges.
	DataSet<Edge<LongValue, NullValue>> edges = env
		.fromCollection(generatorBlocks)
			.name("Random generators")
		.rebalance()
			.setParallelism(parallelism)
			.name("Rebalance")
		.flatMap(new GenerateEdges<>(vertexCount, scale, a, b, c, noiseEnabled, noise))
			.setParallelism(parallelism)
			.name("RMat graph edges");

	// Vertices are derived from the generated edge set via vertexSet -- presumably
	// only ids that appear in some edge are emitted; verify against
	// GraphGeneratorUtils.vertexSet if isolated vertices matter.
	DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSet(edges, parallelism);

	// Graph
	return Graph.fromDataSet(vertices, edges, env);
}
/**
 * Reads the vertex data set either from the CSV file given by {@code --vertices}
 * or, when absent, from the bundled default data.
 *
 * @param env the execution environment used to create the source
 * @param params program parameters, optionally carrying the "vertices" path
 * @return a data set of vertex ids
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env, ParameterTool params) {
	if (params.has("vertices")) {
		// Each CSV record is a single Long; unwrap the Tuple1 to a bare id.
		return env.readCsvFile(params.get("vertices")).types(Long.class).map(
			new MapFunction<Tuple1<Long>, Long>() {
				@Override // was missing: always annotate interface implementations
				public Long map(Tuple1<Long> value) {
					return value.f0;
				}
			});
	} else {
		System.out.println("Executing Connected Components example with default vertices data set.");
		System.out.println("Use --vertices to specify file input.");
		return ConnectedComponentsData.getDefaultVertexDataSet(env);
	}
}
// Single-task source: read the points with parallelism 1 -- presumably to get a
// deterministic, non-split read of the input; confirm against the surrounding test.
env.readFile(new PointInFormat(), dataPath).setParallelism(1).name("Input");
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	// Build the HBase scan source for the configured table and schema, typed to
	// this table source's return type and labeled with its description.
	HBaseRowInputFormat inputFormat = new HBaseRowInputFormat(conf, tableName, hBaseSchema);
	return execEnv.createInput(inputFormat, getReturnType()).name(explainSource());
}
@SuppressWarnings("unchecked")
private static DataSet<Tuple3<Double, StringValue, LongValue>> getSourceDataSet(ExecutionEnvironment env, int parallelism) {
	// One fixed record suffices; the interesting knob here is the parallelism.
	Tuple3<Double, StringValue, LongValue> element = new Tuple3<>(0.0, new StringValue(""), new LongValue(1L));
	return env.fromElements(element).setParallelism(parallelism);
}
}
@Test
public void testBatchDistributedCache() throws Exception {
	// Stage a temp file and register it under the name the UDF will look up.
	String textPath = createTempFile("count.txt", DATA);

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.registerCachedFile(textPath, "cache_test");
	// count() triggers eager execution; WordChecker presumably validates the cached
	// file contents against the streamed input -- confirm in its implementation.
	env.readTextFile(textPath).flatMap(new WordChecker()).count();
}
@Test
public void testCustomPartitioningTupleInvalidType() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Tuple2<Integer, Integer>> data = env.fromElements(new Tuple2<Integer, Integer>(0, 0))
			.rebalance().setParallelism(4);

		try {
			// The key at position 0 is an Integer, but TestPartitionerLong partitions
			// on Long -- the type mismatch must be rejected.
			data.groupBy(0).withPartitioner(new TestPartitionerLong());
			fail("Should throw an exception");
		}
		catch (InvalidProgramException e) {} // expected: incompatible partitioner is rejected at graph-build time
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
private void createSequenceSource(ExecutionEnvironment env, PythonOperationInfo info) {
	// Generate the sequence [frm, to], then serialize each value for hand-off to
	// the Python side. Both stages run at the requested parallelism.
	sets.add(info.setID,
		env.generateSequence(info.frm, info.to)
			.setParallelism(info.parallelism)
			.name("SequenceSource")
			.map(new SerializerMap<Long>())
			.setParallelism(info.parallelism)
			.name("SequenceSourcePostStep"));
}