/**
 * Provides the vertex data set: read from the user-supplied CSV file when one was
 * given, otherwise the packaged sample data.
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env) {
	// No input file configured: fall back to the built-in sample vertices.
	if (!fileOutput) {
		return ConnectedComponentsData.getDefaultVertexDataSet(env);
	}
	// CSV rows parse as Tuple1<Long>; unwrap each tuple into its plain Long vertex id.
	return env.readCsvFile(verticesPath).types(Long.class)
			.map(new MapFunction<Tuple1<Long>, Long>() {
				public Long map(Tuple1<Long> tuple) {
					return tuple.f0;
				}
			});
}
/**
 * Generic method to create an input DataSet with the given {@link InputFormat}. The DataSet will not be
 * immediately created - instead, this method returns a DataSet that will be lazily created from
 * the input format once the program is executed.
 * <p>
 * The data set is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link ResultTypeQueryable} interface.
 *
 * @param inputFormat The input format used to create the data set.
 * @param producedType The TypeInformation describing the elements produced by the input format.
 * @return A DataSet that represents the data created by the input format.
 *
 * @throws IllegalArgumentException If the input format or the produced type information is null.
 *
 * @see #createInput(InputFormat)
 */
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (producedType == null) {
		throw new IllegalArgumentException("Produced type information must not be null.");
	}
	return new DataSource<X>(this, inputFormat, producedType);
}
/**
 * Translates this data source into its common-API operator representation.
 * The operator name is the explicitly set name if present, otherwise derived
 * from the input format's toString().
 */
protected GenericDataSourceBase<OUT, ?> translateToDataFlow() {
	// Prefer an explicit user-set name; fall back to the input format's description.
	String name = this.name != null ? this.name : this.inputFormat.toString();
	// Cap the operator name length so derived names stay readable in logs/plans.
	if (name.length() > 100) {
		name = name.substring(0, 100);
	}
	// Raw-type construction is intentional here; generics are erased at this layer.
	@SuppressWarnings({ "unchecked", "rawtypes" })
	GenericDataSourceBase<OUT, ?> source = new GenericDataSourceBase(this.inputFormat, new OperatorInformation<OUT>(getType()), name);
	// Propagate the API-level degree of parallelism to the runtime operator.
	source.setDegreeOfParallelism(dop);
	return source;
}
}
.map(new MapFunction<Long, Tuple2<Long, Double>>() { public Tuple2<Long, Double> map(Long value) { return new Tuple2<Long, Double>(value, 1.0/numVertices); .flatMap(new FlatMapFunction<Long, Tuple3<Long, Long, Double>>() { public void flatMap(Long value, Collector<Tuple3<Long, Long, Double>> out) { int numOutEdges = (int) (Math.random() * (numVertices / 2));
dataFlowOp = ((DataSource<T>) dataSet).translateToDataFlow();
.flatMap(new FlatMapFunction<Long, Tuple3<Long, Long, Double>>() { public void flatMap(Long value, Collector<Tuple3<Long, Long, Double>> out) { int numOutEdges = (int) (Math.random() * (NUM_VERTICES / 2));
/**
 * Provides the vertex data set, either parsed from the vertices CSV file or,
 * when no file was given, from the built-in sample data.
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env) {
	if(fileOutput) {
		// CSV rows are parsed as Tuple1<Long>; the map unwraps them into plain Long vertex ids.
		return env.readCsvFile(verticesPath).types(Long.class)
				.map( new MapFunction<Tuple1<Long>, Long>() {
					public Long map(Tuple1<Long> value) {
						return value.f0;
					}
				});
	} else {
		return ConnectedComponentsData.getDefaultVertexDataSet(env);
	}
}
/**
 * Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the
 * framework to create a parallel data source that returns the elements in the iterator.
 * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
 * execution environment may ship the elements into the cluster.
 * <p>
 * Because the iterator will remain unmodified until the actual execution happens, the type of data
 * returned by the iterator must be given explicitly in the form of the type information.
 * This method is useful for cases where the type is generic. In that case, the type class
 * (as given in {@link #fromParallelCollection(SplittableIterator, Class)}) does not supply all type information.
 *
 * @param iterator The iterator that produces the elements of the data set.
 * @param type The TypeInformation for the produced data set.
 * @return A DataSet representing the elements in the iterator.
 *
 * @see #fromParallelCollection(SplittableIterator, Class)
 */
public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, TypeInformation<X> type) {
	return new DataSource<X>(this, new ParallelIteratorInputFormat<X>(iterator), type);
}
/**
 * Provides the edge data set: parsed from the edge CSV file when one was given,
 * otherwise the packaged sample edges.
 */
private static DataSet<Edge> getEdgeDataSet(ExecutionEnvironment env) {
	// Without an input file, run on the built-in sample data.
	if (!fileOutput) {
		return EnumTrianglesData.getDefaultEdgeDataSet(env);
	}
	// Read space-delimited integer pairs and convert each tuple into an Edge.
	return env.readCsvFile(edgePath)
			.fieldDelimiter(' ')
			.includeFields(true, true)
			.types(Integer.class, Integer.class)
			.map(new TupleEdgeConverter());
}
/**
 * Creates a DataSet from the given iterator. Because the iterator will remain unmodified until
 * the actual execution happens, the type of data returned by the iterator must be given
 * explicitly in the form of the type information. This method is useful for cases where the type
 * is generic. In that case, the type class (as given in {@link #fromCollection(Iterator, Class)})
 * does not supply all type information.
 * <p>
 * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
 * framework may move it to a remote environment, if needed.
 * <p>
 * Note that this operation will result in a non-parallel data source, i.e. a data source with
 * a degree of parallelism of one.
 *
 * @param data The iterator that produces the elements of the data set.
 * @param type The TypeInformation for the produced data set.
 * @return A DataSet representing the elements in the iterator.
 *
 * @throws IllegalArgumentException If the iterator is not serializable.
 *
 * @see #fromCollection(Iterator, Class)
 */
public <X> DataSource<X> fromCollection(Iterator<X> data, TypeInformation<X> type) {
	// Fail fast: a non-serializable iterator could not be shipped to the cluster at execution time.
	if (!(data instanceof Serializable)) {
		throw new IllegalArgumentException("The iterator must be serializable.");
	}
	return new DataSource<X>(this, new IteratorInputFormat<X>(data), type);
}
private static DataSet<Point> getPointDataSet(ExecutionEnvironment env) { if(fileOutput) { // read points from CSV file return env.readCsvFile(pointsPath) .fieldDelimiter(' ') .includeFields(true, true) .types(Double.class, Double.class) .map(new TuplePointConverter()); } else { return KMeansData.getDefaultPointDataSet(env); } }
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * The {@link java.nio.charset.Charset} with the given name will be used to read the files.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 * @param charsetName The name of the character set used to read the file.
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath, String charsetName) {
	Validate.notNull(filePath, "The file path may not be null.");
	// Fix: also reject a null charset name up front, consistent with the file-path check,
	// instead of failing later inside the input format.
	Validate.notNull(charsetName, "The charset name may not be null.");
	
	TextInputFormat format = new TextInputFormat(new Path(filePath));
	format.setCharsetName(charsetName);
	return new DataSource<String>(this, format, BasicTypeInfo.STRING_TYPE_INFO);
}
/**
 * Provides the edge data set, either parsed from the edge CSV file or, when no
 * file was given, from the built-in sample data.
 */
private static DataSet<Edge> getEdgeDataSet(ExecutionEnvironment env) {
	if(fileOutput) {
		// Read space-delimited integer pairs; the map converts each tuple into an Edge.
		return env.readCsvFile(edgePath)
				.fieldDelimiter(' ')
				.includeFields(true, true)
				.types(Integer.class, Integer.class)
				.map(new TupleEdgeConverter());
	} else {
		return EnumTrianglesData.getDefaultEdgeDataSet(env);
	}
}
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 1-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0> DataSource<Tuple1<T0>> types(Class<T0> type0) {
	// Tuple type info describing the produced 1-tuples.
	TupleTypeInfo<Tuple1<T0>> tupleType = TupleTypeInfo.getBasicTupleTypeInfo(type0);
	// Input format over the reader's path, configured with the field type.
	CsvInputFormat<Tuple1<T0>> format = new CsvInputFormat<Tuple1<T0>>(path);
	configureInputFormat(format, type0);
	return new DataSource<Tuple1<T0>>(executionContext, format, tupleType);
}
/**
 * Provides the edge data set, either parsed from the edge CSV file or, when no
 * file was given, from the built-in sample data.
 */
private static DataSet<Edge> getEdgeDataSet(ExecutionEnvironment env) {
	if(fileOutput) {
		// Read space-delimited integer pairs; the map converts each tuple into an Edge.
		return env.readCsvFile(edgePath)
				.fieldDelimiter(' ')
				.includeFields(true, true)
				.types(Integer.class, Integer.class)
				.map(new TupleEdgeConverter());
	} else {
		return EnumTrianglesData.getDefaultEdgeDataSet(env);
	}
}
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 2-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1> DataSource<Tuple2<T0, T1>> types(Class<T0> type0, Class<T1> type1) {
	// Tuple type info describing the produced 2-tuples.
	TupleTypeInfo<Tuple2<T0, T1>> tupleType = TupleTypeInfo.getBasicTupleTypeInfo(type0, type1);
	// Input format over the reader's path, configured with the field types.
	CsvInputFormat<Tuple2<T0, T1>> format = new CsvInputFormat<Tuple2<T0, T1>>(path);
	configureInputFormat(format, type0, type1);
	return new DataSource<Tuple2<T0, T1>>(executionContext, format, tupleType);
}
/**
 * Provides the edge data set, either parsed from the edge CSV file or, when no
 * file was given, from the built-in sample data.
 */
private static DataSet<Edge> getEdgeDataSet(ExecutionEnvironment env) {
	if(fileOutput) {
		// Read space-delimited integer pairs; the map converts each tuple into an Edge.
		return env.readCsvFile(edgePath)
				.fieldDelimiter(' ')
				.includeFields(true, true)
				.types(Integer.class, Integer.class)
				.map(new TupleEdgeConverter());
	} else {
		return EnumTrianglesData.getDefaultEdgeDataSet(env);
	}
}
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 5-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4> DataSource<Tuple5<T0, T1, T2, T3, T4>> types(Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4) {
	// Tuple type info describing the produced 5-tuples.
	TupleTypeInfo<Tuple5<T0, T1, T2, T3, T4>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4);
	// Input format over the reader's path, configured with the field types.
	CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>> format = new CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>>(path);
	configureInputFormat(format, type0, type1, type2, type3, type4);
	return new DataSource<Tuple5<T0, T1, T2, T3, T4>>(executionContext, format, tupleType);
}
/**
 * Provides the centroid data set: read from the centers CSV file when one was
 * given, otherwise the packaged sample centroids.
 */
private static DataSet<Centroid> getCentroidDataSet(ExecutionEnvironment env) {
	// Without an input file, run on the built-in sample data.
	if (!fileOutput) {
		return KMeansData.getDefaultCentroidDataSet(env);
	}
	// Read space-delimited (id, x, y) rows and convert each tuple into a Centroid.
	return env.readCsvFile(centersPath)
			.fieldDelimiter(' ')
			.includeFields(true, true, true)
			.types(Integer.class, Double.class, Double.class)
			.map(new TupleCentroidConverter());
}
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * The file will be read with the system's default character set.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath) {
	Validate.notNull(filePath, "The file path may not be null.");
	// Line-wise text input; each line becomes one String element.
	TextInputFormat format = new TextInputFormat(new Path(filePath));
	return new DataSource<String>(this, format, BasicTypeInfo.STRING_TYPE_INFO);
}