/**
 * Generic method to create an input DataSet with an {@link InputFormat}. The DataSet is not
 * created immediately - instead, this method returns a DataSet that will be lazily created from
 * the input format once the program is executed.
 * <p>
 * The data set is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link ResultTypeQueryable} interface.
 *
 * @param inputFormat The input format used to create the data set.
 * @param producedType The type information describing the elements produced by the input format.
 * @return A DataSet that represents the data created by the input format.
 * @throws IllegalArgumentException If the input format or the produced type information is null.
 *
 * @see #createInput(InputFormat)
 */
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
	// The input format is validated first, so it is the one reported when both arguments are null.
	if (null == inputFormat) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (null == producedType) {
		throw new IllegalArgumentException("Produced type information must not be null.");
	}

	final DataSource<X> source = new DataSource<X>(this, inputFormat, producedType);
	return source;
}
/**
 * Creates a new data set that contains the elements in the iterator. The iterator is splittable,
 * allowing the framework to create a parallel data source that returns the elements in the iterator.
 * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
 * execution environment may ship the elements into the cluster.
 * <p>
 * Because the iterator will remain unmodified until the actual execution happens, the type of data
 * returned by the iterator must be given explicitly in the form of the type information.
 * This method is useful for cases where the type is generic. In that case, the type class
 * (as given in {@link #fromParallelCollection(SplittableIterator, Class)}) does not supply all
 * type information.
 *
 * @param iterator The iterator that produces the elements of the data set.
 * @param type The TypeInformation for the produced data set.
 * @return A DataSet representing the elements in the iterator.
 *
 * @see #fromParallelCollection(SplittableIterator, Class)
 */
public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, TypeInformation<X> type) {
	// Wrap the splittable iterator in the input format that serves as the source's input.
	final ParallelIteratorInputFormat<X> iteratorFormat = new ParallelIteratorInputFormat<X>(iterator);
	return new DataSource<X>(this, iteratorFormat, type);
}
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * The {@link java.nio.charset.Charset} with the given name will be used to read the files.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *                 Must not be null.
 * @param charsetName The name of the character set used to read the file.
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath, String charsetName) {
	Validate.notNull(filePath, "The file path may not be null.");

	final Path location = new Path(filePath);
	final TextInputFormat textFormat = new TextInputFormat(location);
	textFormat.setCharsetName(charsetName);

	return new DataSource<String>(this, textFormat, BasicTypeInfo.STRING_TYPE_INFO);
}
/**
 * Declares the type of the CSV field and builds the data source that parses each CSV record
 * into a 1-tuple whose field has the given type.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0> DataSource<Tuple1<T0>> types(Class<T0> type0) {
	final TupleTypeInfo<Tuple1<T0>> tupleType = TupleTypeInfo.getBasicTupleTypeInfo(type0);
	final CsvInputFormat<Tuple1<T0>> csvFormat = new CsvInputFormat<Tuple1<T0>>(path);
	configureInputFormat(csvFormat, type0);

	return new DataSource<Tuple1<T0>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 2-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1> DataSource<Tuple2<T0, T1>> types(Class<T0> type0, Class<T1> type1) {
	final TupleTypeInfo<Tuple2<T0, T1>> tupleType = TupleTypeInfo.getBasicTupleTypeInfo(type0, type1);
	final CsvInputFormat<Tuple2<T0, T1>> csvFormat = new CsvInputFormat<Tuple2<T0, T1>>(path);
	configureInputFormat(csvFormat, type0, type1);

	return new DataSource<Tuple2<T0, T1>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 5-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4> DataSource<Tuple5<T0, T1, T2, T3, T4>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4) {
	final TupleTypeInfo<Tuple5<T0, T1, T2, T3, T4>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4);
	final CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>> csvFormat =
			new CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4);

	return new DataSource<Tuple5<T0, T1, T2, T3, T4>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 6-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5> DataSource<Tuple6<T0, T1, T2, T3, T4, T5>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5) {
	final TupleTypeInfo<Tuple6<T0, T1, T2, T3, T4, T5>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5);
	final CsvInputFormat<Tuple6<T0, T1, T2, T3, T4, T5>> csvFormat =
			new CsvInputFormat<Tuple6<T0, T1, T2, T3, T4, T5>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5);

	return new DataSource<Tuple6<T0, T1, T2, T3, T4, T5>>(executionContext, csvFormat, tupleType);
}
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * The file will be read with the system's default character set.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *                 Must not be null.
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath) {
	Validate.notNull(filePath, "The file path may not be null.");

	final TextInputFormat textFormat = new TextInputFormat(new Path(filePath));
	return new DataSource<String>(this, textFormat, BasicTypeInfo.STRING_TYPE_INFO);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 3-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2> DataSource<Tuple3<T0, T1, T2>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2) {
	final TupleTypeInfo<Tuple3<T0, T1, T2>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2);
	final CsvInputFormat<Tuple3<T0, T1, T2>> csvFormat =
			new CsvInputFormat<Tuple3<T0, T1, T2>>(path);
	configureInputFormat(csvFormat, type0, type1, type2);

	return new DataSource<Tuple3<T0, T1, T2>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 4-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3> DataSource<Tuple4<T0, T1, T2, T3>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3) {
	final TupleTypeInfo<Tuple4<T0, T1, T2, T3>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3);
	final CsvInputFormat<Tuple4<T0, T1, T2, T3>> csvFormat =
			new CsvInputFormat<Tuple4<T0, T1, T2, T3>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3);

	return new DataSource<Tuple4<T0, T1, T2, T3>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 7-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6> DataSource<Tuple7<T0, T1, T2, T3, T4, T5, T6>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6) {
	final TupleTypeInfo<Tuple7<T0, T1, T2, T3, T4, T5, T6>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6);
	final CsvInputFormat<Tuple7<T0, T1, T2, T3, T4, T5, T6>> csvFormat =
			new CsvInputFormat<Tuple7<T0, T1, T2, T3, T4, T5, T6>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6);

	return new DataSource<Tuple7<T0, T1, T2, T3, T4, T5, T6>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into an 8-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7> DataSource<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7) {
	final TupleTypeInfo<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7);
	final CsvInputFormat<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> csvFormat =
			new CsvInputFormat<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7);

	return new DataSource<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 10-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @param type9 The type of CSV field 9 and the type of field 9 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8, T9> DataSource<Tuple10<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7, Class<T8> type8, Class<T9> type9) {
	final TupleTypeInfo<Tuple10<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7, type8, type9);
	final CsvInputFormat<Tuple10<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>> csvFormat =
			new CsvInputFormat<Tuple10<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7, type8, type9);

	return new DataSource<Tuple10<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into an 11-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @param type9 The type of CSV field 9 and the type of field 9 in the returned tuple type.
 * @param type10 The type of CSV field 10 and the type of field 10 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> DataSource<Tuple11<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7, Class<T8> type8, Class<T9> type9,
		Class<T10> type10) {
	final TupleTypeInfo<Tuple11<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10);
	final CsvInputFormat<Tuple11<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>> csvFormat =
			new CsvInputFormat<Tuple11<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10);

	return new DataSource<Tuple11<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 12-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @param type9 The type of CSV field 9 and the type of field 9 in the returned tuple type.
 * @param type10 The type of CSV field 10 and the type of field 10 in the returned tuple type.
 * @param type11 The type of CSV field 11 and the type of field 11 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> DataSource<Tuple12<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7, Class<T8> type8, Class<T9> type9,
		Class<T10> type10, Class<T11> type11) {
	final TupleTypeInfo<Tuple12<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10, type11);
	final CsvInputFormat<Tuple12<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11>> csvFormat =
			new CsvInputFormat<Tuple12<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10, type11);

	return new DataSource<Tuple12<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 13-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @param type9 The type of CSV field 9 and the type of field 9 in the returned tuple type.
 * @param type10 The type of CSV field 10 and the type of field 10 in the returned tuple type.
 * @param type11 The type of CSV field 11 and the type of field 11 in the returned tuple type.
 * @param type12 The type of CSV field 12 and the type of field 12 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> DataSource<Tuple13<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7, Class<T8> type8, Class<T9> type9,
		Class<T10> type10, Class<T11> type11, Class<T12> type12) {
	final TupleTypeInfo<Tuple13<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10, type11, type12);
	final CsvInputFormat<Tuple13<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>> csvFormat =
			new CsvInputFormat<Tuple13<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7, type8, type9, type10, type11, type12);

	return new DataSource<Tuple13<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>>(executionContext, csvFormat, tupleType);
}
/**
 * Declares the types of the CSV fields and builds the data source that parses each CSV record
 * into a 9-tuple whose fields have the given types.
 * <p>
 * One overload of this method exists per supported tuple length, so that data sets created
 * through CSV parsing are type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8> DataSource<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> types(
		Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4,
		Class<T5> type5, Class<T6> type6, Class<T7> type7, Class<T8> type8) {
	final TupleTypeInfo<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4, type5, type6, type7, type8);
	final CsvInputFormat<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> csvFormat =
			new CsvInputFormat<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>>(path);
	configureInputFormat(csvFormat, type0, type1, type2, type3, type4, type5, type6, type7, type8);

	return new DataSource<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>>(executionContext, csvFormat, tupleType);
}
/**
 * Creates a DataSet from the given non-empty collection. The type of the data set is that
 * of the elements in the collection. The elements need to be serializable (as defined by
 * {@link java.io.Serializable}), because the framework may move the elements into the cluster
 * if needed.
 * <p>
 * Note that this operation will result in a non-parallel data source, i.e. a data source with
 * a degree of parallelism of one.
 * <p>
 * The returned DataSet is typed to the given TypeInformation.
 *
 * @param data The collection of elements to create the data set from.
 * @param type The TypeInformation for the produced data set.
 * @return A DataSet representing the given collection.
 *
 * @see #fromCollection(Collection)
 */
public <X> DataSource<X> fromCollection(Collection<X> data, TypeInformation<X> type) {
	// Check the collection against the type class reported by the type information
	// before any input format is built.
	CollectionInputFormat.checkCollection(data, type.getTypeClass());

	final CollectionInputFormat<X> collectionFormat = new CollectionInputFormat<X>(data);
	return new DataSource<X>(this, collectionFormat, type);
}
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * This method is similar to {@link #readTextFile(String)}, but it produces a DataSet with mutable
 * {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune
 * implementations to be less object and garbage collection heavy.
 * <p>
 * The file will be read with the system's default character set.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *                 Must not be null.
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<StringValue> readTextFileWithValue(String filePath) {
	Validate.notNull(filePath, "The file path may not be null.");

	final TextValueInputFormat valueFormat = new TextValueInputFormat(new Path(filePath));
	final ValueTypeInfo<StringValue> valueType = new ValueTypeInfo<StringValue>(StringValue.class);
	return new DataSource<StringValue>(this, valueFormat, valueType);
}
/**
 * Creates a DataSet that represents the Strings produced by reading the given file line wise.
 * This method is similar to {@link #readTextFile(String, String)}, but it produces a DataSet with
 * mutable {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune
 * implementations to be less object and garbage collection heavy.
 * <p>
 * The {@link java.nio.charset.Charset} with the given name will be used to read the files.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *                 Must not be null.
 * @param charsetName The name of the character set used to read the file.
 * @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set.
 *
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<StringValue> readTextFileWithValue(String filePath, String charsetName, boolean skipInvalidLines) {
	Validate.notNull(filePath, "The file path may not be null.");

	final Path location = new Path(filePath);
	final TextValueInputFormat valueFormat = new TextValueInputFormat(location);
	valueFormat.setCharsetName(charsetName);
	valueFormat.setSkipInvalidLines(skipInvalidLines);

	final ValueTypeInfo<StringValue> valueType = new ValueTypeInfo<StringValue>(StringValue.class);
	return new DataSource<StringValue>(this, valueFormat, valueType);
}