/**
 * Builds a data source that reads its elements from the given splittable iterator
 * through a {@code ParallelIteratorInputFormat}.
 *
 * @param iterator The splittable iterator that produces the elements of the data source.
 * @param type The type information describing the produced elements.
 * @param callLocationName The call location name recorded on the created data source;
 *     it is supplied by the caller rather than computed here.
 * @return A data source bound to this environment that wraps the iterator.
 */
private <X> DataSource<X> fromParallelCollection(
        SplittableIterator<X> iterator, TypeInformation<X> type, String callLocationName) {
    final ParallelIteratorInputFormat<X> iteratorFormat =
            new ParallelIteratorInputFormat<>(iterator);
    return new DataSource<>(this, iteratorFormat, type, callLocationName);
}
/**
 * Creates a DataSet from the given iterator. The iterator is not consumed until the program
 * is actually executed, so the type of the elements it returns must be supplied explicitly
 * as type information. This variant is meant for generic element types, for which the plain
 * type class (as accepted by {@link #fromCollection(Iterator, Class)}) does not carry the
 * complete type information.
 *
 * <p>Note that this operation will result in a non-parallel data source, i.e. a data source
 * with a parallelism of one.
 *
 * @param data The iterator that produces the elements of the data set.
 * @param type The TypeInformation for the produced data set.
 * @return A DataSet representing the elements in the iterator.
 *
 * @see #fromCollection(Iterator, Class)
 */
public <X> DataSource<X> fromCollection(Iterator<X> data, TypeInformation<X> type) {
    final IteratorInputFormat<X> iteratorFormat = new IteratorInputFormat<>(data);

    // NOTE(review): the call location is resolved in this public method itself (not in a
    // helper); it is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(this, iteratorFormat, type, callLocation);
}
/**
 * Declares the type of the single CSV field and creates the data source that parses each
 * record into a 1-tuple with a field of that type.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0> DataSource<Tuple1<T0>> types(Class<T0> type0) {
    final TupleTypeInfo<Tuple1<T0>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(type0);

    final CsvInputFormat<Tuple1<T0>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the three CSV fields and creates the data source that parses each
 * record into a 3-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2> DataSource<Tuple3<T0, T1, T2>> types(
        Class<T0> type0, Class<T1> type1, Class<T2> type2) {
    final TupleTypeInfo<Tuple3<T0, T1, T2>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(type0, type1, type2);

    final CsvInputFormat<Tuple3<T0, T1, T2>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the two CSV fields and creates the data source that parses each
 * record into a 2-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1> DataSource<Tuple2<T0, T1>> types(Class<T0> type0, Class<T1> type1) {
    final TupleTypeInfo<Tuple2<T0, T1>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(type0, type1);

    final CsvInputFormat<Tuple2<T0, T1>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the four CSV fields and creates the data source that parses each
 * record into a 4-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3> DataSource<Tuple4<T0, T1, T2, T3>> types(
        Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3) {
    final TupleTypeInfo<Tuple4<T0, T1, T2, T3>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(type0, type1, type2, type3);

    final CsvInputFormat<Tuple4<T0, T1, T2, T3>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the five CSV fields and creates the data source that parses each
 * record into a 5-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4> DataSource<Tuple5<T0, T1, T2, T3, T4>> types(
        Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4) {
    final TupleTypeInfo<Tuple5<T0, T1, T2, T3, T4>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(type0, type1, type2, type3, type4);

    final CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the six CSV fields and creates the data source that parses each
 * record into a 6-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5> DataSource<Tuple6<T0, T1, T2, T3, T4, T5>> types(
        Class<T0> type0,
        Class<T1> type1,
        Class<T2> type2,
        Class<T3> type3,
        Class<T4> type4,
        Class<T5> type5) {
    final TupleTypeInfo<Tuple6<T0, T1, T2, T3, T4, T5>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(
                    type0, type1, type2, type3, type4, type5);

    final CsvInputFormat<Tuple6<T0, T1, T2, T3, T4, T5>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Builds a data source over the given collection after checking the collection against the
 * type class reported by the type information.
 *
 * @param data The collection holding the elements of the data source.
 * @param type The type information of the elements; it provides both the class used for
 *     the collection check and the serializer handed to the input format.
 * @param callLocationName The call location name recorded on the created data source;
 *     it is supplied by the caller rather than computed here.
 * @return A data source bound to this environment that wraps the collection.
 */
private <X> DataSource<X> fromCollection(
        Collection<X> data, TypeInformation<X> type, String callLocationName) {
    // Validate first, so that no input format is created for a rejected collection.
    final Class<X> elementClass = type.getTypeClass();
    CollectionInputFormat.checkCollection(data, elementClass);

    final CollectionInputFormat<X> collectionFormat =
            new CollectionInputFormat<>(data, type.createSerializer(config));
    return new DataSource<>(this, collectionFormat, type, callLocationName);
}
/**
 * Declares the types of the seven CSV fields and creates the data source that parses each
 * record into a 7-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6> DataSource<Tuple7<T0, T1, T2, T3, T4, T5, T6>> types(
        Class<T0> type0,
        Class<T1> type1,
        Class<T2> type2,
        Class<T3> type3,
        Class<T4> type4,
        Class<T5> type5,
        Class<T6> type6) {
    final TupleTypeInfo<Tuple7<T0, T1, T2, T3, T4, T5, T6>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(
                    type0, type1, type2, type3, type4, type5, type6);

    final CsvInputFormat<Tuple7<T0, T1, T2, T3, T4, T5, T6>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the eight CSV fields and creates the data source that parses each
 * record into an 8-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7>
        DataSource<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> types(
                Class<T0> type0,
                Class<T1> type1,
                Class<T2> type2,
                Class<T3> type3,
                Class<T4> type4,
                Class<T5> type5,
                Class<T6> type6,
                Class<T7> type7) {
    final TupleTypeInfo<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(
                    type0, type1, type2, type3, type4, type5, type6, type7);

    final CsvInputFormat<Tuple8<T0, T1, T2, T3, T4, T5, T6, T7>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Declares the types of the nine CSV fields and creates the data source that parses each
 * record into a 9-tuple with fields of those types.
 *
 * <p>One overload of this method exists per supported tuple length, which keeps the
 * creation of data sets through CSV parsing type safe.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @param type5 The type of CSV field 5 and the type of field 5 in the returned tuple type.
 * @param type6 The type of CSV field 6 and the type of field 6 in the returned tuple type.
 * @param type7 The type of CSV field 7 and the type of field 7 in the returned tuple type.
 * @param type8 The type of CSV field 8 and the type of field 8 in the returned tuple type.
 * @return The {@link org.apache.flink.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4, T5, T6, T7, T8>
        DataSource<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> types(
                Class<T0> type0,
                Class<T1> type1,
                Class<T2> type2,
                Class<T3> type3,
                Class<T4> type4,
                Class<T5> type5,
                Class<T6> type6,
                Class<T7> type7,
                Class<T8> type8) {
    final TupleTypeInfo<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> tupleInfo =
            TupleTypeInfo.getBasicAndBasicValueTupleTypeInfo(
                    type0, type1, type2, type3, type4, type5, type6, type7, type8);

    final CsvInputFormat<Tuple9<T0, T1, T2, T3, T4, T5, T6, T7, T8>> csvFormat =
            new TupleCsvInputFormat<>(path, tupleInfo, this.includedMask);
    // Apply the reader's settings to the freshly created format.
    configureInputFormat(csvFormat);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(executionContext, csvFormat, tupleInfo, callLocation);
}
/**
 * Configures the reader to read the CSV data and parse it to the given type. The type must
 * be a subclass of {@link Tuple}. The type information for the fields is obtained from the
 * type class. The type consequently needs to specify all generic field types of the tuple.
 *
 * @param targetType The class of the target type, needs to be a subclass of Tuple. Must not
 *     be null (enforced through {@code Preconditions.checkNotNull}).
 * @return The DataSet representing the parsed CSV data.
 * @throws IllegalArgumentException If {@code targetType} is not assignable to {@link Tuple}.
 */
public <T extends Tuple> DataSource<T> tupleType(Class<T> targetType) {
    Preconditions.checkNotNull(targetType, "The target type class must not be null.");
    if (!Tuple.class.isAssignableFrom(targetType)) {
        throw new IllegalArgumentException(
                "The target type must be a subclass of " + Tuple.class.getName());
    }

    // The extractor returns a general type information object; it is narrowed here to the
    // tuple type information that the CSV input format and the data source require.
    @SuppressWarnings("unchecked")
    TupleTypeInfo<T> typeInfo = (TupleTypeInfo<T>) TypeExtractor.createTypeInfo(targetType);
    CsvInputFormat<T> inputFormat =
            new TupleCsvInputFormat<T>(
                    path, this.lineDelimiter, this.fieldDelimiter, typeInfo, this.includedMask);

    // The array of per-field classes that used to be collected here was never read and has
    // been removed.

    configureInputFormat(inputFormat);
    return new DataSource<T>(executionContext, inputFormat, typeInfo, Utils.getCallLocationName());
}
/**
 * Configures the reader to read the CSV data and parse it to the given POJO type. All mapped
 * fields of the type must either be public or be settable. The type information for the
 * fields is obtained from the type class.
 *
 * @param pojoType The class of the target POJO. Must not be null.
 * @param pojoFields The fields of the POJO which are mapped to CSV fields. Must not be null.
 * @return The DataSet representing the parsed CSV data.
 * @throws IllegalArgumentException If the type information extracted for {@code pojoType}
 *     is not a {@code PojoTypeInfo}.
 */
public <T> DataSource<T> pojoType(Class<T> pojoType, String... pojoFields) {
    Preconditions.checkNotNull(pojoType, "The POJO type class must not be null.");
    Preconditions.checkNotNull(pojoFields, "POJO fields must be specified (not null) if output type is a POJO.");

    final TypeInformation<T> extractedInfo = TypeExtractor.createTypeInfo(pojoType);
    if (extractedInfo instanceof PojoTypeInfo) {
        final PojoTypeInfo<T> pojoInfo = (PojoTypeInfo<T>) extractedInfo;

        final CsvInputFormat<T> csvFormat =
                new PojoCsvInputFormat<>(
                        path,
                        this.lineDelimiter,
                        this.fieldDelimiter,
                        pojoInfo,
                        pojoFields,
                        this.includedMask);
        // Apply the reader's settings to the freshly created format.
        configureInputFormat(csvFormat);

        // NOTE(review): resolved in this public method itself (not in a helper); the location
        // is presumably derived from the caller's stack frame - confirm before moving.
        final String callLocation = Utils.getCallLocationName();
        return new DataSource<>(executionContext, csvFormat, pojoInfo, callLocation);
    }

    // Anything other than POJO type information cannot be mapped field by field.
    throw new IllegalArgumentException(
            "The specified class is not a POJO. The type class must meet the POJO requirements. Found: " + extractedInfo);
}
/**
 * Creates a {@link DataSet} of Strings by reading the given file line by line. The files are
 * decoded using the {@link java.nio.charset.Charset} with the given name.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @param charsetName The name of the character set used to read the file.
 * @return A {@link DataSet} that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath, String charsetName) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final TextInputFormat textFormat = new TextInputFormat(inputPath);
    textFormat.setCharsetName(charsetName);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(this, textFormat, BasicTypeInfo.STRING_TYPE_INFO, callLocation);
}
/**
 * Creates a {@link DataSet} of Strings by reading the given file line by line. No character
 * set is configured by this method; the file will be read with the UTF-8 character set.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @return A {@link DataSet} that represents the data read from the given file as text lines.
 */
public DataSource<String> readTextFile(String filePath) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final TextInputFormat textFormat = new TextInputFormat(inputPath);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(this, textFormat, BasicTypeInfo.STRING_TYPE_INFO, callLocation);
}
/**
 * Creates a {@link DataSet} of primitive-type values by reading the given file line by line.
 * This is comparable to {@link #readCsvFile(String)} with a single field, except that the
 * values are produced directly instead of being wrapped in a
 * {@link org.apache.flink.api.java.tuple.Tuple1}.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @param typeClass The primitive type class to be read.
 * @return A {@link DataSet} that represents the data read from the given file as primitive type.
 */
public <X> DataSource<X> readFileOfPrimitives(String filePath, Class<X> typeClass) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final PrimitiveInputFormat<X> primitiveFormat =
            new PrimitiveInputFormat<>(inputPath, typeClass);

    // The type information is looked up after the format has been created and before the
    // call location is resolved, matching the order of a single inlined constructor call.
    return new DataSource<>(
            this,
            primitiveFormat,
            TypeExtractor.getForClass(typeClass),
            Utils.getCallLocationName());
}
/**
 * Creates a {@link DataSet} of primitive-type values by reading the given file, splitting it
 * at the given delimiter. This is comparable to {@link #readCsvFile(String)} with a single
 * field, except that the values are produced directly instead of being wrapped in a
 * {@link org.apache.flink.api.java.tuple.Tuple1}.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @param delimiter The delimiter of the given file.
 * @param typeClass The primitive type class to be read.
 * @return A {@link DataSet} that represents the data read from the given file as primitive type.
 */
public <X> DataSource<X> readFileOfPrimitives(
        String filePath, String delimiter, Class<X> typeClass) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final PrimitiveInputFormat<X> primitiveFormat =
            new PrimitiveInputFormat<>(inputPath, delimiter, typeClass);

    // The type information is looked up after the format has been created and before the
    // call location is resolved, matching the order of a single inlined constructor call.
    return new DataSource<>(
            this,
            primitiveFormat,
            TypeExtractor.getForClass(typeClass),
            Utils.getCallLocationName());
}
/**
 * Creates a {@link DataSet} of text lines by reading the given file line by line. Unlike
 * {@link #readTextFile(String)}, the lines are produced as mutable {@link StringValue}
 * objects rather than Java Strings. StringValues can be used to tune implementations to be
 * less object and garbage collection heavy.
 *
 * <p>The file will be read with the UTF-8 character set.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @return A {@link DataSet} that represents the data read from the given file as text lines.
 */
public DataSource<StringValue> readTextFileWithValue(String filePath) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final TextValueInputFormat valueFormat = new TextValueInputFormat(inputPath);
    final ValueTypeInfo<StringValue> valueTypeInfo = new ValueTypeInfo<>(StringValue.class);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(this, valueFormat, valueTypeInfo, callLocation);
}
/**
 * Creates a {@link DataSet} of text lines by reading the given file line by line. Unlike
 * {@link #readTextFile(String, String)}, the lines are produced as mutable
 * {@link StringValue} objects rather than Java Strings. StringValues can be used to tune
 * implementations to be less object and garbage collection heavy.
 *
 * <p>The {@link java.nio.charset.Charset} with the given name will be used to read the files.
 *
 * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
 *     Must not be null.
 * @param charsetName The name of the character set used to read the file.
 * @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set.
 *
 * @return A DataSet that represents the data read from the given file as text lines.
 */
public DataSource<StringValue> readTextFileWithValue(
        String filePath, String charsetName, boolean skipInvalidLines) {
    Preconditions.checkNotNull(filePath, "The file path may not be null.");

    final Path inputPath = new Path(filePath);
    final TextValueInputFormat valueFormat = new TextValueInputFormat(inputPath);
    valueFormat.setCharsetName(charsetName);
    valueFormat.setSkipInvalidLines(skipInvalidLines);

    final ValueTypeInfo<StringValue> valueTypeInfo = new ValueTypeInfo<>(StringValue.class);

    // NOTE(review): resolved in this public method itself (not in a helper); the location
    // is presumably derived from the caller's stack frame - confirm before moving.
    final String callLocation = Utils.getCallLocationName();
    return new DataSource<>(this, valueFormat, valueTypeInfo, callLocation);
}