/** * Syntactic sugar for aggregate (MAX, field). * @param field The index of the Tuple field on which the aggregation function is applied. * @return An AggregateOperator that represents the max'ed DataSet. * * @see org.apache.flink.api.java.operators.AggregateOperator */ public AggregateOperator<T> max (int field) { return this.aggregate (Aggregations.MAX, field, Utils.getCallLocationName()); }
/** * Locally sorts the partitions of the DataSet on the specified field in the specified order. * DataSet can be sorted on multiple fields by chaining sortPartition() calls. * * @param field The field index on which the DataSet is sorted. * @param order The order in which the DataSet is sorted. * @return The DataSet with sorted local partitions. */ public SortPartitionOperator<T> sortPartition(int field, Order order) { return new SortPartitionOperator<>(this, field, order, Utils.getCallLocationName()); }
/** * Syntactic sugar for aggregate (MIN, field). * @param field The index of the Tuple field on which the aggregation function is applied. * @return An AggregateOperator that represents the min'ed DataSet. * * @see org.apache.flink.api.java.operators.AggregateOperator */ public AggregateOperator<T> min (int field) { return this.aggregate (Aggregations.MIN, field, Utils.getCallLocationName()); }
/** * Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly distributed over all parallel instances of the * following task. This can help to improve performance in case of heavy data skew and compute intensive operations. * * <p><b>Important:</b>This operation shuffles the whole DataSet over the network and can take significant amount of time. * * @return The re-balanced DataSet. */ public PartitionOperator<T> rebalance() { return new PartitionOperator<>(this, PartitionMethod.REBALANCE, Utils.getCallLocationName()); }
/** * Locally sorts the partitions of the DataSet on the specified field in the specified order. * DataSet can be sorted on multiple fields by chaining sortPartition() calls. * * @param field The field expression referring to the field on which the DataSet is sorted. * @param order The order in which the DataSet is sorted. * @return The DataSet with sorted local partitions. */ public SortPartitionOperator<T> sortPartition(String field, Order order) { return new SortPartitionOperator<>(this, field, order, Utils.getCallLocationName()); }
/** * Creates a union of this DataSet with an other DataSet. The other DataSet must be of the same data type. * * @param other The other DataSet which is unioned with the current DataSet. * @return The resulting DataSet. */ public UnionOperator<T> union(DataSet<T> other){ return new UnionOperator<>(this, other, Utils.getCallLocationName()); }
/** * Returns a distinct set of a {@link DataSet}. * * <p>If the input is a {@link org.apache.flink.api.common.typeutils.CompositeType} (Tuple or Pojo type), * distinct is performed on all fields and each field must be a key type * * @return A DistinctOperator that represents the distinct DataSet. */ public DistinctOperator<T> distinct() { return new DistinctOperator<>(this, null, Utils.getCallLocationName()); }
/** * Syntactic sugar for aggregate (SUM, field). * @param field The index of the Tuple field on which the aggregation function is applied. * @return An AggregateOperator that represents the summed DataSet. * * @see org.apache.flink.api.java.operators.AggregateOperator */ public AggregateOperator<T> sum (int field) { return this.aggregate (Aggregations.SUM, field, Utils.getCallLocationName()); }
/** * Creates a new data set that contains a sequence of numbers. The data set will be created in parallel, * so there is no guarantee about the order of the elements. * * @param from The number to start at (inclusive). * @param to The number to stop at (inclusive). * @return A DataSet, containing all number in the {@code [from, to]} interval. */ public DataSource<Long> generateSequence(long from, long to) { return fromParallelCollection(new NumberSequenceIterator(from, to), BasicTypeInfo.LONG_TYPE_INFO, Utils.getCallLocationName()); }
/** * Hash-partitions a DataSet on the specified key fields. * * <p><b>Important:</b>This operation shuffles the whole DataSet over the network and can take significant amount of time. * * @param fields The field indexes on which the DataSet is hash-partitioned. * @return The partitioned DataSet. */ public PartitionOperator<T> partitionByHash(int... fields) { return new PartitionOperator<>(this, PartitionMethod.HASH, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName()); }
/** * Range-partitions a DataSet on the specified key fields. * * <p><b>Important:</b>This operation requires an extra pass over the DataSet to compute the range boundaries and * shuffles the whole DataSet over the network. This can take significant amount of time. * * @param fields The field expressions on which the DataSet is range-partitioned. * @return The partitioned DataSet. */ public PartitionOperator<T> partitionByRange(String... fields) { return new PartitionOperator<>(this, PartitionMethod.RANGE, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName()); }
/** * Hash-partitions a DataSet on the specified key fields. * * <p><b>Important:</b>This operation shuffles the whole DataSet over the network and can take significant amount of time. * * @param fields The field expressions on which the DataSet is hash-partitioned. * @return The partitioned DataSet. */ public PartitionOperator<T> partitionByHash(String... fields) { return new PartitionOperator<>(this, PartitionMethod.HASH, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName()); }
/** * Range-partitions a DataSet on the specified tuple field positions. */ public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, int... fields) { return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, new Keys.ExpressionKeys<>(fields, input.getType(), false), distribution, Utils.getCallLocationName()); }
/** * Range-partitions a DataSet on the specified fields. */ public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) { return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, new Keys.ExpressionKeys<>(fields, input.getType()), distribution, Utils.getCallLocationName()); }
/** * Range-partitions a DataSet on the specified key fields. * * <p><b>Important:</b>This operation requires an extra pass over the DataSet to compute the range boundaries and * shuffles the whole DataSet over the network. This can take significant amount of time. * * @param fields The field indexes on which the DataSet is range-partitioned. * @return The partitioned DataSet. */ public PartitionOperator<T> partitionByRange(int... fields) { return new PartitionOperator<>(this, PartitionMethod.RANGE, new Keys.ExpressionKeys<>(fields, getType()), Utils.getCallLocationName()); }
protected ProjectJoin(DataSet<I1> input1, DataSet<I2> input2, Keys<I1> keys1, Keys<I2> keys2, JoinHint hint, int[] fields, boolean[] isFromFirst, TupleTypeInfo<OUT> returnType) { super(input1, input2, keys1, keys2, new ProjectFlatJoinFunction<I1, I2, OUT>(fields, isFromFirst, returnType.createSerializer(input1.getExecutionEnvironment().getConfig()).createInstance()), returnType, hint, Utils.getCallLocationName(4)); // We need to use the 4th element in the stack because the call comes through .types(). joinProj = null; }
protected ProjectJoin(DataSet<I1> input1, DataSet<I2> input2, Keys<I1> keys1, Keys<I2> keys2, JoinHint hint, int[] fields, boolean[] isFromFirst, TupleTypeInfo<OUT> returnType, JoinProjection<I1, I2> joinProj) { super(input1, input2, keys1, keys2, new ProjectFlatJoinFunction<I1, I2, OUT>(fields, isFromFirst, returnType.createSerializer(input1.getExecutionEnvironment().getConfig()).createInstance()), returnType, hint, Utils.getCallLocationName(4)); this.joinProj = joinProj; }
/** * Partitions a POJO DataSet on the specified key fields using a custom partitioner. * This method takes the key expression to partition on, and a partitioner that accepts the key type. * * <p>Note: This method works only on single field keys. * * @param partitioner The partitioner to assign partitions to keys. * @param field The field index on which the DataSet is to partitioned. * @return The partitioned DataSet. */ public <K> PartitionOperator<T> partitionCustom(Partitioner<K> partitioner, String field) { return new PartitionOperator<>(this, new Keys.ExpressionKeys<>(new String[] {field}, getType()), clean(partitioner), Utils.getCallLocationName()); }
/** * Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise. * The file will be read with the UTF-8 character set. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @return A {@link DataSet} that represents the data read from the given file as text lines. */ public DataSource<String> readTextFile(String filePath) { Preconditions.checkNotNull(filePath, "The file path may not be null."); return new DataSource<>(this, new TextInputFormat(new Path(filePath)), BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName()); }
/** * Range-partitions a DataSet using the specified key selector function. */ public static <T, K extends Comparable<K>> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, KeySelector<T, K> keyExtractor) { final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, input.getType()); return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, new Keys.SelectorFunctionKeys<>(input.clean(keyExtractor), input.getType(), keyType), distribution, Utils.getCallLocationName()); }