/**
 * Creates a {@link PTransform} whose output is the number of elements in its input {@link
 * PCollection}.
 *
 * <p>Note: when the input collection is windowed with anything other than {@link GlobalWindows},
 * apply {@code Combine.globally(Count.<T>combineFn()).withoutDefaults()} rather than this
 * transform.
 *
 * @param <T> element type of the input {@code PCollection}
 * @return a transform built by handing a new {@code CountFn} to {@code Combine.globally}
 */
public static <T> PTransform<PCollection<T>, PCollection<Long>> globally() {
  CountFn<T> countFn = new CountFn<T>();
  return Combine.globally(countFn);
}
/**
 * Builds a {@code PTransform} from {@code PCollection<KV<K, Long>>} to {@code
 * PCollection<KV<K, Long>>}. For every distinct key in the input, the output holds one element
 * pairing that key with the maximum of the values the input associates with it.
 *
 * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
 *
 * @param <K> key type of the input and output elements
 * @return a per-key combine driven by a new {@code MaxLongFn}
 */
public static <K> Combine.PerKey<K, Long, Long> longsPerKey() {
  MaxLongFn maxFn = new MaxLongFn();
  return Combine.perKey(maxFn);
}
/**
 * Applies {@code byFields} to the input, then combines the resulting grouped values with {@code
 * combineFn} through {@code Combine.groupedValues}.
 */
@Override
public PCollection<KV<Row, OutputT>> expand(PCollection<InputT> input) {
  return input
      // First stage: the byFields transform held by this instance.
      .apply(byFields)
      // Second stage: fold each group's values with the configured combine function.
      .apply(Combine.groupedValues(combineFn));
}
}
/**
 * Creates a {@link PerKey Combine.PerKey} {@code PTransform}. The transform groups an input
 * {@code PCollection} of {@code KV}s by key and window, invokes the supplied function on each
 * resulting list of values to obtain a combined value, and emits a {@code PCollection} of {@code
 * KV}s that maps every distinct key to its combined value per window.
 *
 * <p>Every output element lives in the window its inputs were grouped by and carries the
 * timestamp of the end of that window. The output {@code PCollection} keeps the input's {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn}.
 *
 * <p>See {@link PerKey Combine.PerKey} for more information.
 *
 * @param fn the combining function applied to each key's values
 * @return the per-key combining transform
 */
public static <K, InputT, OutputT> PerKey<K, InputT, OutputT> perKey(
    GlobalCombineFn<? super InputT, ?, OutputT> fn) {
  // Hand off to the two-argument overload, pairing fn with its display-data item.
  return perKey(fn, displayDataForFn(fn));
}
/**
 * Creates a {@link Globally Combine.Globally} {@code PTransform} that applies the supplied {@code
 * GlobalCombineFn} to all elements in each window of the input {@code PCollection}, reducing
 * them to a single value in the output {@code PCollection}. Input and output element types are
 * allowed to differ.
 *
 * <p>When the input {@code PCollection} is windowed into {@link GlobalWindows} and is empty, a
 * default value is output in the {@link GlobalWindow}. For inputs with any other windowing,
 * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView} must be called.
 *
 * <p>See {@link Globally Combine.Globally} for more information.
 *
 * @param fn the combining function applied to each window's elements
 * @return the global combining transform
 */
public static <InputT, OutputT> Globally<InputT, OutputT> globally(
    GlobalCombineFn<? super InputT, ?, OutputT> fn) {
  // Hand off to the two-argument overload, pairing fn with its display-data item.
  return globally(fn, displayDataForFn(fn));
}
/**
 * Creates a {@link GroupedValues Combine.GroupedValues} {@code PTransform}. Its input is a {@code
 * PCollection} of {@code KV}s in which each key maps to an {@code Iterable} of values (for
 * example, the output of a {@code GroupByKey}); the supplied combining function folds all values
 * belonging to a key, ignoring the key itself. Input and output value types are allowed to
 * differ.
 *
 * <p>Every output element keeps the timestamp and window of the input element it came from, and
 * the output {@code PCollection} has the same {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn} as the input.
 *
 * <p>See {@link GroupedValues Combine.GroupedValues} for more information.
 *
 * <p>Note that {@link #perKey(CombineFnBase.GlobalCombineFn)} is typically more convenient to use
 * than {@link GroupByKey} followed by {@code groupedValues(...)}.
 *
 * @param fn the combining function applied to each key's grouped values
 * @return the grouped-values combining transform
 */
public static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues(
    GlobalCombineFn<? super InputT, ?, OutputT> fn) {
  // Hand off to the two-argument overload, pairing fn with its display-data item.
  return groupedValues(fn, displayDataForFn(fn));
}
/** Checks the names reported by the global, singleton-view, per-key and fanout combines. */
@Test
public void testCombineGetName() {
  // Plain global combine.
  String globallyName = Combine.globally(new SumInts()).getName();
  assertEquals("Combine.globally(SumInts)", globallyName);

  // Global combine exposed as a singleton view.
  String singletonViewName = Combine.globally(new SumInts()).asSingletonView().getName();
  assertEquals("Combine.GloballyAsSingletonView", singletonViewName);

  // Per-key combine.
  String perKeyName = Combine.perKey(new TestCombineFn()).getName();
  assertEquals("Combine.perKey(Test)", perKeyName);

  // Per-key combine with hot-key fanout configured.
  String fanoutName = Combine.perKey(new TestCombineFn()).withHotKeyFanout(10).getName();
  assertEquals("Combine.perKeyWithFanout(Test)", fanoutName);
}
/**
 * Runs a pipeline over {@code table} that combines all values with {@code SumInts} and combines
 * per key with {@code TestCombineFn}, asserting both results.
 *
 * @param table key/value pairs used to create the pipeline input
 * @param globalSum the single element expected from the global combine
 * @param perKeyCombines the elements expected from the per-key combine, in any order
 */
protected void runTestSimpleCombine(
    List<KV<String, Integer>> table, int globalSum, List<KV<String, String>> perKeyCombines) {
  PCollection<KV<String, Integer>> keyedInts = createInput(pipeline, table);

  // Global branch: drop the keys, then combine every value.
  PCollection<Integer> ints = keyedInts.apply(Values.create());
  PCollection<Integer> total = ints.apply(Combine.globally(new SumInts()));

  // Per-key branch.
  PCollection<KV<String, String>> combinedByKey =
      keyedInts.apply(Combine.perKey(new TestCombineFn()));

  PAssert.that(total).containsInAnyOrder(globalSum);
  PAssert.that(combinedByKey).containsInAnyOrder(perKeyCombines);

  pipeline.run();
}
/**
 * Creates a {@link Globally Combine.Globally} {@code PTransform} that applies the supplied {@code
 * SerializableFunction} to all elements in each window of the input {@code PCollection},
 * reducing them to a single value in the output {@code PCollection}. Input and output elements
 * must share the same type.
 *
 * <p>When the input {@code PCollection} is windowed into {@link GlobalWindows} and is empty, a
 * default value is output in the {@link GlobalWindow}. For inputs with any other windowing,
 * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView} must be called.
 *
 * <p>See {@link Globally Combine.Globally} for more information.
 *
 * @param combiner function that reduces an {@code Iterable} of values to one value
 * @return the global combining transform
 */
public static <V> Globally<V, V> globally(SerializableFunction<Iterable<V>, V> combiner) {
  // Adapt the plain function to a combine fn before delegating.
  IterableCombineFn<V> iterableFn = IterableCombineFn.of(combiner);
  return globally(iterableFn, displayDataForFn(combiner));
}
/**
 * Creates a {@link PerKey Combine.PerKey} {@code PTransform}. The transform groups an input
 * {@code PCollection} of {@code KV}s by key and window, invokes the supplied function on each
 * resulting list of values to obtain a combined value, and emits a {@code PCollection} of {@code
 * KV}s that maps every distinct key to its combined value per window.
 *
 * <p>Every output element lives in the window its inputs were grouped by and carries the
 * timestamp of the end of that window. The output {@code PCollection} keeps the input's {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn}.
 *
 * <p>See {@link PerKey Combine.PerKey} for more information.
 *
 * @param fn function that reduces an {@code Iterable} of a key's values to one value
 * @return the per-key combining transform
 */
public static <K, V> PerKey<K, V, V> perKey(SerializableFunction<Iterable<V>, V> fn) {
  // Adapt the plain function to a combine fn before delegating.
  IterableCombineFn<V> iterableFn = IterableCombineFn.of(fn);
  return perKey(iterableFn, displayDataForFn(fn));
}
/**
 * Creates a {@link GroupedValues Combine.GroupedValues} {@code PTransform}. Its input is a {@code
 * PCollection} of {@code KV}s in which each key maps to an {@code Iterable} of values (for
 * example, the output of a {@code GroupByKey}); the supplied {@code SerializableFunction} folds
 * all values belonging to a key, ignoring the key itself. Input and output values must share the
 * same type.
 *
 * <p>Every output element keeps the timestamp and window of the input element it came from, and
 * the output {@code PCollection} has the same {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn} as the input.
 *
 * <p>See {@link GroupedValues Combine.GroupedValues} for more information.
 *
 * <p>Note that {@link #perKey(SerializableFunction)} is typically more convenient to use than
 * {@link GroupByKey} followed by {@code groupedValues(...)}.
 *
 * @param fn function that reduces an {@code Iterable} of a key's values to one value
 * @return the grouped-values combining transform
 */
public static <K, V> GroupedValues<K, V, V> groupedValues(
    SerializableFunction<Iterable<V>, V> fn) {
  // Adapt the plain function to a combine fn before delegating.
  IterableCombineFn<V> iterableFn = IterableCombineFn.of(fn);
  return groupedValues(iterableFn, displayDataForFn(fn));
}
/** Applies {@code Combine.globally} with this transform's {@code combineFn} to the input. */
@Override
public PCollection<OutputT> expand(PCollection<InputT> input) {
  return input.apply(
      // Wrap the configured combine function in a global combine.
      Combine.globally(combineFn));
}
}
/**
 * Runs a pipeline over {@code table} that applies {@code MeanInts} both globally and per key,
 * asserting both results.
 *
 * @param table key/value pairs used to create the pipeline input
 * @param globalMean the single element expected from the global combine
 * @param perKeyMeans the elements expected from the per-key combine, in any order
 */
protected void runTestAccumulatingCombine(
    List<KV<String, Integer>> table, Double globalMean, List<KV<String, Double>> perKeyMeans) {
  PCollection<KV<String, Integer>> keyedInts = createInput(pipeline, table);

  // Global branch: drop the keys, then combine every value.
  PCollection<Integer> ints = keyedInts.apply(Values.create());
  PCollection<Double> overallMean = ints.apply(Combine.globally(new MeanInts()));

  // Per-key branch.
  PCollection<KV<String, Double>> meanByKey = keyedInts.apply(Combine.perKey(new MeanInts()));

  PAssert.that(overallMean).containsInAnyOrder(globalMean);
  PAssert.that(meanByKey).containsInAnyOrder(perKeyMeans);

  pipeline.run();
}
/**
 * Builds a {@code PTransform} from {@code PCollection<KV<K, Integer>>} to {@code
 * PCollection<KV<K, Integer>>}. For every distinct key in the input, the output holds one
 * element pairing that key with the sum of the values the input associates with it.
 *
 * @param <K> key type of the input and output elements
 * @return a per-key combine driven by {@code Sum.ofIntegers()}
 */
public static <K> Combine.PerKey<K, Integer, Integer> integersPerKey() {
  return Combine.perKey(
      // Integer-summing combine function.
      Sum.ofIntegers());
}
/**
 * Binds {@code schemaAggregateFn} to the input's schema and to-row function, then applies {@code
 * byFields} followed by {@code Combine.groupedValues} using the bound function.
 */
@Override
public PCollection<KV<Row, Row>> expand(PCollection<InputT> input) {
  // Bind the aggregate function to the input before any transform is applied.
  SchemaAggregateFn.Inner<InputT> boundAggregateFn =
      schemaAggregateFn.withSchema(input.getSchema(), input.getToRowFunction());

  return input
      .apply(byFields)
      .apply(Combine.groupedValues(boundAggregateFn));
}
}
/**
 * Builds a {@code PTransform} from {@code PCollection<T>} to {@code PCollection<T>} whose output
 * is the maximum of the input elements under the natural ordering of {@code T}, or {@code null}
 * when the input has no elements.
 *
 * @param <T> element type, comparable to itself or a supertype
 * @return a global combine driven by {@code Max.naturalOrder()}
 */
public static <T extends Comparable<? super T>> Combine.Globally<T, T> globally() {
  return Combine.globally(
      // Explicit type witness, as in the natural-order factory call.
      Max.<T>naturalOrder());
}
/**
 * Runs a pipeline over {@code table} that applies {@code UniqueInts} both globally and per key,
 * asserting both results.
 *
 * @param table key/value pairs used to create the pipeline input
 * @param globalUnique the single element expected from the global combine
 * @param perKeyUnique the elements expected from the per-key combine, in any order
 */
@SuppressWarnings("unchecked")
protected void runTestBasicCombine(
    List<KV<String, Integer>> table,
    Set<Integer> globalUnique,
    List<KV<String, Set<Integer>>> perKeyUnique) {
  PCollection<KV<String, Integer>> keyedInts = createInput(pipeline, table);

  // Global branch: drop the keys, then combine every value.
  PCollection<Integer> ints = keyedInts.apply(Values.create());
  PCollection<Set<Integer>> uniqueOverall = ints.apply(Combine.globally(new UniqueInts()));

  // Per-key branch.
  PCollection<KV<String, Set<Integer>>> uniqueByKey =
      keyedInts.apply(Combine.perKey(new UniqueInts()));

  PAssert.that(uniqueOverall).containsInAnyOrder(globalUnique);
  PAssert.that(uniqueByKey).containsInAnyOrder(perKeyUnique);

  pipeline.run();
}
/**
 * Builds a {@code PTransform} from {@code PCollection<KV<K, Long>>} to {@code
 * PCollection<KV<K, Long>>}. For every distinct key in the input, the output holds one element
 * pairing that key with the minimum of the values the input associates with it.
 *
 * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
 *
 * @param <K> key type of the input and output elements
 * @return a per-key combine driven by a new {@code MinLongFn}
 */
public static <K> Combine.PerKey<K, Long, Long> longsPerKey() {
  MinLongFn minFn = new MinLongFn();
  return Combine.perKey(minFn);
}
/**
 * Builds a small test pipeline containing a {@link Combine.GroupedValues}: one {@code ("key", 1)}
 * element, a {@code GroupByKey}, then {@code Combine.groupedValues} with a {@code SumCombineFn}.
 * Abandoned-node enforcement is turned off on the returned pipeline.
 */
private static TestPipeline createCombineGroupedValuesPipeline() {
  TestPipeline testPipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // Single-element source with an explicit String/Integer KV coder.
  PCollection<KV<String, Integer>> created = testPipeline.apply(Create.of(KV.of("key", 1)));
  PCollection<KV<String, Integer>> keyedInts =
      created.setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  // The combine output is not consumed by anything downstream.
  keyedInts
      .apply(GroupByKey.create())
      .apply(Combine.groupedValues(new SumCombineFn()));

  return testPipeline;
}
/**
 * Builds a {@code PTransform} from {@code PCollection<T>} to {@code PCollection<T>} whose output
 * is the maximum of the input elements, or {@code null} when the input has no elements.
 *
 * @param comparator serializable comparator handed to {@code Max.of} to build the combine fn
 * @param <T> element type
 * @param <ComparatorT> comparator type; must also be {@link Serializable}
 * @return a global combine driven by {@code Max.of(comparator)}
 */
public static <T, ComparatorT extends Comparator<? super T> & Serializable>
    Combine.Globally<T, T> globally(ComparatorT comparator) {
  return Combine.globally(
      // Comparator-backed max combine function.
      Max.of(comparator));
}