/**
 * Builds the test flow under test: event time is taken from the pair's second
 * field, elements are keyed by the leading digit of the first field, the
 * remainder of the string (from index 2) is accumulated per key in 10 ms
 * windows sliding by 5 ms, and every result is emitted together with its
 * {@link TimeInterval} window.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  // Idiom: method reference instead of the trivial lambda `e -> e.getSecond()`
  // (matches the style used by the other event-time assignments in this file).
  input = AssignEventTime.of(input).using(Pair::getSecond).output();
  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey
      .of(input)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(e -> e.getFirst().substring(2))
      // cast pins down the generic parameters of the state factory
      .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(TimeSliding.of(Duration.ofMillis(10), Duration.ofMillis(5)))
      .output();
  return FlatMap.of(reduced)
      .using((UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>>)
          (elem, context) -> context.collect(Triple.of(
              (TimeInterval) context.getWindow(), elem.getFirst(), elem.getSecond())))
      .output();
}
/**
 * Assigns event time from the pair's second field, keys elements by the
 * leading digit of the first field, accumulates the string's suffix (from
 * index 2) per key in 10 ms windows sliding by 5 ms, and tags each output
 * with its {@link TimeInterval} window.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  Dataset<Pair<String, Integer>> timed =
      AssignEventTime.of(input).using(e -> e.getSecond()).output();

  Dataset<Pair<Integer, String>> accumulated = ReduceStateByKey
      .of(timed)
      .keyBy(pair -> pair.getFirst().charAt(0) - '0')
      .valueBy(pair -> pair.getFirst().substring(2))
      .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(TimeSliding.of(Duration.ofMillis(10), Duration.ofMillis(5)))
      .output();

  // attach the window each element was produced in
  UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>> attachWindow =
      (pair, ctx) -> ctx.collect(
          Triple.of((TimeInterval) ctx.getWindow(), pair.getFirst(), pair.getSecond()));
  return FlatMap.of(accumulated).using(attachWindow).output();
}
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Pair<String, Integer>> input) { return ReduceStateByKey.of(input) .keyBy(e -> e.getFirst().charAt(0) - '0') .valueBy(Pair::getFirst) .stateFactory((StateFactory<String, Long, CountState<String>>) CountState::new) .mergeStatesBy(CountState::combine) // FIXME .timedBy(Pair::getSecond) and make the assertion in the validation phase stronger .windowBy(Count.of(3)) .output(); }
// Tail of a ReduceStateByKey builder chain (its head lies outside this view):
// merges partial AccStates when windows merge and assigns 5 ms time windows.
.mergeStatesBy(AccState::combine) .windowBy(Time.of(Duration.ofMillis(5))) .output();
@Test public void testReduceByKeyWithSortStateAndCustomWindowing() throws InterruptedException, ExecutionException { Dataset<Integer> ints = flow.createInput( ListDataSource.unbounded( reversed(sequenceInts(0, 100)), reversed(sequenceInts(100, 1100)))); SizedCountWindowing<Integer> windowing = new SizedCountWindowing<>(i -> (i % 10) + 1); // the key for sort will be the last digit Dataset<Pair<Integer, Integer>> output = ReduceStateByKey.of(ints) .keyBy(i -> i % 10) .valueBy(e -> e) .stateFactory(SortState::new) .mergeStatesBy(SortState::combine) .windowBy(windowing) .output(); // collector of outputs ListDataSink<Triple<SizedCountWindow, Integer, Integer>> sink = ListDataSink.get(); FlatMap.of(output) .using((UnaryFunctor<Pair<Integer, Integer>, Triple<SizedCountWindow, Integer, Integer>>) (elem, context) -> context.collect(Triple.of((SizedCountWindow) context.getWindow(), elem.getFirst(), elem.getSecond()))) .output() .persist(sink); executor.submit(flow).get(); List<Triple<SizedCountWindow, Integer, Integer>> outputs = sink.getOutputs(); assertEquals(4 * 550, outputs.size()); checkKeyAlignedSortedList(outputs); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
/**
 * Verifies that the builder wires name, flow, key/value extractors, state
 * factory, state merger, windowing and the output dataset into the operator.
 */
@Test
public void testBuild() {
  Flow flow = Flow.create("TEST");
  Dataset<String> mockInput = Util.createMockDataset(flow, 2);

  Time<String> windowing = Time.of(Duration.ofHours(1));
  Dataset<Pair<String, Long>> outputDataset = ReduceStateByKey
      .named("ReduceStateByKey1")
      .of(mockInput)
      .keyBy(word -> word)
      .valueBy(word -> 1L)
      .stateFactory(WordCountState::new)
      .mergeStatesBy(WordCountState::combine)
      .windowBy(windowing)
      .output();

  assertEquals(flow, outputDataset.getFlow());
  assertEquals(1, flow.size());

  ReduceStateByKey operator = (ReduceStateByKey) flow.operators().iterator().next();
  assertEquals(flow, operator.getFlow());
  assertEquals("ReduceStateByKey1", operator.getName());
  assertNotNull(operator.getKeyExtractor());
  assertNotNull(operator.getValueExtractor());
  assertNotNull(operator.getStateMerger());
  assertNotNull(operator.getStateFactory());
  assertEquals(outputDataset, operator.output());
  assertSame(windowing, operator.getWindowing());
}
/**
 * Accumulates whole input strings per key (leading digit of the first field)
 * within session windows with a 5 ms gap; each result is emitted together
 * with its {@link TimeInterval} window.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  Dataset<Pair<String, Integer>> timed =
      AssignEventTime.of(input).using(e -> e.getSecond()).output();

  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey
      .of(timed)
      .keyBy(p -> p.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(Session.of(Duration.ofMillis(5)))
      .output();

  // attach the session window each element was produced in
  UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>> toTriple =
      (item, ctx) -> ctx.collect(
          Triple.of((TimeInterval) ctx.getWindow(), item.getFirst(), item.getSecond()));
  return FlatMap.of(reduced).using(toTriple).output();
}
/**
 * Session-windowed (5 ms gap) accumulation of input strings per
 * leading-digit key, with the window interval attached to every output.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(pair -> pair.getSecond()).output();
  Dataset<Pair<Integer, String>> perSession =
      ReduceStateByKey.of(input)
          .keyBy(pair -> pair.getFirst().charAt(0) - '0')
          .valueBy(Pair::getFirst)
          .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
          .mergeStatesBy(AccState::combine)
          .windowBy(Session.of(Duration.ofMillis(5)))
          .output();
  return FlatMap.of(perSession)
      .using((UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>>)
          (value, ctx) -> ctx.collect(Triple.of(
              (TimeInterval) ctx.getWindow(), value.getFirst(), value.getSecond())))
      .output();
}
/**
 * Accumulates whole input strings per key (leading digit of the first field)
 * in 5 ms tumbling time windows; results are paired with their
 * {@link TimeInterval} window.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  Dataset<Pair<String, Integer>> timed =
      AssignEventTime.of(input).using(e -> e.getSecond()).output();

  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey
      .of(timed)
      .keyBy(p -> p.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .stateFactory(AccState<String>::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(Time.of(Duration.ofMillis(5)))
      .output();

  // attach the time window each element was produced in
  UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>> toTriple =
      (item, ctx) -> ctx.collect(
          Triple.of((TimeInterval) ctx.getWindow(), item.getFirst(), item.getSecond()));
  return FlatMap.of(reduced).using(toTriple).output();
}
/**
 * Per-key (leading digit) accumulation of input strings in 5 ms time
 * windows; every output is tagged with the interval of its window.
 */
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(pair -> pair.getSecond()).output();
  Dataset<Pair<Integer, String>> perWindow =
      ReduceStateByKey.of(input)
          .keyBy(pair -> pair.getFirst().charAt(0) - '0')
          .valueBy(Pair::getFirst)
          .stateFactory(AccState<String>::new)
          .mergeStatesBy(AccState::combine)
          .windowBy(Time.of(Duration.ofMillis(5)))
          .output();
  return FlatMap.of(perWindow)
      .using((UnaryFunctor<Pair<Integer, String>, Triple<TimeInterval, Integer, String>>)
          (value, ctx) -> ctx.collect(Triple.of(
              (TimeInterval) ctx.getWindow(), value.getFirst(), value.getSecond())))
      .output();
}
/**
 * Sorts values per key (e % 3) inside the custom TestWindowing; the emitted
 * triple carries the IntWindow's value as its first component.
 */
@Override
protected Dataset<Triple<Integer, Integer, Integer>> getOutput(Dataset<Integer> input) {
  Dataset<Pair<Integer, Integer>> sorted = ReduceStateByKey
      .of(input)
      .keyBy(value -> value % 3)
      .valueBy(value -> value)
      .stateFactory(SortState::new)
      .mergeStatesBy(SortState::combine)
      .windowBy(new ReduceByKeyTest.TestWindowing())
      .output();

  // expose the window's integer value alongside key and sorted element
  UnaryFunctor<Pair<Integer, Integer>, Triple<Integer, Integer, Integer>> withWindowValue =
      (item, ctx) -> ctx.collect(Triple.of(
          ((IntWindow) ctx.getWindow()).getValue(), item.getFirst(), item.getSecond()));
  return FlatMap.of(sorted).using(withWindowValue).output();
}
/**
 * Per-key (modulo 3) sort under the custom TestWindowing; each output triple
 * starts with the value of the IntWindow it belongs to.
 */
@Override
protected Dataset<Triple<Integer, Integer, Integer>> getOutput(Dataset<Integer> input) {
  Dataset<Pair<Integer, Integer>> output =
      ReduceStateByKey.of(input)
          .keyBy(n -> n % 3)
          .valueBy(n -> n)
          .stateFactory(SortState::new)
          .mergeStatesBy(SortState::combine)
          .windowBy(new ReduceByKeyTest.TestWindowing())
          .output();
  return FlatMap.of(output)
      .using((UnaryFunctor<Pair<Integer, Integer>, Triple<Integer, Integer, Integer>>)
          (pair, collector) -> collector.collect(Triple.of(
              ((IntWindow) collector.getWindow()).getValue(),
              pair.getFirst(),
              pair.getSecond())))
      .output();
}
/**
 * Verifies that windowBy(Time) is reflected on the built operator.
 */
@Test
public void testBuild_Windowing() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  // The produced dataset is irrelevant here: building the operator registers
  // it with the flow, which is all this test inspects. The previously unused
  // local variable holding the result has been dropped.
  ReduceStateByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      .stateFactory(WordCountState::new)
      .mergeStatesBy(WordCountState::combine)
      .windowBy(Time.of(Duration.ofHours(1)))
      .output();

  ReduceStateByKey reduce = (ReduceStateByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}
/**
 * Counts occurrences of each word per one-second event-time window; event
 * time is taken from the pair's second field.
 */
@Override
protected Dataset<Pair<Word, Long>> getOutput(Dataset<Pair<Word, Long>> input) {
  Dataset<Pair<Word, Long>> timestamped =
      AssignEventTime.of(input).using(Pair::getSecond).output();
  return ReduceStateByKey
      .of(timestamped)
      .keyBy(Pair::getFirst)
      .valueBy(Pair::getFirst)
      // cast pins down the generic parameters of the state factory
      .stateFactory((StateFactory<Word, Long, CountState<Word>>) CountState::new)
      .mergeStatesBy(CountState::combine)
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
}
// Terminal builder step: delegates construction to DatasetBuilder6, passing
// null as the last argument — presumably the windowing, i.e. no explicit
// windowing is configured on this path (TODO confirm against DatasetBuilder6).
@Override public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) { return new DatasetBuilder6<>(name, input, keyExtractor, valueExtractor, stateFactory, stateMerger, null) .output(outputHints); } }
// Counts elements per key (leading digit of the pair's first field) in
// count-based windows of three elements; the StateFactory cast pins down the
// generic parameters of CountState::new. The inline FIXME notes that element
// time should eventually be assigned and the validation tightened.
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Pair<String, Integer>> input) { return ReduceStateByKey.of(input) .keyBy(e -> e.getFirst().charAt(0) - '0') .valueBy(Pair::getFirst) .stateFactory((StateFactory<String, Long, CountState<String>>) CountState::new) .mergeStatesBy(CountState::combine) // FIXME .timedBy(Pair::getSecond) and make the assertion in the validation phase stronger .windowBy(Count.of(3)) .output(); }
// Terminal builder step: forwards all captured builder state to
// DatasetBuilder6 with a null final argument — presumably the windowing,
// meaning none is set explicitly here (TODO confirm against DatasetBuilder6).
@Override public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) { return new DatasetBuilder6<>(name, input, keyExtractor, valueExtractor, stateFactory, stateMerger, null) .output(outputHints); } }
// Word count via raw ReduceStateByKey: event time comes from the pair's
// second field, each word is counted by a CountState, and results are
// produced per one-second time window.
@Override protected Dataset<Pair<Word, Long>> getOutput(Dataset<Pair<Word, Long>> input) { input = AssignEventTime.of(input).using(Pair::getSecond).output(); return ReduceStateByKey.of(input) .keyBy(Pair::getFirst) .valueBy(Pair::getFirst) .stateFactory((StateFactory<Word, Long, CountState<Word>>) CountState::new) .mergeStatesBy(CountState::combine) .windowBy(Time.of(Duration.ofSeconds(1))) .output(); }