public <S> Dataset<S> flatMap(UnaryFunctor<T, S> f) {
  return new Dataset<>(FlatMap.of(this.wrap)
      .using(requireNonNull(f))
      .output());
}
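// A minimal usage sketch of the flatMap wrapper above; the `lines` dataset is
// hypothetical. Each input element may emit zero or more outputs through the
// Collector: here, every line is split into its words.
Dataset<String> words = lines.flatMap((String line, Collector<String> c) -> {
  for (String w : line.split("\\s+")) {
    c.collect(w);
  }
});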
@Override
protected Dataset<Integer> getOutput(Dataset<Integer> input) {
  // expand each element e into the sequence 1, 2, ..., e
  return FlatMap.of(input)
      .using((Integer e, Collector<Integer> c) -> {
        for (int i = 1; i <= e; i++) {
          c.collect(i);
        }
      })
      .output();
}
private void run() {
  Flow flow = Flow.create();
  // read (key, row) pairs from HBase and persist the raw cell bytes to the sink
  Dataset<Pair<ImmutableBytesWritable, Result>> ds = flow.createInput(
      Utils.getHBaseSource(input, conf.get()));
  FlatMap.of(ds)
      .using((Pair<ImmutableBytesWritable, Result> p, Collector<byte[]> c) -> {
        writeCellsAsBytes(p.getSecond(), c);
      })
      .output()
      .persist(Utils.getSink(output, conf.get()));
  LOG.info("Starting flow reading from {} and persisting to {}", input, output);
  executor.submit(flow).join();
}
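// A hypothetical sketch of the writeCellsAsBytes helper referenced above (its
// real definition lives elsewhere): assuming the standard HBase client API
// (Result, Cell, CellUtil), it could emit the raw value bytes of every Cell.
static void writeCellsAsBytes(Result result, Collector<byte[]> c) {
  List<Cell> cells = result.listCells();
  if (cells != null) {
    for (Cell cell : cells) {
      c.collect(CellUtil.cloneValue(cell));
    }
  }
}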
// pairs every element with the window it was assigned to, failing fast when a
// window of an unexpected type is encountered
static <W, T> Dataset<Pair<W, T>> extractWindowsToPair(
    Dataset<T> input, Class<W> expectedWindowType) {
  return FlatMap.of(input)
      .using((UnaryFunctor<T, Pair<W, T>>) (elem, context) -> {
        Object actualWindow = context.getWindow();
        if (actualWindow != null
            && !expectedWindowType.isAssignableFrom(actualWindow.getClass())) {
          throw new IllegalStateException(
              "Encountered window of type " + actualWindow.getClass()
                  + " but expected only " + expectedWindowType);
        }
        @SuppressWarnings("unchecked")
        Pair<W, T> out = Pair.of((W) actualWindow, elem);
        context.collect(out);
      })
      .output();
}
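// How a test might use extractWindowsToPair (the `windowed` dataset is
// illustrative): each element comes back paired with the TimeInterval window
// it was assigned to, ready for assertions.
Dataset<Pair<TimeInterval, String>> tagged =
    extractWindowsToPair(windowed, TimeInterval.class);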
@Override
protected Dataset<Integer> getOutput(Dataset<Integer> input) {
  // track the element count and running sum in counters while squaring each element
  return FlatMap.of(input)
      .using((UnaryFunctor<Integer, Integer>) (elem, collector) -> {
        collector.getCounter("input").increment();
        collector.getCounter("sum").increment(elem);
        collector.collect(elem * elem);
      })
      .output();
}
@Override
protected Dataset<Triple<TimeInterval, Integer, Set<String>>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(Pair::getSecond).output();
  Dataset<Pair<Integer, Set<String>>> reduced = ReduceByKey.of(input)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .reduceBy(s -> s.collect(Collectors.toSet()))
      .windowBy(Session.of(Duration.ofMillis(5)))
      .output();
  return FlatMap.of(reduced)
      .using((UnaryFunctor<Pair<Integer, Set<String>>,
              Triple<TimeInterval, Integer, Set<String>>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(),
              elem.getFirst(),
              elem.getSecond())))
      .output();
}
// same idea as extractWindowsToPair, but for pair-valued datasets: emits
// (window, first, second) triples
static <W, F, S> Dataset<Triple<W, F, S>> extractWindows(
    Dataset<Pair<F, S>> input, Class<W> expectedWindowType) {
  return FlatMap.of(input)
      .using((UnaryFunctor<Pair<F, S>, Triple<W, F, S>>) (elem, context) -> {
        Object actualWindow = context.getWindow();
        if (actualWindow != null
            && !expectedWindowType.isAssignableFrom(actualWindow.getClass())) {
          throw new IllegalStateException(
              "Encountered window of type " + actualWindow.getClass()
                  + " but expected only " + expectedWindowType);
        }
        @SuppressWarnings("unchecked")
        Triple<W, F, S> out =
            Triple.of((W) actualWindow, elem.getFirst(), elem.getSecond());
        context.collect(out);
      })
      .output();
}
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(Pair::getSecond).output();
  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey.of(input)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(e -> e.getFirst().substring(2))
      .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(TimeSliding.of(Duration.ofMillis(10), Duration.ofMillis(5)))
      .output();
  return FlatMap.of(reduced)
      .using((UnaryFunctor<Pair<Integer, String>,
              Triple<TimeInterval, Integer, String>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(),
              elem.getFirst(),
              elem.getSecond())))
      .output();
}
@Test
public void testBuild_ImplicitName() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 1);
  Dataset<String> mapped = FlatMap.of(dataset)
      .using((String s, Collector<String> c) -> c.collect(s))
      .output();
  FlatMap map = (FlatMap) flow.operators().iterator().next();
  assertEquals("FlatMap", map.getName());
}
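// A natural counterpart to the test above, assuming the builder exposes the
// named(...) entry point available on Euphoria operators; the test name and
// operator name below are illustrative.
@Test
public void testBuild_ExplicitName() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 1);
  FlatMap.named("FlatMap1")
      .of(dataset)
      .using((String s, Collector<String> c) -> c.collect(s))
      .output();
  FlatMap map = (FlatMap) flow.operators().iterator().next();
  assertEquals("FlatMap1", map.getName());
}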
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(Pair::getSecond).output();
  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey.of(input)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .stateFactory(AccState<String>::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(Time.of(Duration.ofMillis(5)))
      .output();
  return FlatMap.of(reduced)
      .using((UnaryFunctor<Pair<Integer, String>,
              Triple<TimeInterval, Integer, String>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(),
              elem.getFirst(),
              elem.getSecond())))
      .output();
}
@Override
protected Dataset<Triple<TimeInterval, Integer, String>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  input = AssignEventTime.of(input).using(Pair::getSecond).output();
  Dataset<Pair<Integer, String>> reduced = ReduceStateByKey.of(input)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .stateFactory((StateFactory<String, String, AccState<String>>) AccState::new)
      .mergeStatesBy(AccState::combine)
      .windowBy(Session.of(Duration.ofMillis(5)))
      .output();
  return FlatMap.of(reduced)
      .using((UnaryFunctor<Pair<Integer, String>,
              Triple<TimeInterval, Integer, String>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(),
              elem.getFirst(),
              elem.getSecond())))
      .output();
}
@Override
protected Dataset<Triple<Integer, Integer, Integer>> getOutput(Dataset<Integer> input) {
  Dataset<Pair<Integer, Integer>> output = ReduceStateByKey.of(input)
      .keyBy(e -> e % 3)
      .valueBy(e -> e)
      .stateFactory(SortState::new)
      .mergeStatesBy(SortState::combine)
      .windowBy(new ReduceByKeyTest.TestWindowing())
      .output();
  return FlatMap.of(output)
      .using((UnaryFunctor<Pair<Integer, Integer>,
              Triple<Integer, Integer, Integer>>) (elem, c) ->
          c.collect(Triple.of(
              ((IntWindow) c.getWindow()).getValue(),
              elem.getFirst(),
              elem.getSecond())))
      .output();
}
@Test
public void testDistinctOnBatchWithoutWindowingLabels() throws Exception {
  Flow flow = Flow.create("Test");
  Dataset<String> lines = flow.createInput(ListDataSource.bounded(
      asList("one two three four", "one two three", "one two", "one")));
  // expand it to words
  Dataset<String> words = FlatMap.of(lines)
      .using(toWords(w -> w))
      .output();
  Dataset<String> output = Distinct.of(words).output();
  ListDataSink<String> out = ListDataSink.get();
  output.persist(out);
  executor.submit(flow).get();
  DatasetAssert.unorderedEquals(
      out.getOutputs(),
      "four", "one", "three", "two");
}
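// A possible shape for the toWords helper used above (hypothetical; the real
// definition lives in the test utilities): split each line on whitespace and
// pass every word through the supplied mapping function.
static UnaryFunctor<String, String> toWords(UnaryFunction<String, String> mapper) {
  return (line, c) -> {
    for (String word : line.split("\\s+")) {
      c.collect(mapper.apply(word));
    }
  };
}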