/**
 * Builds a data stream whose elements are supplied by the given iterator.
 *
 * <p>The iterator is left untouched until the job actually executes, so the element type
 * cannot be derived from its contents. It therefore has to be passed explicitly as a class,
 * because the Java compiler erases generic type information.
 *
 * <p>The resulting source is non-parallel, i.e., it has a parallelism of one.
 *
 * @param data
 * 		The iterator providing the elements of the data stream
 * @param type
 * 		The class of the elements produced by the iterator. Must not be a generic class.
 * @param <OUT>
 * 		The element type of the returned data stream
 * @return The data stream representing the elements of the iterator
 * @see #fromCollection(java.util.Iterator, org.apache.flink.api.common.typeinfo.TypeInformation)
 */
public <OUT> DataStreamSource<OUT> fromCollection(Iterator<OUT> data, Class<OUT> type) {
    // Resolve the type description for the class and hand off to the overload that accepts it.
    return this.fromCollection(
        data,
        TypeExtractor.getForClass(type));
}
/**
 * Thin wrapper around {@link StreamExecutionEnvironment#fromCollection(java.util.Collection)}.
 *
 * <p>The input {@code Collection} is typed as {@code Object} because it holds Python elements;
 * their concrete types are determined at runtime by the Jython framework.
 *
 * @param collection The collection of Python elements to create the data stream from.
 * @return The data stream representing the given collection
 */
public PythonDataStream from_collection(Collection<Object> collection) {
    // Every element is passed through an AdapterMap before the stream is wrapped.
    return new PythonDataStream<>(
        env.fromCollection(collection)
            .map(new AdapterMap<>()));
}
/**
 * Creates a stream of {@code (String, Integer)} tuples read from a {@code GradeSource}
 * that is wrapped in a {@code ThrottledIterator}.
 *
 * @param env  the environment the source is created in
 * @param rate the rate passed to the {@code ThrottledIterator}
 *             (presumably elements per second; confirm against that class)
 * @return the stream backed by the throttled iterator
 */
public static DataStream<Tuple2<String, Integer>> getSource(StreamExecutionEnvironment env, long rate) {
    ThrottledIterator<Tuple2<String, Integer>> throttledGrades =
        new ThrottledIterator<>(new GradeSource(), rate);

    // The anonymous TypeHint subclass captures the generic tuple type that erasure would drop.
    TypeInformation<Tuple2<String, Integer>> gradeType =
        TypeInformation.of(new TypeHint<Tuple2<String, Integer>>(){});

    return env.fromCollection(throttledGrades, gradeType);
}
}
/**
 * Creates a stream of {@code (String, Integer)} tuples read from a {@code SalarySource}
 * that is wrapped in a {@code ThrottledIterator}.
 *
 * @param env  the environment the source is created in
 * @param rate the rate passed to the {@code ThrottledIterator}
 *             (presumably elements per second; confirm against that class)
 * @return the stream backed by the throttled iterator
 */
public static DataStream<Tuple2<String, Integer>> getSource(StreamExecutionEnvironment env, long rate) {
    ThrottledIterator<Tuple2<String, Integer>> throttledSalaries =
        new ThrottledIterator<>(new SalarySource(), rate);

    // The anonymous TypeHint subclass captures the generic tuple type that erasure would drop.
    TypeInformation<Tuple2<String, Integer>> salaryType =
        TypeInformation.of(new TypeHint<Tuple2<String, Integer>>(){});

    return env.fromCollection(throttledSalaries, salaryType);
}
}
/**
 * Creates a stream of three fixed {@code (Integer, Long, String)} tuples in random order.
 *
 * @param env the environment the stream is created in
 * @return a stream over the three tuples, shuffled before the stream is created
 */
public static DataStream<Tuple3<Integer, Long, String>> getSmall3TupleDataSet(StreamExecutionEnvironment env) {
    List<Tuple3<Integer, Long, String>> tuples = new ArrayList<>();

    tuples.add(new Tuple3<>(1, 1L, "Hi"));
    tuples.add(new Tuple3<>(2, 2L, "Hello"));
    tuples.add(new Tuple3<>(3, 2L, "Hello world"));

    // Randomize the order so consumers cannot rely on insertion order.
    Collections.shuffle(tuples);

    return env.fromCollection(tuples);
}
public static void main(String[] args) throws Exception { // set up execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); StreamTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env); DataStream<Order> orderA = env.fromCollection(Arrays.asList( new Order(1L, "beer", 3), new Order(1L, "diaper", 4), new Order(3L, "rubber", 2))); DataStream<Order> orderB = env.fromCollection(Arrays.asList( new Order(2L, "pen", 3), new Order(2L, "rubber", 3), new Order(4L, "beer", 1))); // convert DataStream to Table Table tableA = tEnv.fromDataStream(orderA, "user, product, amount"); // register DataStream as Table tEnv.registerDataStream("OrderB", orderB, "user, product, amount"); // union the two tables Table result = tEnv.sqlQuery("SELECT * FROM " + tableA + " WHERE amount > 2 UNION ALL " + "SELECT * FROM OrderB WHERE amount < 2"); tEnv.toAppendStream(result, Order.class).print(); env.execute(); }
/**
 * Creates a stream of fifteen fixed {@code (Integer, Long, Integer, String, Long)} tuples.
 *
 * <p>The tuples are added in the order listed below; note that the last two rows
 * ({@code 15L} and {@code 14L}) are intentionally kept in that order.
 *
 * @param env the environment the stream is created in
 * @return a stream over the fifteen tuples
 */
public static DataStream<Tuple5<Integer, Long, Integer, String, Long>> get5TupleDataStream(StreamExecutionEnvironment env) {
    List<Tuple5<Integer, Long, Integer, String, Long>> rows = new ArrayList<>();

    rows.add(new Tuple5<>(1, 1L, 0, "Hallo", 1L));

    rows.add(new Tuple5<>(2, 2L, 1, "Hallo Welt", 2L));
    rows.add(new Tuple5<>(2, 3L, 2, "Hallo Welt wie", 1L));

    rows.add(new Tuple5<>(3, 4L, 3, "Hallo Welt wie gehts?", 2L));
    rows.add(new Tuple5<>(3, 5L, 4, "ABC", 2L));
    rows.add(new Tuple5<>(3, 6L, 5, "BCD", 3L));

    rows.add(new Tuple5<>(4, 7L, 6, "CDE", 2L));
    rows.add(new Tuple5<>(4, 8L, 7, "DEF", 1L));
    rows.add(new Tuple5<>(4, 9L, 8, "EFG", 1L));
    rows.add(new Tuple5<>(4, 10L, 9, "FGH", 2L));

    rows.add(new Tuple5<>(5, 11L, 10, "GHI", 1L));
    rows.add(new Tuple5<>(5, 12L, 11, "HIJ", 3L));
    rows.add(new Tuple5<>(5, 13L, 12, "IJK", 3L));
    rows.add(new Tuple5<>(5, 15L, 14, "KLM", 2L));
    rows.add(new Tuple5<>(5, 14L, 13, "JKL", 2L));

    return env.fromCollection(rows);
}
}
/**
 * Expects an {@code InvalidFieldReferenceException} while the pipeline is being built:
 * the stream is keyed by existing fields and then summed on {@code stats.nonExistingField}.
 * The job is never executed.
 */
@Test(expected = CompositeType.InvalidFieldReferenceException.class)
public void testFailOnNestedPojoFieldAccessor() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    DataStream<Data> input = env.fromCollection(elements);

    input
        .keyBy("aaa", "stats.count")
        .sum("stats.nonExistingField");
}
/**
 * Tests ProcessFunction side outputs when the consumer asks for a different {@code OutputTag}
 * than the one the function emits to: the function writes to {@code "other-side"}, the sink
 * is attached to {@code "side"}, and the sink is expected to stay empty.
 */
@Test
public void testProcessFunctionSideOutputWithWrongTag() throws Exception {
    final OutputTag<String> sideOutputTag1 = new OutputTag<String>("side"){};
    final OutputTag<String> sideOutputTag2 = new OutputTag<String>("other-side"){};

    TestListResultSink<String> sideSink = new TestListResultSink<>();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataStream<Integer> input = env.fromCollection(elements);

    input
        .process(new ProcessFunction<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(
                    Integer value, Context ctx, Collector<Integer> out) throws Exception {
                out.collect(value);
                // Emits to tag 2, while the sink below listens on tag 1.
                ctx.output(sideOutputTag2, "sideout-" + value);
            }
        })
        .getSideOutput(sideOutputTag1)
        .addSink(sideSink);

    env.execute();

    assertEquals(Arrays.asList(), sideSink.getSortedResult());
}
/**
 * Tests a ProcessFunction that forwards every element to the main output and additionally
 * emits {@code "sideout-<value>"} to a side output; both outputs are collected and compared.
 */
@Test
public void testProcessFunctionSideOutput() throws Exception {
    final OutputTag<String> sideOutputTag = new OutputTag<String>("side"){};

    TestListResultSink<String> sideSink = new TestListResultSink<>();
    TestListResultSink<Integer> mainSink = new TestListResultSink<>();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataStream<Integer> input = env.fromCollection(elements);

    SingleOutputStreamOperator<Integer> passThroughStream = input
        .process(new ProcessFunction<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(
                    Integer value, Context ctx, Collector<Integer> out) throws Exception {
                out.collect(value);
                ctx.output(sideOutputTag, "sideout-" + value);
            }
        });

    passThroughStream.getSideOutput(sideOutputTag).addSink(sideSink);
    passThroughStream.addSink(mainSink);

    env.execute();

    assertEquals(
        Arrays.asList("sideout-1", "sideout-2", "sideout-3", "sideout-4", "sideout-5"),
        sideSink.getSortedResult());
    assertEquals(
        Arrays.asList(1, 2, 3, 4, 5),
        mainSink.getSortedResult());
}
/**
 * Builds a single-parallelism streaming job (collection source, {@code NotifyingMapper},
 * {@code DummyOutputFormat} sink with chaining disabled) and hands its job graph to
 * {@code submitJobAndVerifyResults}.
 */
@Test
public void testStreaming() throws Exception {
    StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    streamEnv.setParallelism(1);

    DataStream<Integer> numbers = streamEnv.fromCollection(inputData);

    numbers
        .flatMap(new NotifyingMapper())
        .writeUsingOutputFormat(new DummyOutputFormat())
        .disableChaining();

    JobGraph graph = streamEnv.getStreamGraph().getJobGraph();
    submitJobAndVerifyResults(graph);
}
/**
 * Attaches two sinks to the same side output (plus one sink to the main output) and
 * expects each side-output sink to receive the complete set of side-output records.
 */
@Test
public void testSideOutputWithMultipleConsumers() throws Exception {
    final OutputTag<String> sideOutputTag = new OutputTag<String>("side"){};

    TestListResultSink<String> firstSideSink = new TestListResultSink<>();
    TestListResultSink<String> secondSideSink = new TestListResultSink<>();
    TestListResultSink<Integer> mainSink = new TestListResultSink<>();

    StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    streamEnv.setParallelism(3);

    DataStream<Integer> input = streamEnv.fromCollection(elements);

    SingleOutputStreamOperator<Integer> passThroughStream = input
        .process(new ProcessFunction<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(
                    Integer value, Context ctx, Collector<Integer> out) throws Exception {
                out.collect(value);
                ctx.output(sideOutputTag, "sideout-" + value);
            }
        });

    // Two independent consumers of the very same side output.
    passThroughStream.getSideOutput(sideOutputTag).addSink(firstSideSink);
    passThroughStream.getSideOutput(sideOutputTag).addSink(secondSideSink);
    passThroughStream.addSink(mainSink);

    streamEnv.execute();

    assertEquals(
        Arrays.asList("sideout-1", "sideout-2", "sideout-3", "sideout-4", "sideout-5"),
        firstSideSink.getSortedResult());
    assertEquals(
        Arrays.asList("sideout-1", "sideout-2", "sideout-3", "sideout-4", "sideout-5"),
        secondSideSink.getSortedResult());
    assertEquals(
        Arrays.asList(1, 2, 3, 4, 5),
        mainSink.getSortedResult());
}
@SuppressWarnings("rawtypes") @Test public void testSimpleIteration() throws Exception { int numRetries = 5; int timeoutScale = 1; for (int numRetry = 0; numRetry < numRetries; numRetry++) { try { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); iterated = new boolean[parallelism]; DataStream<Boolean> source = env.fromCollection(Collections.nCopies(parallelism * 2, false)) .map(noOpBoolMap).name("ParallelizeMap"); IterativeStream<Boolean> iteration = source.iterate(3000 * timeoutScale); DataStream<Boolean> increment = iteration.flatMap(new IterationHead()).map(noOpBoolMap); iteration.map(noOpBoolMap).addSink(new ReceiveCheckNoOpSink()); iteration.closeWith(increment).addSink(new ReceiveCheckNoOpSink()); env.execute(); for (boolean iter : iterated) { assertTrue(iter); } break; // success } catch (Throwable t) { LOG.info("Run " + (numRetry + 1) + "/" + numRetries + " failed", t); if (numRetry >= numRetries - 1) { throw t; } else { timeoutScale *= 2; } } } }
/**
 * Emits to two side outputs of different element types ({@code String} and {@code Integer})
 * from one ProcessFunction, with object reuse enabled, and verifies all three outputs.
 */
@Test
public void testDifferentSideOutputTypes() throws Exception {
    final OutputTag<String> sideOutputTag1 = new OutputTag<String>("string"){};
    final OutputTag<Integer> sideOutputTag2 = new OutputTag<Integer>("int"){};

    TestListResultSink<String> stringSideSink = new TestListResultSink<>();
    TestListResultSink<Integer> intSideSink = new TestListResultSink<>();
    TestListResultSink<Integer> mainSink = new TestListResultSink<>();

    StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    streamEnv.getConfig().enableObjectReuse();
    streamEnv.setParallelism(3);

    DataStream<Integer> input = streamEnv.fromCollection(elements);

    SingleOutputStreamOperator<Integer> passThroughStream = input
        .process(new ProcessFunction<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(
                    Integer value, Context ctx, Collector<Integer> out) throws Exception {
                out.collect(value);
                ctx.output(sideOutputTag1, "sideout-" + value);
                ctx.output(sideOutputTag2, 13);
            }
        });

    passThroughStream.getSideOutput(sideOutputTag1).addSink(stringSideSink);
    passThroughStream.getSideOutput(sideOutputTag2).addSink(intSideSink);
    passThroughStream.addSink(mainSink);

    streamEnv.execute();

    assertEquals(
        Arrays.asList("sideout-1", "sideout-2", "sideout-3", "sideout-4", "sideout-5"),
        stringSideSink.getSortedResult());
    assertEquals(
        Arrays.asList(13, 13, 13, 13, 13),
        intSideSink.getSortedResult());
    assertEquals(
        Arrays.asList(1, 2, 3, 4, 5),
        mainSink.getSortedResult());
}
/**
 * Checks which source function backs the streams created by {@code addSource},
 * {@code generateSequence}, {@code fromElements} and {@code fromCollection}.
 */
@Test
public void testSources() {
    StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();

    // A do-nothing source; only its identity matters for the first assertion.
    SourceFunction<Integer> srcFun = new SourceFunction<Integer>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void run(SourceContext<Integer> ctx) throws Exception {
        }

        @Override
        public void cancel() {
        }
    };

    // addSource: the very same function instance must be used.
    DataStreamSource<Integer> customSource = streamEnv.addSource(srcFun);
    customSource.addSink(new DiscardingSink<Integer>());
    assertEquals(srcFun, getFunctionFromDataSource(customSource));

    // generateSequence
    DataStreamSource<Long> sequenceSource = streamEnv.generateSequence(0, 2);
    assertTrue(getFunctionFromDataSource(sequenceSource) instanceof StatefulSequenceSource);

    // fromElements
    DataStreamSource<Long> elementsSource = streamEnv.fromElements(0L, 1L, 2L);
    assertTrue(getFunctionFromDataSource(elementsSource) instanceof FromElementsFunction);

    // fromCollection
    List<Long> list = Arrays.asList(0L, 1L, 2L);
    DataStreamSource<Long> collectionSource = streamEnv.fromCollection(list);
    assertTrue(getFunctionFromDataSource(collectionSource) instanceof FromElementsFunction);
}
/**
 * Keys the stream by {@code aaa}, sums the nested field {@code stats.count} and checks the
 * running sums: the first record seen for an {@code aaa} value must carry a count of
 * {@code 123}, every later one a count of {@code 2 * 123}.
 */
@Test
public void testNestedPojoFieldAccessor() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableObjectReuse();
    env.setParallelism(4);

    DataStream<Data> input = env.fromCollection(elements);

    DataStream<Data> summedStream = input
        .keyBy("aaa")
        .sum("stats.count")
        .keyBy("aaa")
        .flatMap(new FlatMapFunction<Data, Data>() {
            // First record observed per `aaa` value, indexed by that value.
            Data[] first = new Data[3];

            @Override
            public void flatMap(Data value, Collector<Data> out) throws Exception {
                if (first[value.aaa] != null) {
                    // A record for this index was already seen.
                    if (value.stats.count != 2 * 123) {
                        throw new RuntimeException("Expected stats.count to be 2 * 123");
                    }
                    return;
                }

                // First record for this index: remember it and check the initial sum.
                first[value.aaa] = value;
                if (value.stats.count != 123) {
                    throw new RuntimeException("Expected stats.count to be 123");
                }
            }
        });

    summedStream.print();

    env.execute();
}
see.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); DataStream<Integer> dataStream = see.fromCollection(elements);
/**
 * Uses two output tags that share the id {@code "side"} but differ in element type and
 * expects an {@code UnsupportedOperationException} when the second side output is requested.
 * The job itself is never executed.
 */
@Test
public void testSideOutputNameClash() throws Exception {
    final OutputTag<String> sideOutputTag1 = new OutputTag<String>("side"){};
    final OutputTag<Integer> sideOutputTag2 = new OutputTag<Integer>("side"){};

    TestListResultSink<String> stringSideSink = new TestListResultSink<>();
    TestListResultSink<Integer> intSideSink = new TestListResultSink<>();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataStream<Integer> input = env.fromCollection(elements);

    SingleOutputStreamOperator<Integer> passThroughStream = input
        .process(new ProcessFunction<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(
                    Integer value, Context ctx, Collector<Integer> out) throws Exception {
                out.collect(value);
                ctx.output(sideOutputTag1, "sideout-" + value);
                ctx.output(sideOutputTag2, 13);
            }
        });

    passThroughStream.getSideOutput(sideOutputTag1).addSink(stringSideSink);

    // The expectation is armed only now, so the statements above are not covered by it.
    expectedException.expect(UnsupportedOperationException.class);
    passThroughStream.getSideOutput(sideOutputTag2).addSink(intSideSink);
}
/**
 * Tests side outputs emitted from a {@code ProcessWindowFunction} on an event-time keyed
 * window. The function emits the window key to the main output and {@code "sideout-<key>"}
 * to the side output; the expected results are the keys 1, 2 and 5 on both outputs.
 */
@Test
public void testProcessdWindowFunctionSideOutput() throws Exception {
    TestListResultSink<Integer> mainSink = new TestListResultSink<>();
    TestListResultSink<String> sideSink = new TestListResultSink<>();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    DataStream<Integer> input = env.fromCollection(elements);

    OutputTag<String> sideOutputTag = new OutputTag<String>("side"){};

    SingleOutputStreamOperator<Integer> windowOperator = input
        .assignTimestampsAndWatermarks(new TestWatermarkAssigner())
        .keyBy(new TestKeySelector())
        .timeWindow(Time.milliseconds(1), Time.milliseconds(1))
        .process(new ProcessWindowFunction<Integer, Integer, Integer, TimeWindow>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void process(
                    Integer key,
                    Context context,
                    Iterable<Integer> windowElements,
                    Collector<Integer> out) throws Exception {
                // Only the key is emitted; the window contents are not inspected.
                out.collect(key);
                context.output(sideOutputTag, "sideout-" + key);
            }
        });

    windowOperator.getSideOutput(sideOutputTag).addSink(sideSink);
    windowOperator.addSink(mainSink);

    env.execute();

    assertEquals(
        Arrays.asList("sideout-1", "sideout-2", "sideout-5"),
        sideSink.getSortedResult());
    assertEquals(
        Arrays.asList(1, 2, 5),
        mainSink.getSortedResult());
}
@Test @SuppressWarnings("unchecked") public void testFromCollectionParallelism() { try { TypeInformation<Integer> typeInfo = BasicTypeInfo.INT_TYPE_INFO; StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); DataStreamSource<Integer> dataStream1 = env.fromCollection(new DummySplittableIterator<Integer>(), typeInfo); try { dataStream1.setParallelism(4); fail("should throw an exception"); } catch (IllegalArgumentException e) { // expected } dataStream1.addSink(new DiscardingSink<Integer>()); DataStreamSource<Integer> dataStream2 = env.fromParallelCollection(new DummySplittableIterator<Integer>(), typeInfo).setParallelism(4); dataStream2.addSink(new DiscardingSink<Integer>()); env.getExecutionPlan(); assertEquals("Parallelism of collection source must be 1.", 1, env.getStreamGraph().getStreamNode(dataStream1.getId()).getParallelism()); assertEquals("Parallelism of parallel collection source must be 4.", 4, env.getStreamGraph().getStreamNode(dataStream2.getId()).getParallelism()); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } }