public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (filePath == null) {
		throw new IllegalArgumentException("The file path must not be null.");
	}

	inputFormat.setFilePath(new Path(filePath));
	try {
		return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
	}
	catch (Exception e) {
		// Chain the original exception so the root cause of the failed type extraction is not lost.
		throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
			"Please specify the TypeInformation of the produced type explicitly by using the " +
			"'createInput(InputFormat, TypeInformation)' method instead.", e);
	}
}
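A minimal usage sketch for readFile, assuming a standard Flink setup; the TextInputFormat and the file path here are illustrative, not taken from the method above:

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	// TypeExtractor.getInputFormatTypes derives String as the produced type of a TextInputFormat.
	DataSource<String> lines = env.readFile(
		new TextInputFormat(new Path("file:///tmp/words.txt")), "file:///tmp/words.txt");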
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	return execEnv.createInput(new HBaseRowInputFormat(conf, tableName, hBaseSchema), getReturnType()).name(explainSource());
}
@SuppressWarnings("unchecked") private <T extends Tuple> void createCsvSource(ExecutionEnvironment env, PythonOperationInfo info) { if (!(info.types instanceof TupleTypeInfo)) { throw new RuntimeException("The output type of a csv source has to be a tuple. The derived type is " + info); } Path path = new Path(info.path); String lineD = info.lineDelimiter; String fieldD = info.fieldDelimiter; TupleTypeInfo<T> types = (TupleTypeInfo<T>) info.types; sets.add(info.setID, env.createInput(new TupleCsvInputFormat<>(path, lineD, fieldD, types), types).setParallelism(info.parallelism).name("CsvSource") .map(new SerializerMap<T>()).setParallelism(info.parallelism).name("CsvSourcePostStep")); }
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	OrcRowInputFormat orcIF = buildOrcInputFormat();
	orcIF.setNestedFileEnumeration(recursiveEnumeration);
	if (selectedFields != null) {
		orcIF.selectFields(selectedFields);
	}
	if (predicates != null) {
		for (OrcRowInputFormat.Predicate pred : predicates) {
			orcIF.addPredicate(pred);
		}
	}
	return execEnv.createInput(orcIF).name(explainSource());
}
@Override
protected void testProgram() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	TestNonRichOutputFormat output = new TestNonRichOutputFormat();
	env.createInput(new TestNonRichInputFormat()).output(output);
	try {
		env.execute();
	} catch (Exception e) {
		// We didn't break anything by making everything rich.
		e.printStackTrace();
		fail(e.getMessage());
	}
}
@Override
protected void testProgram() throws Exception {
	// Test verifying the number of records read and written vs the accumulator counts.
	readCalls = new ConcurrentLinkedQueue<Integer>();
	writeCalls = new ConcurrentLinkedQueue<Integer>();
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.createInput(new TestInputFormat(new Path(inputPath))).output(new TestOutputFormat());

	JobExecutionResult result = env.execute();

	Object a = result.getAllAccumulatorResults().get("DATA_SOURCE_ACCUMULATOR");
	Object b = result.getAllAccumulatorResults().get("DATA_SINK_ACCUMULATOR");
	long recordsRead = (Long) a;
	long recordsWritten = (Long) b;
	assertEquals(recordsRead, readCalls.size());
	assertEquals(recordsWritten, writeCalls.size());
}
private void executeTask(JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner, boolean slow, int parallelism) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
	DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(joiner)
		.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());
	env.setParallelism(parallelism);
	runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
private void executeTaskWithGenerator(
		JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner,
		int keys,
		int vals,
		int msecsTillCanceling,
		int maxTimeTillCanceled) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));
	DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));
	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(joiner)
		.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());
	env.setParallelism(PARALLELISM);
	runAndCancelJob(env.createProgramPlan(), msecsTillCanceling, maxTimeTillCanceled);
}
@Test
public void testCancelSortMatchWhileDoingHeavySorting() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	HeavyCompareGeneratorInputFormat input = new HeavyCompareGeneratorInputFormat(100);
	DataSet<Tuple2<HeavyCompare, Integer>> input1 = env.createInput(input);
	DataSet<Tuple2<HeavyCompare, Integer>> input2 = env.createInput(input);

	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(new JoinFunction<Tuple2<HeavyCompare, Integer>, Tuple2<HeavyCompare, Integer>, Tuple2<HeavyCompare, Integer>>() {
			@Override
			public Tuple2<HeavyCompare, Integer> join(
					Tuple2<HeavyCompare, Integer> first,
					Tuple2<HeavyCompare, Integer> second) throws Exception {
				throw new Exception("Job should be canceled in sort-merge phase, never run here ...");
			}
		})
		.output(new DiscardingOutputFormat<Tuple2<HeavyCompare, Integer>>());

	runAndCancelJob(env.createProgramPlan(), 30 * 1000, 60 * 1000);
}
@Test
public void testWithKryoGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceKryo();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
@Test
public void testWithAvroGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceAvro();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
public void executeTask(MapFunction<Integer, Integer> mapper) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env
		.createInput(new InfiniteIntegerInputFormat(false))
		.map(mapper)
		.output(new DiscardingOutputFormat<Integer>());
	env.setParallelism(PARALLELISM);
	runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
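A hedged caller sketch for executeTask, using an anonymous identity MapFunction; the inline function is illustrative, any mapper with this signature would do:

	executeTask(new MapFunction<Integer, Integer>() {
		@Override
		public Integer map(Integer value) {
			// Identity mapping: the job is expected to be canceled, not to finish.
			return value;
		}
	});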
@Test
public void testKeySelection() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableObjectReuse();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy("name")
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Alyssa,1)\n(Charlie,1)\n";
}
@Test
public void testProgramWithAutoParallelism() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(ExecutionConfig.PARALLELISM_AUTO_MAX);
	env.getConfig().disableSysoutLogging();

	DataSet<Integer> result = env
		.createInput(new ParallelismDependentInputFormat())
		.rebalance()
		.mapPartition(new ParallelismDependentMapPartition());

	List<Integer> resultCollection = new ArrayList<>();
	result.output(new LocalCollectionOutputFormat<>(resultCollection));

	try {
		env.execute();
		assertEquals(PARALLELISM, resultCollection.size());
	} catch (Exception ex) {
		assertTrue(
			ExceptionUtils.findThrowableWithMessage(ex, ExecutionGraphBuilder.PARALLELISM_AUTO_MAX_ERROR_MESSAGE).isPresent());
	}
}
@Test
public void testTypeExtraction() {
	try {
		InputFormat<MyAvroType, ?> format = new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);

		TypeInformation<?> typeInfoDirect = TypeExtractor.getInputFormatTypes(format);

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<MyAvroType> input = env.createInput(format);
		TypeInformation<?> typeInfoDataSet = input.getType();

		Assert.assertTrue(typeInfoDirect instanceof PojoTypeInfo);
		Assert.assertTrue(typeInfoDataSet instanceof PojoTypeInfo);

		Assert.assertEquals(MyAvroType.class, typeInfoDirect.getTypeClass());
		Assert.assertEquals(MyAvroType.class, typeInfoDataSet.getTypeClass());
	} catch (Exception e) {
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
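Where extraction cannot succeed, the TypeInformation can be supplied explicitly instead. A sketch under the same MyAvroType assumptions as the test above; TypeExtractor.getForClass builds the type information directly from the class:

	AvroInputFormat<MyAvroType> format =
		new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);
	// Pass the TypeInformation explicitly rather than relying on extraction from the format.
	DataSet<MyAvroType> input = env.createInput(format, TypeExtractor.getForClass(MyAvroType.class));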
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Integer, String>> ds = env.createInput(new LargeJoinDataGeneratorInputFormat(1000000));
	ds.join(ds).where(0).equalTo(1).with(new Joiner()).writeAsText(resultPath);
	env.execute("Local Selfjoin Test Job");
}
@Override
protected void testProgram() throws Exception {
	/*
	 * Test passing a configuration object to an input format.
	 */
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	Configuration ifConf = new Configuration();
	ifConf.setString("prepend", "test");

	DataSet<String> ds = env.createInput(new TestInputFormat(new Path(inputPath))).withParameters(ifConf);
	List<String> result = ds.collect();

	String expectedResult = "ab\n" +
		"cd\n" +
		"ef\n";

	compareResultAsText(result, expectedResult);
}
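The counterpart of this test lives in the input format: parameters passed via withParameters(...) are handed to the format's configure(...) call. A hedged sketch of such a format; PrependingInputFormat and its prefix field are hypothetical stand-ins for the TestInputFormat used above:

	public class PrependingInputFormat extends TextInputFormat {
		private String prefix;

		public PrependingInputFormat(Path path) {
			super(path);
		}

		@Override
		public void configure(Configuration parameters) {
			super.configure(parameters);
			// The value set via ifConf.setString("prepend", "test") is visible here.
			this.prefix = parameters.getString("prepend", "");
		}
	}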
@Test
public void testReplicatedSourceToJoin() throws Exception {
	/*
	 * Test replicated source going into join.
	 */
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple1<Long>> source1 = env.createInput(new ReplicatingInputFormat<Long, GenericInputSplit>(
			new ParallelIteratorInputFormat<Long>(new NumberSequenceIterator(0L, 1000L))), BasicTypeInfo.LONG_TYPE_INFO)
		.map(new ToTuple());
	DataSet<Tuple1<Long>> source2 = env.generateSequence(0L, 1000L).map(new ToTuple());

	DataSet<Tuple> pairs = source1.join(source2).where(0).equalTo(0)
		.projectFirst(0)
		.sum(0);

	List<Tuple> result = pairs.collect();

	String expectedResult = "(500500)";

	compareResultAsText(result, expectedResult);
}
/**
 * Tests that the compiler fails for a join program with a replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {
	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
		new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
		.join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM + 2)
		.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}
/**
 * Tests that the compiler fails for a join program with a replicated data source behind a map and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindMapChangingparallelism() {
	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
		new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
		.map(new IdMap()).setParallelism(DEFAULT_PARALLELISM + 1)
		.join(source2).where("*").equalTo("*")
		.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}