public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (filePath == null) {
		throw new IllegalArgumentException("The file path must not be null.");
	}

	inputFormat.setFilePath(new Path(filePath));
	try {
		return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
	}
	catch (Exception e) {
		// Chain the original exception so the root cause of the failed type extraction is not lost.
		throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
			"Please specify the TypeInformation of the produced type explicitly by using the " +
			"'createInput(InputFormat, TypeInformation)' method instead.", e);
	}
}
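A minimal usage sketch for readFile, assuming a standard Flink setup; the TextInputFormat and the file path here are illustrative, not taken from the method above:

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	// TypeExtractor.getInputFormatTypes derives String as the produced type of a TextInputFormat.
	DataSource<String> lines = env.readFile(
		new TextInputFormat(new Path("file:///tmp/words.txt")), "file:///tmp/words.txt");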
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	return execEnv.createInput(new HBaseRowInputFormat(conf, tableName, hBaseSchema), getReturnType()).name(explainSource());
}
@SuppressWarnings("unchecked") private <T extends Tuple> void createCsvSource(ExecutionEnvironment env, PythonOperationInfo info) { if (!(info.types instanceof TupleTypeInfo)) { throw new RuntimeException("The output type of a csv source has to be a tuple. The derived type is " + info); } Path path = new Path(info.path); String lineD = info.lineDelimiter; String fieldD = info.fieldDelimiter; TupleTypeInfo<T> types = (TupleTypeInfo<T>) info.types; sets.add(info.setID, env.createInput(new TupleCsvInputFormat<>(path, lineD, fieldD, types), types).setParallelism(info.parallelism).name("CsvSource") .map(new SerializerMap<T>()).setParallelism(info.parallelism).name("CsvSourcePostStep")); }
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	OrcRowInputFormat orcIF = buildOrcInputFormat();
	orcIF.setNestedFileEnumeration(recursiveEnumeration);
	if (selectedFields != null) {
		orcIF.selectFields(selectedFields);
	}
	if (predicates != null) {
		for (OrcRowInputFormat.Predicate pred : predicates) {
			orcIF.addPredicate(pred);
		}
	}
	return execEnv.createInput(orcIF).name(explainSource());
}
@Override
protected void testProgram() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	TestNonRichOutputFormat output = new TestNonRichOutputFormat();
	env.createInput(new TestNonRichInputFormat()).output(output);
	try {
		env.execute();
	} catch (Exception e) {
		// We didn't break anything by making everything rich.
		e.printStackTrace();
		fail(e.getMessage());
	}
}
@Override
protected void testProgram() throws Exception {
	// Test verifying the number of records read and written vs the accumulator counts.
	readCalls = new ConcurrentLinkedQueue<Integer>();
	writeCalls = new ConcurrentLinkedQueue<Integer>();
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.createInput(new TestInputFormat(new Path(inputPath))).output(new TestOutputFormat());

	JobExecutionResult result = env.execute();

	Object a = result.getAllAccumulatorResults().get("DATA_SOURCE_ACCUMULATOR");
	Object b = result.getAllAccumulatorResults().get("DATA_SINK_ACCUMULATOR");
	long recordsRead = (Long) a;
	long recordsWritten = (Long) b;
	assertEquals(recordsRead, readCalls.size());
	assertEquals(recordsWritten, writeCalls.size());
}
private void executeTask(JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner, boolean slow, int parallelism) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
	DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(joiner)
		.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());
	env.setParallelism(parallelism);
	runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
private void executeTaskWithGenerator(
		JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner,
		int keys,
		int vals,
		int msecsTillCanceling,
		int maxTimeTillCanceled) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));
	DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));
	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(joiner)
		.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());
	env.setParallelism(PARALLELISM);
	runAndCancelJob(env.createProgramPlan(), msecsTillCanceling, maxTimeTillCanceled);
}
@Test
public void testCancelSortMatchWhileDoingHeavySorting() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	HeavyCompareGeneratorInputFormat input = new HeavyCompareGeneratorInputFormat(100);
	DataSet<Tuple2<HeavyCompare, Integer>> input1 = env.createInput(input);
	DataSet<Tuple2<HeavyCompare, Integer>> input2 = env.createInput(input);

	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
		.where(0)
		.equalTo(0)
		.with(new JoinFunction<Tuple2<HeavyCompare, Integer>, Tuple2<HeavyCompare, Integer>, Tuple2<HeavyCompare, Integer>>() {
			@Override
			public Tuple2<HeavyCompare, Integer> join(
					Tuple2<HeavyCompare, Integer> first,
					Tuple2<HeavyCompare, Integer> second) throws Exception {
				throw new Exception("Job should be canceled in sort-merge phase, never run here ...");
			}
		})
		.output(new DiscardingOutputFormat<Tuple2<HeavyCompare, Integer>>());

	runAndCancelJob(env.createProgramPlan(), 30 * 1000, 60 * 1000);
}
@Test
public void testWithKryoGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceKryo();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
@Test
public void testWithAvroGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceAvro();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
public void executeTask(MapFunction<Integer, Integer> mapper) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env
		.createInput(new InfiniteIntegerInputFormat(false))
		.map(mapper)
		.output(new DiscardingOutputFormat<Integer>());
	env.setParallelism(PARALLELISM);
	runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
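A hedged caller sketch for executeTask, using an anonymous identity MapFunction; the inline function is illustrative, any mapper with this signature would do:

	executeTask(new MapFunction<Integer, Integer>() {
		@Override
		public Integer map(Integer value) {
			// Identity mapping: the job is expected to be canceled, not to finish.
			return value;
		}
	});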
@Test
public void testKeySelection() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableObjectReuse();

	Path in = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy("name")
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Alyssa,1)\n(Charlie,1)\n";
}
@Test
public void testProgramWithAutoParallelism() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(ExecutionConfig.PARALLELISM_AUTO_MAX);
	env.getConfig().disableSysoutLogging();

	DataSet<Integer> result = env
		.createInput(new ParallelismDependentInputFormat())
		.rebalance()
		.mapPartition(new ParallelismDependentMapPartition());

	List<Integer> resultCollection = new ArrayList<>();
	result.output(new LocalCollectionOutputFormat<>(resultCollection));

	try {
		env.execute();
		assertEquals(PARALLELISM, resultCollection.size());
	} catch (Exception ex) {
		assertTrue(
			ExceptionUtils.findThrowableWithMessage(ex, ExecutionGraphBuilder.PARALLELISM_AUTO_MAX_ERROR_MESSAGE).isPresent());
	}
}
@Test
public void testTypeExtraction() {
	try {
		InputFormat<MyAvroType, ?> format = new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);

		TypeInformation<?> typeInfoDirect = TypeExtractor.getInputFormatTypes(format);

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<MyAvroType> input = env.createInput(format);
		TypeInformation<?> typeInfoDataSet = input.getType();

		Assert.assertTrue(typeInfoDirect instanceof PojoTypeInfo);
		Assert.assertTrue(typeInfoDataSet instanceof PojoTypeInfo);

		Assert.assertEquals(MyAvroType.class, typeInfoDirect.getTypeClass());
		Assert.assertEquals(MyAvroType.class, typeInfoDataSet.getTypeClass());
	} catch (Exception e) {
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
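Where extraction cannot succeed, the TypeInformation can be supplied explicitly instead. A sketch under the same MyAvroType assumptions as the test above; TypeExtractor.getForClass builds the type information directly from the class:

	AvroInputFormat<MyAvroType> format =
		new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);
	// Pass the TypeInformation explicitly rather than relying on extraction from the format.
	DataSet<MyAvroType> input = env.createInput(format, TypeExtractor.getForClass(MyAvroType.class));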
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Integer, String>> ds = env.createInput(new LargeJoinDataGeneratorInputFormat(1000000));
	ds.join(ds).where(0).equalTo(1).with(new Joiner()).writeAsText(resultPath);
	env.execute("Local Selfjoin Test Job");
}
@Override
protected void testProgram() throws Exception {
	/*
	 * Test passing a configuration object to an input format.
	 */
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	Configuration ifConf = new Configuration();
	ifConf.setString("prepend", "test");

	DataSet<String> ds = env.createInput(new TestInputFormat(new Path(inputPath))).withParameters(ifConf);
	List<String> result = ds.collect();

	String expectedResult = "ab\n" +
		"cd\n" +
		"ef\n";

	compareResultAsText(result, expectedResult);
}
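The counterpart of this test lives in the input format: parameters passed via withParameters(...) are handed to the format's configure(...) call. A hedged sketch of such a format; PrependingInputFormat and its prefix field are hypothetical stand-ins for the TestInputFormat used above:

	public class PrependingInputFormat extends TextInputFormat {
		private String prefix;

		public PrependingInputFormat(Path path) {
			super(path);
		}

		@Override
		public void configure(Configuration parameters) {
			super.configure(parameters);
			// The value set via ifConf.setString("prepend", "test") is visible here.
			this.prefix = parameters.getString("prepend", "");
		}
	}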
@Test
public void testReplicatedSourceToJoin() throws Exception {
	/*
	 * Test replicated source going into join.
	 */
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple1<Long>> source1 = env.createInput(new ReplicatingInputFormat<Long, GenericInputSplit>(
			new ParallelIteratorInputFormat<Long>(new NumberSequenceIterator(0L, 1000L))), BasicTypeInfo.LONG_TYPE_INFO)
		.map(new ToTuple());
	DataSet<Tuple1<Long>> source2 = env.generateSequence(0L, 1000L).map(new ToTuple());

	DataSet<Tuple> pairs = source1.join(source2).where(0).equalTo(0)
		.projectFirst(0)
		.sum(0);

	List<Tuple> result = pairs.collect();

	String expectedResult = "(500500)";

	compareResultAsText(result, expectedResult);
}
/**
 * Tests that the compiler fails for a join program with a replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {
	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
		new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
		.join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM + 2)
		.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}
/**
 * Tests that the compiler fails for a join program with a replicated data source behind a map and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindMapChangingparallelism() {
	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
		new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
		.map(new IdMap()).setParallelism(DEFAULT_PARALLELISM + 1)
		.join(source2).where("*").equalTo("*")
		.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}