org.apache.flink.api.java.DataSet Java code examples

/**
 * Maps every element of {@code vertices} through a {@code VertexToTuple2Map}
 * and exposes the outcome as a DataSet of {@code Tuple2<K, VV>}.
 *
 * @return the vertex DataSet as Tuple2.
 */
public DataSet<Tuple2<K, VV>> getVerticesAsTuple2() {
  final DataSet<Tuple2<K, VV>> verticesAsTuples = vertices.map(new VertexToTuple2Map<>());
  return verticesAsTuples;
}

  /**
   * Tokenizes {@code WordCountData.TEXT}, sums field 1 per distinct value of field 0,
   * and writes the aggregated tuples into {@code resultsCollected} through a
   * {@code LocalCollectionOutputFormat}.
   *
   * @throws Exception if executing the job fails
   */
  @Override
  protected void testProgram() throws Exception {
    ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

    // Source and tokenization: one (word, count) tuple per token.
    DataSet<Tuple2<String, Integer>> tokens =
        environment.fromElements(WordCountData.TEXT).flatMap(new WordCount.Tokenizer());

    // Group on the word (field 0), sum the counts (field 1) and collect locally.
    tokens
        .groupBy(0)
        .aggregate(Aggregations.SUM, 1)
        .output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(resultsCollected));

    environment.execute("Word Count Collection");
  }
}

/**
 * Test on union with empty dataset: the collected union is compared against
 * {@code FULL_TUPLE_3_STRING}.
 */
@Test
public void testUnionWithEmptyDataSet() throws Exception {
  ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();

  // No direct way to build an empty data set is used here; the 3-tuple data set
  // is filtered instead (presumably RichFilter1 rejects every record - confirm).
  DataSet<Tuple3<Integer, Long, String>> filteredToNothing =
      CollectionDataSets.get3TupleDataSet(executionEnv).filter(new RichFilter1());

  List<Tuple3<Integer, Long, String>> collected =
      CollectionDataSets.get3TupleDataSet(executionEnv)
          .union(filteredToNothing)
          .collect();

  compareResultAsTuples(collected, FULL_TUPLE_3_STRING);
}

/**
 * Convenience method to get the count (number of elements) of a DataSet.
 *
 * <p>Attaches a {@code Utils.CountHelper} sink named {@code "count()"} under a freshly
 * generated id, executes the execution environment, and reads the accumulator
 * registered under that id from the job result.
 *
 * @return A long integer that represents the number of elements in the data set.
 * @throws Exception if executing the program fails
 */
public long count() throws Exception {
  // Fresh id so the accumulator of this call can be looked up in the job result.
  final String accumulatorId = new AbstractID().toString();

  output(new Utils.CountHelper<T>(accumulatorId)).name("count()");

  final JobExecutionResult jobResult = getExecutionEnvironment().execute();
  final long elementCount = jobResult.<Long> getAccumulatorResult(accumulatorId);
  return elementCount;
}

/**
 * Checks that the edge set input contains valid vertex Ids, i.e. that they
 * also exist in the vertex input set.
 *
 * <p>The distinct ids produced from the edges are co-grouped with the vertices on
 * field 0; the graph is reported valid when no invalid id comes out of that co-group.
 *
 * @param graph the graph whose edges and vertices are checked
 * @return a boolean stating whether a graph is valid
 *         with respect to its vertex ids.
 * @throws Exception if counting the invalid ids fails
 */
@Override
public boolean validate(Graph<K, VV, EV> graph) throws Exception {
  DataSet<Tuple1<K>> idsReferencedByEdges = graph.getEdges()
      .flatMap(new MapEdgeIds<>())
      .distinct();

  // Only the first hit is kept: the check below just asks whether any exists.
  DataSet<K> unknownIds = graph.getVertices()
      .coGroup(idsReferencedByEdges)
      .where(0)
      .equalTo(0)
      .with(new GroupInvalidIds<>())
      .first(1);

  long unknownIdCount = unknownIds.map(new KToTupleMap<>()).count();
  return unknownIdCount == 0;
}

/**
 * Runs a program with two bulk iterations of 100 steps each, where the closed
 * result of the first iteration is joined inside the step of the second one.
 * The final result is discarded.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Long> firstInput = environment.generateSequence(1, 100);
  DataSet<Long> secondInput = environment.generateSequence(1, 100);

  // First iteration: the step function is a single IdMapper.
  IterativeDataSet<Long> firstLoop = firstInput.iterate(100);
  DataSet<Long> firstLoopStep = firstLoop.map(new IdMapper());
  DataSet<Long> firstLoopResult = firstLoop.closeWith(firstLoopStep);

  // Second iteration: its step joins the partial solution with the first result.
  IterativeDataSet<Long> secondLoop = secondInput.map(new IdMapper()).iterate(100);
  DataSet<Long> secondLoopStep = secondLoop.join(firstLoopResult)
      .where(new IdKeyExtractor())
      .equalTo(new IdKeyExtractor())
      .with(new Joiner());

  secondLoop.closeWith(secondLoopStep).output(new DiscardingOutputFormat<Long>());
  environment.execute();
}

/**
 * Grouping a {@code Tuple5} data set on field position 5 is expected to fail with
 * an {@link IndexOutOfBoundsException}.
 */
@Test(expected = IndexOutOfBoundsException.class)
public void testGroupByKeyFields4() {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
  DataSet<Tuple5<Integer, Long, String, Long, Integer>> fiveFieldTuples =
      environment.fromCollection(emptyTupleData, tupleTypeInfo);

  // should not work, key out of tuple bounds
  final int keyBeyondLastField = 5;
  fiveFieldTuples.groupBy(keyBeyondLastField);
}

/**
 * Union of 2 Same Data Sets: the collected union is compared against
 * {@code FULL_TUPLE_3_STRING} concatenated with itself.
 */
@Test
public void testUnion2IdenticalDataSets() throws Exception {
  ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Tuple3<Integer, Long, String>> first = CollectionDataSets.get3TupleDataSet(executionEnv);
  DataSet<Tuple3<Integer, Long, String>> second = CollectionDataSets.get3TupleDataSet(executionEnv);

  List<Tuple3<Integer, Long, String>> collected = first.union(second).collect();

  compareResultAsTuples(collected, FULL_TUPLE_3_STRING + FULL_TUPLE_3_STRING);
}

/**
 * Runs a 3-step bulk iteration whose step is the union of a static input with
 * itself and of the partial solution with itself. The closed result is written
 * into the {@code result} field through a {@code LocalCollectionOutputFormat}.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Long> staticInput = environment.generateSequence(1, 4);
  DataSet<Long> loopInput = environment.generateSequence(1, 4);
  IterativeDataSet<Long> loop = loopInput.iterate(3);

  // Each side is unioned with itself before the two sides are unioned together.
  DataSet<Long> staticTwice = staticInput.union(staticInput);
  DataSet<Long> loopTwice = loop.union(loop);
  DataSet<Long> closedLoop = loop.closeWith(staticTwice.union(loopTwice));

  closedLoop.output(new LocalCollectionOutputFormat<Long>(this.result));
  environment.execute();
}

/**
 * Wraps the sequence 0..100000 into {@code Tuple1} records, then joins a projection
 * of them twice on field 0, projecting the first input's field 0 after each join,
 * and writes the outcome as text to {@code resultPath}.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Long> sequence = environment.generateSequence(0, 100000);
  DataSet<Tuple1<Long>> wrapped = sequence.map(new TupleWrapper());
  DataSet<Tuple1<Long>> projected = wrapped.project(0);
  DataSet<Tuple1<Long>> wrappedAgain = sequence.map(new TupleWrapper());

  // The join chain stays a single expression so the projection result types
  // are inferred exactly as before.
  projected
      .join(wrappedAgain).where(0).equalTo(0).projectFirst(0)
      .join(wrapped).where(0).equalTo(0).projectFirst(0)
      .writeAsText(resultPath);

  environment.execute();
}

/**
 * Runs a 10-step bulk iteration whose step reduces the original input as one group
 * while receiving the partial solution as broadcast set {@code "bc"}. The first
 * collected element is asserted to be 8.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  final List<Integer> collected = new ArrayList<>();

  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
  DataSet<Integer> numbers = environment.fromElements(1, 2, 3, 4, 5, 6, 7, 8);
  IterativeDataSet<Integer> loop = numbers.iterate(10);

  // The step reads the static input; the loop state only arrives via broadcast.
  DataSet<Integer> step = numbers
      .reduceGroup(new PickOneAllReduce())
      .withBroadcastSet(loop, "bc");

  DataSet<Integer> closedLoop = loop.closeWith(step);
  closedLoop.output(new LocalCollectionOutputFormat<Integer>(collected));
  environment.execute();

  int firstCollected = collected.get(0).intValue();
  Assert.assertEquals(8, firstCollected);
}

/**
 * Test support for Date and enum serialization: generated and predefined
 * {@code PojoWithDateAndEnum} records are unioned, grouped on the {@code "group"}
 * field and group-reduced; the collected text is compared against two "ok" lines.
 */
@Test
public void testSupportForDataAndEnumSerialization() throws Exception {
  ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<PojoWithDateAndEnum> generated = executionEnv.generateSequence(0, 2).map(new Mapper1());
  DataSet<PojoWithDateAndEnum> combined =
      generated.union(CollectionDataSets.getPojoWithDateAndEnum(executionEnv));

  DataSet<String> groupOutcome = combined.groupBy("group").reduceGroup(new GroupReducer1());

  compareResultAsText(groupOutcome.collect(), "ok\nok");
}

/**
 * Builds the plan of a 10-step bulk iteration over the "Big" input whose step joins
 * the "Small" input (left side of the join) with the partial solution (right side).
 *
 * @param strategy value stored under {@code Optimizer.HINT_LOCAL_STRATEGY} in the
 *                 configuration passed to the join
 * @return the program plan created by the execution environment
 */
private Plan getTestPlanLeftStatic(String strategy) {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
  environment.setParallelism(DEFAULT_PARALLELISM);

  @SuppressWarnings("unchecked")
  DataSet<Tuple3<Long, Long, Long>> big = environment
      .fromElements(
          new Tuple3<Long, Long, Long>(1L, 2L, 3L),
          new Tuple3<Long, Long, Long>(1L, 2L, 3L),
          new Tuple3<Long, Long, Long>(1L, 2L, 3L))
      .name("Big");

  @SuppressWarnings("unchecked")
  DataSet<Tuple3<Long, Long, Long>> small = environment
      .fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L))
      .name("Small");

  IterativeDataSet<Tuple3<Long, Long, Long>> loop = big.iterate(10);

  // The requested local strategy is handed to the join as an operator parameter.
  Configuration localStrategyHint = new Configuration();
  localStrategyHint.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);

  DataSet<Tuple3<Long, Long, Long>> joinedWithLoop = small.join(loop)
      .where(0)
      .equalTo(0)
      .with(new DummyJoiner())
      .name("DummyJoiner")
      .withParameters(localStrategyHint);

  loop.closeWith(joinedWithLoop)
      .output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

  return environment.createProgramPlan();
}

/**
 * Test all-rejecting filter: the 3-tuple data set is passed through {@code Filter1}
 * and the collected result is compared against the expected string {@code "\n"}.
 */
@Test
public void testAllRejectingFilter() throws Exception {
  ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();

  List<Tuple3<Integer, Long, String>> collected =
      CollectionDataSets.get3TupleDataSet(executionEnv)
          .filter(new Filter1())
          .collect();

  compareResultAsTuples(collected, "\n");
}

/**
 * Joining a {@code DataSet<Integer>} with a {@code DataSet<ArrayList<Integer>>} on the
 * {@code "*"} key expression of both sides is expected to fail with an
 * {@code InvalidProgramException}.
 */
@Test(expected = InvalidProgramException.class)
public void testJoinKeyInvalidAtomic6() {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Integer> integers = environment.fromElements(0, 0, 0);
  DataSet<ArrayList<Integer>> lists = environment.fromElements(new ArrayList<Integer>());

  // NOTE(review): presumably rejected because ArrayList is not usable as a key - confirm.
  integers.join(lists)
      .where("*")
      .equalTo("*");
}

/**
 * Runs an iteration of at most 5 steps at parallelism 4 that is closed with both a
 * step result and a second data set derived from it by {@code TerminationFilter},
 * then checks the collected result against {@code EXPECTED}.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
  environment.setParallelism(4);

  DataSet<String> source = environment.fromElements("1", "2", "3", "4", "5").name("input");
  IterativeDataSet<String> loop = source.iterate(5).name("Loop");

  // NOTE(review): this operator name lacks its closing parenthesis; kept verbatim.
  DataSet<String> sums = loop
      .reduceGroup(new SumReducer())
      .name("Compute sum (GroupReduce");
  DataSet<String> terminationCriterion = sums
      .filter(new TerminationFilter())
      .name("Compute termination criterion (Map)");

  DataSet<String> closedLoop = loop.closeWith(sums, terminationCriterion);
  List<String> collected = closedLoop.collect();

  containsResultAsText(collected, EXPECTED);
}

/**
 * Test non-passing flatmap: the string data set is passed through {@code FlatMapper1}
 * and the collected result is compared against the expected string {@code "\n"}.
 */
@Test
public void testNonPassingFlatMap() throws Exception {
  ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<String> strings = CollectionDataSets.getStringDataSet(executionEnv);
  DataSet<String> flatMapped = strings.flatMap(new FlatMapper1());

  compareResultAsText(flatMapped.collect(), "\n");
}

/**
 * Attaches a {@code Utils.CollectHelper} sink, built from {@code accumulatorName} and
 * {@code serializer}, to the given data set and names it
 * "SQL Client Batch Collect Sink".
 *
 * @param dataSet the rows to emit
 */
@Override
public void emitDataSet(DataSet<Row> dataSet) {
  final Utils.CollectHelper<Row> collectFormat =
      new Utils.CollectHelper<>(accumulatorName, serializer);
  dataSet.output(collectFormat).name("SQL Client Batch Collect Sink");
}

/**
 * Builds a flat-map over the 3-tuple collection data set that emits the third
 * field ({@code f2}) of every tuple.
 *
 * @param env the environment the source data set is created in
 * @return the flat-map operator producing the {@code f2} strings
 */
private FlatMapOperator<Tuple3<Integer, Long, String>, String> getSourceDataSet(ExecutionEnvironment env) {
  // Kept as an anonymous class (not a lambda) so the function class is unchanged.
  final FlatMapFunction<Tuple3<Integer, Long, String>, String> thirdFieldEmitter =
    new FlatMapFunction<Tuple3<Integer, Long, String>, String>() {
      @Override
      public void flatMap(Tuple3<Integer, Long, String> tuple, Collector<String> collector) throws Exception {
        collector.collect(tuple.f2);
      }
    };

  return CollectionDataSets.get3TupleDataSet(env).flatMap(thirdFieldEmitter);
}

/**
 * Maps the sequence 1..10 at parallelism 4 with the union of the sequences 1..5 and
 * 6..10 attached as broadcast set {@code BC_NAME}, reduces the mapped values, and
 * asserts that the first collected element equals 3025.
 *
 * @throws Exception if executing the job fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
  environment.setParallelism(4);

  DataSet<Long> numbers = environment.generateSequence(1, 10);
  DataSet<Long> lowerHalf = environment.generateSequence(1, 5);
  DataSet<Long> upperHalf = environment.generateSequence(6, 10);

  // The broadcast variable is itself a union of two data sets.
  DataSet<Long> broadcastValues = lowerHalf.union(upperHalf);

  List<Long> collected = numbers
      .map(new Mapper())
      .withBroadcastSet(broadcastValues, BC_NAME)
      .reduce(new Reducer())
      .collect();

  Long expectedTotal = Long.valueOf(3025);
  Assert.assertEquals(expectedTotal, collected.get(0));
}

Javadoc

A DataSet represents a collection of elements of the same type.

A DataSet can be transformed into another DataSet by applying a transformation, for example

DataSet#map(org.apache.flink.api.common.functions.MapFunction),
DataSet#reduce(org.apache.flink.api.common.functions.ReduceFunction),
DataSet#join(DataSet), or
DataSet#coGroup(DataSet).

Most used methods

map
Applies a Map transformation on this DataSet. The transformation calls a org.apache.flink.api.common.
flatMap
Applies a FlatMap transformation on a DataSet. The transformation calls a org.apache.flink.api.common
output
Emits a DataSet using an OutputFormat. This method adds a data sink to the program. Programs may hav
groupBy
Groups a DataSet using field expressions. A field expression is either the name of a public field or
filter
Applies a Filter transformation on a DataSet. The transformation calls a org.apache.flink.api.common.
join
Initiates a Join transformation. A Join transformation joins the elements of two DataSet on key equal
collect
Convenience method to get the elements of a DataSet as a List. As DataSet can contain a lot of data,
getType
Returns the TypeInformation for the type of this DataSet.
union
Creates a union of this DataSet with another DataSet. The other DataSet must be of the same data ty
iterate
Initiates an iterative part of the program that executes multiple times and feeds back data sets. Th
writeAsCsv
Writes a Tuple DataSet as CSV file(s) to the specified location. Note: Only a Tuple DataSet can be writt
writeAsText
Writes a DataSet as text file(s) to the specified location. For each element of the DataSet the resul

Popular in Java

Reactive rest calls using spring rest template
requestLocationUpdates (LocationManager)
addToBackStack (FragmentTransaction)
scheduleAtFixedRate (Timer)
BufferedInputStream (java.io)
A BufferedInputStream adds functionality to another input stream - namely, the ability to buffer the i
Socket (java.net)
Provides a client-side TCP socket.
Selector (java.nio.channels)
A controller for the selection of SelectableChannel objects. Selectable channels can be registered w
TimerTask (java.util)
The TimerTask class represents a task to run at a specified time. The task may be run once or repeat
Vector (java.util)
Vector is an implementation of List, backed by an array and synchronized. All optional operations in
GridBagLayout (java.awt)
The GridBagLayout class is a flexible layout manager that aligns components vertically and horizonta
Top Vim plugins

How to use DataSet in org.apache.flink.api.java

Best Java code snippets using org.apache.flink.api.java.DataSet (Showing top 20 results out of 468)

How to use
DataSet
in
org.apache.flink.api.java