private void createTextSource(ExecutionEnvironment env, PythonOperationInfo info) {
	// Read the text file, then serialize each line for hand-off to the Python side.
	// Both stages run at the parallelism requested by the Python plan.
	sets.add(info.setID,
		env.readTextFile(info.path)
			.setParallelism(info.parallelism)
			.name("TextSource")
			.map(new SerializerMap<String>())
			.setParallelism(info.parallelism)
			.name("TextSourcePostStep"));
}
/**
 * Generic method to create an input DataSet with the given {@link InputFormat}. The {@link DataSet} will not be
 * immediately created - instead, this method returns a {@link DataSet} that will be lazily created from
 * the input format once the program is executed.
 *
 * <p>The {@link DataSet} is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link ResultTypeQueryable} interface.
 *
 * @param inputFormat The input format used to create the data set. Must not be null.
 * @param producedType The type information characterizing the type produced by the input format. Must not be null.
 * @return A {@link DataSet} that represents the data created by the input format.
 *
 * @see #createInput(InputFormat)
 */
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (producedType == null) {
		throw new IllegalArgumentException("Produced type information must not be null.");
	}
	return new DataSource<>(this, inputFormat, producedType, Utils.getCallLocationName());
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
	// Vertices: the id sequence [0, vertexCount).
	DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount);

	// Edges: one iterator element per source vertex id.
	LongValueSequenceIterator iterator = new LongValueSequenceIterator(0, this.vertexCount - 1);

	// Validate ranges: sort, then verify that adjacent ranges do not overlap.
	// NOTE(review): Collections.sort mutates the offsetRanges field in place, and
	// iter.next() below assumes at least one range was configured -- an empty list
	// would throw NoSuchElementException here; confirm callers guarantee non-empty.
	Collections.sort(offsetRanges);
	Iterator<OffsetRange> iter = offsetRanges.iterator();
	OffsetRange lastRange = iter.next();
	while (iter.hasNext()) {
		OffsetRange nextRange = iter.next();
		if (lastRange.overlaps(nextRange)) {
			throw new IllegalArgumentException("Overlapping ranges " + lastRange + " and " + nextRange);
		}
		lastRange = nextRange;
	}

	// For each source vertex, LinkVertexToOffsets emits the edges implied by the
	// configured offset ranges.
	DataSet<Edge<LongValue, NullValue>> edges = env
		.fromParallelCollection(iterator, LongValue.class)
			.setParallelism(parallelism)
			.name("Edge iterators")
		.flatMap(new LinkVertexToOffsets(vertexCount, offsetRanges))
			.setParallelism(parallelism)
			.name("Circulant graph edges");

	// Graph
	return Graph.fromDataSet(vertices, edges, env);
}
/**
 * Creates a Graph from CSV input without vertex values or edge values.
 *
 * @param vertexKey the type of the vertex IDs
 * @param <K> the vertex ID type
 * @return a Graph where the vertex IDs are read from the edges input file.
 * @throws IllegalStateException if no edge input file was configured
 */
public <K> Graph<K, NullValue, NullValue> keyType(Class<K> vertexKey) {
	if (edgeReader == null) {
		// IllegalStateException is more precise than a raw RuntimeException for a
		// missing-configuration failure; it is a RuntimeException subclass, so
		// existing callers catching RuntimeException are unaffected.
		throw new IllegalStateException("The edge input file cannot be null!");
	}

	// Read (source, target) pairs and convert each Tuple2 into an Edge with no value.
	DataSet<Edge<K, NullValue>> edges = edgeReader
		.types(vertexKey, vertexKey)
			.name(GraphCsvReader.class.getName())
		.map(new Tuple2ToEdgeMap<>())
			.name("Type conversion");

	return Graph.fromDataSet(edges, executionContext);
}
@Override public Graph<LongValue, NullValue, NullValue> generate() { Preconditions.checkState(vertexCount >= 0); // Vertices DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount); // Edges DataSource<Edge<LongValue, NullValue>> edges = env .fromCollection(Collections.<Edge<LongValue, NullValue>>emptyList(), TypeInformation.of(new TypeHint<Edge<LongValue, NullValue>>(){})) .setParallelism(parallelism) .name("Empty edge set"); // Graph return Graph.fromDataSet(vertices, edges, env); } }
public static void main(String[] args) throws Exception { final long numSamples = args.length > 0 ? Long.parseLong(args[0]) : 1000000; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // count how many of the samples would randomly fall into // the unit circle DataSet<Long> count = env.generateSequence(1, numSamples) .map(new Sampler()) .reduce(new SumReducer()); long theCount = count.collect().get(0); System.out.println("We estimate Pi to be: " + (theCount * 4.0 / numSamples)); }
/**
 * Creates a Graph from CSV input with edge values, but without vertex values.
 *
 * @param vertexKey the type of the vertex IDs
 * @param edgeValue the type of the edge values
 * @param <K> the vertex ID type
 * @param <EV> the edge value type
 * @return a Graph where the edges are read from an edges CSV file (with values).
 * @throws IllegalStateException if no edge input file was configured
 */
public <K, EV> Graph<K, NullValue, EV> edgeTypes(Class<K> vertexKey, Class<EV> edgeValue) {
	if (edgeReader == null) {
		// IllegalStateException is more precise than a raw RuntimeException for a
		// missing-configuration failure; it is a RuntimeException subclass, so
		// existing callers catching RuntimeException are unaffected.
		throw new IllegalStateException("The edge input file cannot be null!");
	}

	// Read (source, target, value) triples directly; Graph.fromTupleDataSet
	// performs the conversion to edges.
	DataSet<Tuple3<K, K, EV>> edges = edgeReader
		.types(vertexKey, vertexKey, edgeValue)
			.name(GraphCsvReader.class.getName());

	return Graph.fromTupleDataSet(edges, executionContext);
}
.generateSequence(0, 1).setParallelism(p * 2) .map(new IdentityMapper<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Map1") .groupBy("*").reduceGroup(new IdentityGroupReducer<Long>())
@Override protected void testProgram() throws Exception { // set up execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // read vertex and edge data DataSet<Long> vertices = env.fromElements(ConnectedComponentsData.getEnumeratingVertices(NUM_VERTICES).split("\n")) .map(new VertexParser()); DataSet<Tuple2<Long, Long>> edges = env.fromElements(ConnectedComponentsData.getRandomOddEvenEdges(NUM_EDGES, NUM_VERTICES, SEED).split("\n")) .flatMap(new EdgeParser()); // assign the initial components (equal to the vertex id) DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>()); // open a delta iteration DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithInitialId.iterateDelta(verticesWithInitialId, 100, 0); // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller DataSet<Tuple2<Long, Long>> changes = iteration .getWorkset().join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin()) .coGroup(iteration.getSolutionSet()).where(0).equalTo(0) .with(new MinIdAndUpdate()); // close the delta iteration (delta and new workset are identical) DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes); // emit result List<Tuple2<Long, Long>> resutTuples = new ArrayList<>(); result.output(new LocalCollectionOutputFormat<>(resutTuples)); env.execute(); }
@SuppressWarnings("unchecked")
private static DataSet<Tuple3<Double, StringValue, LongValue>> getSourceDataSet(ExecutionEnvironment env) {
	// Single fixed record, produced by a single source task.
	Tuple3<Double, StringValue, LongValue> record =
		new Tuple3<Double, StringValue, LongValue>(3.141592, new StringValue("foobar"), new LongValue(77));
	return env.fromElements(record).setParallelism(1);
}
private Plan getWordCountPlan(File inFile, File outFile, int parallelism) {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);

	// Classic word count: tokenize lines, group by the word (field 0),
	// sum the counts (field 1), and write the result as CSV.
	env.readTextFile(inFile.getAbsolutePath())
		.flatMap(new Tokenizer())
		.groupBy(0)
		.sum(1)
		.writeAsCsv(outFile.getAbsolutePath());

	return env.createProgramPlan();
}
}
@Test
public void testCustomPartitioningTupleInvalidType() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Pojo2> data = env.fromElements(new Pojo2())
			.rebalance().setParallelism(4);

		try {
			// TestPartitionerLong partitions on Long; field "a" presumably has an
			// incompatible key type -- TODO confirm against Pojo2's declaration.
			data.groupBy("a").withPartitioner(new TestPartitionerLong());
			fail("Should throw an exception");
		}
		catch (InvalidProgramException e) {} // expected: incompatible partitioner is rejected at graph-build time
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
@Override
public Graph<LongValue, NullValue, NullValue> generate() {
	// Number of bits needed to address the vertex id space, i.e.
	// ceil(log2(vertexCount)) for vertexCount > 1.
	int scale = Long.SIZE - Long.numberOfLeadingZeros(vertexCount - 1);

	// Edges
	// Budget of random cycles consumed per generated edge; with noise enabled each
	// edge requires additional random draws, hence the 5x factor -- NOTE(review):
	// confirm the factor matches GenerateEdges' per-edge consumption.
	int cyclesPerEdge = noiseEnabled ? 5 * scale : scale;

	// Pre-partitioned random generator blocks, one stream of randomness per block.
	List<BlockInfo<T>> generatorBlocks = randomGenerableFactory
		.getRandomGenerables(edgeCount, cyclesPerEdge);

	// Distribute the generator blocks across the cluster, then expand each block
	// into its share of the RMat edges.
	DataSet<Edge<LongValue, NullValue>> edges = env
		.fromCollection(generatorBlocks)
			.name("Random generators")
		.rebalance()
			.setParallelism(parallelism)
			.name("Rebalance")
		.flatMap(new GenerateEdges<>(vertexCount, scale, a, b, c, noiseEnabled, noise))
			.setParallelism(parallelism)
			.name("RMat graph edges");

	// Vertices are derived from the generated edge set via vertexSet -- presumably
	// only ids that appear in some edge are emitted; verify against
	// GraphGeneratorUtils.vertexSet if isolated vertices matter.
	DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSet(edges, parallelism);

	// Graph
	return Graph.fromDataSet(vertices, edges, env);
}
/**
 * Reads the vertex data set either from the CSV file given by {@code --vertices}
 * or, when absent, from the bundled default data.
 *
 * @param env the execution environment used to create the source
 * @param params program parameters, optionally carrying the "vertices" path
 * @return a data set of vertex ids
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env, ParameterTool params) {
	if (params.has("vertices")) {
		// Each CSV record is a single Long; unwrap the Tuple1 to a bare id.
		return env.readCsvFile(params.get("vertices")).types(Long.class).map(
			new MapFunction<Tuple1<Long>, Long>() {
				@Override // was missing: always annotate interface implementations
				public Long map(Tuple1<Long> value) {
					return value.f0;
				}
			});
	} else {
		System.out.println("Executing Connected Components example with default vertices data set.");
		System.out.println("Use --vertices to specify file input.");
		return ConnectedComponentsData.getDefaultVertexDataSet(env);
	}
}
// Single-task source: read the points with parallelism 1 -- presumably to get a
// deterministic, non-split read of the input; confirm against the surrounding test.
env.readFile(new PointInFormat(), dataPath).setParallelism(1).name("Input");
@Override
public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
	// Build the HBase scan source for the configured table and schema, typed to
	// this table source's return type and labeled with its description.
	HBaseRowInputFormat inputFormat = new HBaseRowInputFormat(conf, tableName, hBaseSchema);
	return execEnv.createInput(inputFormat, getReturnType()).name(explainSource());
}
@SuppressWarnings("unchecked")
private static DataSet<Tuple3<Double, StringValue, LongValue>> getSourceDataSet(ExecutionEnvironment env, int parallelism) {
	// One fixed record suffices; the interesting knob here is the parallelism.
	Tuple3<Double, StringValue, LongValue> element = new Tuple3<>(0.0, new StringValue(""), new LongValue(1L));
	return env.fromElements(element).setParallelism(parallelism);
}
}
@Test
public void testBatchDistributedCache() throws Exception {
	// Stage a temp file and register it under the name the UDF will look up.
	String textPath = createTempFile("count.txt", DATA);

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.registerCachedFile(textPath, "cache_test");
	// count() triggers eager execution; WordChecker presumably validates the cached
	// file contents against the streamed input -- confirm in its implementation.
	env.readTextFile(textPath).flatMap(new WordChecker()).count();
}
@Test
public void testCustomPartitioningTupleInvalidType() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Tuple2<Integer, Integer>> data = env.fromElements(new Tuple2<Integer, Integer>(0, 0))
			.rebalance().setParallelism(4);

		try {
			// The key at position 0 is an Integer, but TestPartitionerLong partitions
			// on Long -- the type mismatch must be rejected.
			data.groupBy(0).withPartitioner(new TestPartitionerLong());
			fail("Should throw an exception");
		}
		catch (InvalidProgramException e) {} // expected: incompatible partitioner is rejected at graph-build time
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
private void createSequenceSource(ExecutionEnvironment env, PythonOperationInfo info) {
	// Generate the sequence [frm, to], then serialize each value for hand-off to
	// the Python side. Both stages run at the requested parallelism.
	sets.add(info.setID,
		env.generateSequence(info.frm, info.to)
			.setParallelism(info.parallelism)
			.name("SequenceSource")
			.map(new SerializerMap<Long>())
			.setParallelism(info.parallelism)
			.name("SequenceSourcePostStep"));
}