public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // Read the requested columns of the Singers table.
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withTable("Singers")
          .withColumns("singerId", "firstName", "lastName"));

  // Estimate the size of every row, sum the estimates, and write the total to a file.
  records
      .apply(EstimateSize.create())
      .apply(Sum.longsGlobally())
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId);

  // Create a single read-only transaction so both reads below observe the
  // same consistent snapshot of the database.
  PCollectionView<Transaction> tx = p.apply(
      SpannerIO.createTransaction()
          .withSpannerConfig(spannerConfig)
          .withTimestampBound(TimestampBound.strong()));

  PCollection<Struct> singers = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT SingerId, FirstName, LastName FROM Singers")
      .withTransaction(tx));

  PCollection<Struct> albums = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT SingerId, AlbumId, AlbumTitle FROM Albums")
      .withTransaction(tx));

  singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
    @Override
    public String apply(Struct input) {
      // Format each row as a single text line (the delimiter is an assumption;
      // the original elided this body).
      return input.getLong(0) + "," + input.getString(1) + "," + input.getString(2);
    }
  })).apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

  albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
    @Override
    public String apply(Struct input) {
      return input.getLong(0) + "," + input.getLong(1) + "," + input.getString(2);
    }
  })).apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // Read the IDs of suspicious users, one per line.
  PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

  // Turn each user ID into a MutationGroup so related mutations are committed
  // in the same transaction. (Table and column names below are a sketch; the
  // original elided this body.)
  PCollection<MutationGroup> mutations = suspiciousUserIds.apply(
      MapElements.via(new SimpleFunction<String, MutationGroup>() {
        @Override
        public MutationGroup apply(String userId) {
          Mutation userMutation = Mutation.newUpdateBuilder("Users")
              .set("id").to(userId)
              .set("state").to("BLOCKED")
              .build();
          return MutationGroup.create(userMutation);
        }
      }));

  mutations.apply(SpannerIO.write()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId)
      .grouped());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  // List every table in the default schema, then issue one read per table.
  PCollection<Struct> allRecords = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
          + ".table_catalog = '' AND t.table_schema = ''")).apply(
      MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          })).apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create())
      .apply(Sum.longsGlobally());

  dbEstimatedSize.apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline.apply("Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
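For context, a minimal sketch of how these conversion helpers might be invoked from an entry point. The PipelineOptionsFactory wiring follows the standard Beam pattern and is an assumption, not part of the source snippet; runCsvToAvro is invoked the same way.

public static void main(String[] args) throws IOException {
  // Sketch: parse command-line flags into the SampleOptions interface,
  // then hand them to the conversion method above.
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runAvroToCsv(options);
}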
protected void afterUserCodeFinished() {
  if (!runAttempted && enableAutoRunIfMissing) {
    pipeline.run().waitUntilFinish();
  }
}
public static PipelineResult waitUntilDone(PipelineResult result) {
  PipelineResult.State state = result.waitUntilFinish();
  if (!state.equals(PipelineResult.State.DONE)) {
    throw new Pipeline.PipelineExecutionException(
        new Exception("Job finished with state " + state));
  }
  return result;
}
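A minimal usage sketch for waitUntilDone; the pipeline construction is assumed and elided.

Pipeline p = Pipeline.create(options);
// ... build the pipeline ...
// Block until the job completes and fail fast unless it reaches DONE.
waitUntilDone(p.run());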
/**
 * Runs this {@link Pipeline} according to the {@link PipelineOptions} used to create the {@link
 * Pipeline} via {@link #create(PipelineOptions)}.
 */
public PipelineResult run() {
  return run(defaultOptions);
}
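Since run() falls back to the options captured at construction time, the no-argument form is equivalent to passing those options explicitly. A minimal sketch:

PipelineOptions options = PipelineOptionsFactory.create();
Pipeline p = Pipeline.create(options);
// Uses the options the pipeline was created with...
p.run().waitUntilFinish();
// ...equivalent to passing them explicitly:
// p.run(options).waitUntilFinish();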
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
/**
 * Applies the given {@link PTransform} to this input {@code KeyedPCollectionTuple} and returns
 * its {@code OutputT}. This uses {@code name} to identify the specific application of the
 * transform. This name is used in various places, including the monitoring UI, logging, and to
 * stably identify this application node in the job graph.
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
  return Pipeline.applyTransform(name, this, transform);
}
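A minimal usage sketch with CoGroupByKey, the typical consumer of a KeyedPCollectionTuple. The input collections emails and phones (both PCollection<KV<String, String>>) are assumptions.

final TupleTag<String> emailsTag = new TupleTag<>();
final TupleTag<String> phonesTag = new TupleTag<>();
// Join the two keyed collections under a stable, named application.
PCollection<KV<String, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(emailsTag, emails)
        .and(phonesTag, phones)
        .apply("JoinContacts", CoGroupByKey.create());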
@Override
public void visitPrimitiveTransform(Node node) {
  checkForMatches(node);
}
public static void main(String[] args) { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); Pipeline p = Pipeline.create(options); p.apply("ReadSingers", TextIO.read().from(options.getSingersFilename())) .apply("ParseSingers", ParDo.of(new ParseSinger())) .apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("ParseAlbums", ParDo.of(new ParseAlbum())); .apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteAlbums", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); p.run().waitUntilFinish();
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize = records
      // Estimate the size of every row
      .apply(EstimateSize.create())
      // Sum all the row sizes to get the total estimated size of the table
      .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
/**
 * Applies the given {@link PTransform} to this {@link PBegin}, using {@code name} to identify
 * this specific application of the transform.
 *
 * <p>This name is used in various places, including the monitoring UI, logging, and to stably
 * identify this application node in the job graph.
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<? super PBegin, OutputT> t) {
  return Pipeline.applyTransform(name, this, t);
}
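A minimal sketch showing that applying a root transform to a Pipeline delegates to its PBegin, so the two forms below are interchangeable; the input path is an assumption.

PCollection<String> lines =
    PBegin.in(p).apply("ReadLines", TextIO.read().from("gs://my-bucket/input.txt"));
// Equivalent shorthand:
// PCollection<String> lines =
//     p.apply("ReadLines", TextIO.read().from("gs://my-bucket/input.txt"));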
/**
 * Like {@link #apply(String, PTransform)} but defaulting to the name provided by the {@link
 * PTransform}.
 */
public <OutputT extends POutput> OutputT apply(
    PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
  return Pipeline.applyTransform(this, transform);
}
/**
 * Like {@link #apply(String, PTransform)} but defaulting to the name of the {@code PTransform}.
 */
public <OutputT extends POutput> OutputT apply(PTransform<PCollectionList<T>, OutputT> t) {
  return Pipeline.applyTransform(this, t);
}
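A minimal usage sketch; Flatten is the typical transform applied to a PCollectionList, and the member collections logsA and logsB are assumptions.

// Merge two collections into one; the application takes Flatten's default name.
PCollection<String> merged =
    PCollectionList.of(logsA).and(logsB).apply(Flatten.pCollections());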
/**
 * Applies the given {@link PTransform} to this input {@link PCollectionTuple}, using {@code name}
 * to identify this specific application of the transform. This name is used in various places,
 * including the monitoring UI, logging, and to stably identify this application node in the job
 * graph.
 *
 * @return the output of the applied {@link PTransform}
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<? super PCollectionTuple, OutputT> t) {
  return Pipeline.applyTransform(name, this, t);
}
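A minimal usage sketch, assuming a multi-output ParDo that produces a PCollectionTuple and a hypothetical composite transform WriteBranches of type PTransform<PCollectionTuple, PDone> that consumes both branches; ValidateFn, validTag, and invalidTag are likewise assumptions.

PCollectionTuple branches = lines.apply(
    ParDo.of(new ValidateFn()) // hypothetical DoFn with a main and an error output
        .withOutputTags(validTag, TupleTagList.of(invalidTag)));
// The explicit name identifies this application in the monitoring UI and logs.
branches.apply("WriteBranches", new WriteBranches()); // hypothetical composite transform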