public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // Read the requested columns of the Singers table.
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withTable("Singers")
          .withColumns("singerId", "firstName", "lastName"));

  // Estimate the size of every row, sum the estimates, and write the total to a file.
  records
      .apply(EstimateSize.create())
      .apply(Sum.longsGlobally())
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId);

  // Create a single read-only transaction so both reads below observe the
  // same consistent snapshot of the database.
  PCollectionView<Transaction> tx = p.apply(
      SpannerIO.createTransaction()
          .withSpannerConfig(spannerConfig)
          .withTimestampBound(TimestampBound.strong()));

  PCollection<Struct> singers = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT SingerId, FirstName, LastName FROM Singers")
      .withTransaction(tx));

  PCollection<Struct> albums = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT SingerId, AlbumId, AlbumTitle FROM Albums")
      .withTransaction(tx));

  singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
    @Override
    public String apply(Struct input) {
      // Format each row as a single text line (the delimiter is an assumption;
      // the original elided this body).
      return input.getLong(0) + "," + input.getString(1) + "," + input.getString(2);
    }
  })).apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

  albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
    @Override
    public String apply(Struct input) {
      return input.getLong(0) + "," + input.getLong(1) + "," + input.getString(2);
    }
  })).apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // Read the IDs of suspicious users, one per line.
  PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

  // Turn each user ID into a MutationGroup so related mutations are committed
  // in the same transaction. (Table and column names below are a sketch; the
  // original elided this body.)
  PCollection<MutationGroup> mutations = suspiciousUserIds.apply(
      MapElements.via(new SimpleFunction<String, MutationGroup>() {
        @Override
        public MutationGroup apply(String userId) {
          Mutation userMutation = Mutation.newUpdateBuilder("Users")
              .set("id").to(userId)
              .set("state").to("BLOCKED")
              .build();
          return MutationGroup.create(userMutation);
        }
      }));

  mutations.apply(SpannerIO.write()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId)
      .grouped());

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  // List every table in the default schema, then issue one read per table.
  PCollection<Struct> allRecords = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
          + ".table_catalog = '' AND t.table_schema = ''")).apply(
      MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          })).apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create())
      .apply(Sum.longsGlobally());

  dbEstimatedSize.apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline.apply("Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
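For context, a minimal sketch of how these conversion helpers might be invoked from an entry point. The PipelineOptionsFactory wiring follows the standard Beam pattern and is an assumption, not part of the source snippet; runCsvToAvro is invoked the same way.

public static void main(String[] args) throws IOException {
  // Sketch: parse command-line flags into the SampleOptions interface,
  // then hand them to the conversion method above.
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runAvroToCsv(options);
}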
protected void afterUserCodeFinished() {
  if (!runAttempted && enableAutoRunIfMissing) {
    pipeline.run().waitUntilFinish();
  }
}
public static PipelineResult waitUntilDone(PipelineResult result) {
  PipelineResult.State state = result.waitUntilFinish();
  if (!state.equals(PipelineResult.State.DONE)) {
    throw new Pipeline.PipelineExecutionException(
        new Exception("Job finished with state " + state));
  }
  return result;
}
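A minimal usage sketch for waitUntilDone; the pipeline construction is assumed and elided.

Pipeline p = Pipeline.create(options);
// ... build the pipeline ...
// Block until the job completes and fail fast unless it reaches DONE.
waitUntilDone(p.run());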
/**
 * Runs this {@link Pipeline} according to the {@link PipelineOptions} used to create the {@link
 * Pipeline} via {@link #create(PipelineOptions)}.
 */
public PipelineResult run() {
  return run(defaultOptions);
}
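Since run() falls back to the options captured at construction time, the no-argument form is equivalent to passing those options explicitly. A minimal sketch:

PipelineOptions options = PipelineOptionsFactory.create();
Pipeline p = Pipeline.create(options);
// Uses the options the pipeline was created with...
p.run().waitUntilFinish();
// ...equivalent to passing them explicitly:
// p.run(options).waitUntilFinish();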
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
/**
 * Applies the given {@link PTransform} to this input {@code KeyedPCollectionTuple} and returns
 * its {@code OutputT}. This uses {@code name} to identify the specific application of the
 * transform. This name is used in various places, including the monitoring UI, logging, and to
 * stably identify this application node in the job graph.
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
  return Pipeline.applyTransform(name, this, transform);
}
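A minimal usage sketch with CoGroupByKey, the typical consumer of a KeyedPCollectionTuple. The input collections emails and phones (both PCollection<KV<String, String>>) are assumptions.

final TupleTag<String> emailsTag = new TupleTag<>();
final TupleTag<String> phonesTag = new TupleTag<>();
// Join the two keyed collections under a stable, named application.
PCollection<KV<String, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(emailsTag, emails)
        .and(phonesTag, phones)
        .apply("JoinContacts", CoGroupByKey.create());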
@Override
public void visitPrimitiveTransform(Node node) {
  checkForMatches(node);
}
public static void main(String[] args) { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); Pipeline p = Pipeline.create(options); p.apply("ReadSingers", TextIO.read().from(options.getSingersFilename())) .apply("ParseSingers", ParDo.of(new ParseSinger())) .apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("ParseAlbums", ParDo.of(new ParseAlbum())); .apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteAlbums", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); p.run().waitUntilFinish();
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize = records
      // Estimate the size of every row
      .apply(EstimateSize.create())
      // Sum all the row sizes to get the total estimated size of the table
      .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
/**
 * Applies the given {@link PTransform} to this {@link PBegin}, using {@code name} to identify
 * this specific application of the transform.
 *
 * <p>This name is used in various places, including the monitoring UI, logging, and to stably
 * identify this application node in the job graph.
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<? super PBegin, OutputT> t) {
  return Pipeline.applyTransform(name, this, t);
}
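A minimal sketch showing that applying a root transform to a Pipeline delegates to its PBegin, so the two forms below are interchangeable; the input path is an assumption.

PCollection<String> lines =
    PBegin.in(p).apply("ReadLines", TextIO.read().from("gs://my-bucket/input.txt"));
// Equivalent shorthand:
// PCollection<String> lines =
//     p.apply("ReadLines", TextIO.read().from("gs://my-bucket/input.txt"));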
/**
 * Like {@link #apply(String, PTransform)} but defaulting to the name provided by the {@link
 * PTransform}.
 */
public <OutputT extends POutput> OutputT apply(
    PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
  return Pipeline.applyTransform(this, transform);
}
/**
 * Like {@link #apply(String, PTransform)} but defaulting to the name of the {@code PTransform}.
 */
public <OutputT extends POutput> OutputT apply(PTransform<PCollectionList<T>, OutputT> t) {
  return Pipeline.applyTransform(this, t);
}
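A minimal usage sketch; Flatten is the typical transform applied to a PCollectionList, and the member collections logsA and logsB are assumptions.

// Merge two collections into one; the application takes Flatten's default name.
PCollection<String> merged =
    PCollectionList.of(logsA).and(logsB).apply(Flatten.pCollections());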
/**
 * Applies the given {@link PTransform} to this input {@link PCollectionTuple}, using {@code name}
 * to identify this specific application of the transform. This name is used in various places,
 * including the monitoring UI, logging, and to stably identify this application node in the job
 * graph.
 *
 * @return the output of the applied {@link PTransform}
 */
public <OutputT extends POutput> OutputT apply(
    String name, PTransform<? super PCollectionTuple, OutputT> t) {
  return Pipeline.applyTransform(name, this, t);
}
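A minimal usage sketch, assuming a multi-output ParDo that produces a PCollectionTuple and a hypothetical composite transform WriteBranches of type PTransform<PCollectionTuple, PDone> that consumes both branches; ValidateFn, validTag, and invalidTag are likewise assumptions.

PCollectionTuple branches = lines.apply(
    ParDo.of(new ValidateFn()) // hypothetical DoFn with a main and an error output
        .withOutputTags(validTag, TupleTagList.of(invalidTag)));
// The explicit name identifies this application in the monitoring UI and logs.
branches.apply("WriteBranches", new WriteBranches()); // hypothetical composite transform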