public static void runAvroToCsv(SampleOptions options) throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro to CSV
  pipeline
      .apply("Read Avro files", AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply(
          "Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply(
          "Write CSV formatted data", TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
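The ConvertAvroToCsv DoFn referenced above is not shown in this snippet. A minimal sketch of what it might look like, assuming it simply joins each GenericRecord's field values with the configured delimiter (a real implementation would likely also quote or escape values):

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.transforms.DoFn;

static class ConvertAvroToCsv extends DoFn<GenericRecord, String> {
  private final String schemaJson;
  private final String delimiter;

  ConvertAvroToCsv(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // The schema is carried as a JSON string because Avro's Schema class is not
    // Serializable; field order determines the CSV column order.
    Schema schema = new Schema.Parser().parse(schemaJson);
    StringBuilder row = new StringBuilder();
    for (Schema.Field field : schema.getFields()) {
      if (row.length() > 0) {
        row.append(delimiter);
      }
      row.append(c.element().get(field.name()));
    }
    c.output(row.toString());
  }
}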
// Fragment: the two writes below are the tails of MapElements transforms. The
// upstream PCollection<Struct>s (`singers` and `albums`, read from Spanner) are
// assumed here; they are not part of the original fragment.
singers
    .apply(
        MapElements.into(TypeDescriptors.strings())
            .via(
                (Struct input) ->
                    Joiner.on(DELIMITER)
                        .join(input.getLong(0), input.getString(1), input.getString(2))))
    .apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

albums
    .apply(
        MapElements.into(TypeDescriptors.strings())
            .via(
                (Struct input) ->
                    Joiner.on(DELIMITER)
                        .join(input.getLong(0), input.getLong(1), input.getString(2))))
    .apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records =
      p.apply(
          SpannerIO.read()
              .withInstanceId(instanceId)
              .withDatabaseId(databaseId)
              .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize =
      records
          // Estimate the size of every row
          .apply(EstimateSize.create())
          // Sum all the row sizes to get the total estimated size of the table
          .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
}
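EstimateSize is a helper transform from the same sample and is not shown here. A rough sketch, assuming a crude per-row estimate (the length of the row's string rendering) is acceptable; the real sample walks the columns by type to compute encoded sizes:

import com.google.cloud.spanner.Struct;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

static class EstimateSize extends PTransform<PCollection<Struct>, PCollection<Long>> {
  public static EstimateSize create() {
    return new EstimateSize();
  }

  @Override
  public PCollection<Long> expand(PCollection<Struct> input) {
    // Hypothetical estimate: the string form's length, not the true encoded size.
    return input.apply(
        MapElements.into(TypeDescriptors.longs())
            .via((Struct row) -> (long) row.toString().length()));
  }
}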
// Fragment: tail of a pipeline that writes the upstream string elements to a
// single (unsharded) output file.
.apply(TextIO.write().to(options.getOutput()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withInstanceId(options.getInstanceId())
          .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords =
      p.apply(
              SpannerIO.read()
                  .withSpannerConfig(spannerConfig)
                  .withQuery(
                      "SELECT t.table_name FROM information_schema.tables AS t WHERE"
                          + " t.table_catalog = '' AND t.table_schema = ''"))
          .apply(
              MapElements.into(TypeDescriptor.of(ReadOperation.class))
                  .via(
                      (SerializableFunction<Struct, ReadOperation>)
                          input -> {
                            String tableName = input.getString(0);
                            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
                          }))
          .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize =
      allRecords.apply(EstimateSize.create()).apply(Sum.longsGlobally());

  dbEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
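// Note on the pattern above: the pipeline first queries information_schema.tables
// for the database's table names, maps each name to a ReadOperation, and then lets
// SpannerIO.readAll() execute all the per-table reads inside a single transform,
// so the set of tables does not need to be known when the pipeline is constructed.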
@Override
public PDone buildIOWriter(PCollection<Row> input) {
  return input
      .apply("RowToString", writeConverter)
      .apply("WriteTextFiles", TextIO.write().withDelimiter(new char[] {}).to(filePattern));
}
}
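// Passing an empty char[] to withDelimiter() suppresses the newline that
// TextIO.write() would otherwise append after each element; presumably the
// writeConverter already terminates every row with its own record separator.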
@Override
public PDone expand(PCollectionTuple pCollectionTuple) {
  return pCollectionTuple
      .get(errorTag())
      .apply(TextIO.write().to(errorWritePath()).withNumShards(1));
}
@Override
public PDone expand(PBegin begin) {
  return begin.apply(Create.of(LINES)).apply(TextIO.write().to(filename));
}
}
@Test
public void testGetName() {
  assertEquals("TextIO.Write", TextIO.write().to("somefile").getName());
}
@Test
public void testWriteDisplayDataValidateThenHeader() {
  TextIO.Write write = TextIO.write().to("foo").withHeader("myHeader");

  DisplayData displayData = DisplayData.from(write);

  assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
}
@Test
public void testWriteDisplayDataValidateThenFooter() {
  TextIO.Write write = TextIO.write().to("foo").withFooter("myFooter");

  DisplayData displayData = DisplayData.from(write);

  assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
}
public static void run(Options options) {
  Pipeline p = Pipeline.create(options);
  double samplingThreshold = 0.1;

  p.apply(TextIO.read().from(options.getWikiInput()))
      .apply(MapElements.via(new ParseTableRowJson()))
      .apply(new ComputeTopSessions(samplingThreshold))
      .apply("Write", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
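ParseTableRowJson is defined elsewhere in this example. A minimal sketch, assuming each input line is a JSON-encoded BigQuery TableRow (the Transport import path may vary by Beam version):

import java.io.IOException;
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.extensions.gcp.util.Transport;
import org.apache.beam.sdk.transforms.SimpleFunction;

static class ParseTableRowJson extends SimpleFunction<String, TableRow> {
  @Override
  public TableRow apply(String input) {
    try {
      // Decode one line of JSON into a TableRow.
      return Transport.getJsonFactory().fromString(input, TableRow.class);
    } catch (IOException e) {
      throw new RuntimeException("Failed parsing table row json", e);
    }
  }
}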
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
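CountWords and FormatAsTextFn come from the WordCount example and are not shown here. A minimal sketch of FormatAsTextFn, assuming it renders each word/count pair as one text line for TextIO.write():

import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;

public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
  @Override
  public String apply(KV<String, Long> input) {
    // One output line per word, e.g. "beam: 42".
    return input.getKey() + ": " + input.getValue();
  }
}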
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInput()))
      .apply(Distinct.create())
      .apply("DedupedShakespeare", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
}
@Test
public void testRuntimeOptionsNotCalledInApply() throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);

  p.apply(Create.of("")).apply(TextIO.write().to(options.getOutput()));
}
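This test only passes if TextIO.write().to(...) defers reading the option until runtime, which requires RuntimeTestOptions to expose its values as ValueProviders. A sketch of what such an options interface might look like (the getter names mirror the ones used above; the real definition is not shown here):

import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;

public interface RuntimeTestOptions extends PipelineOptions {
  // ValueProvider values may remain unset at graph-construction time.
  ValueProvider<String> getInput();
  void setInput(ValueProvider<String> value);

  ValueProvider<String> getOutput();
  void setOutput(ValueProvider<String> value);
}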
public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run();
}
}
private Pipeline buildPipeline(DataflowPipelineOptions options) {
  options.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(p);

  return p;
}
private Pipeline buildDataflowPipeline(DataflowPipelineOptions options) {
  options.setStableUniqueNames(CheckEnabled.ERROR);
  options.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));

  // Enable the FileSystems API to know about gs:// URIs in this test.
  FileSystems.setDefaultPipelineOptions(options);

  return p;
}
@Test
public void testTextIOWithRuntimeParameters() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);

  p.apply(TextIO.read().from(options.getInput())).apply(TextIO.write().to(options.getOutput()));
}
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Count.perElement())
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
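FormatAsStringFn is defined elsewhere in this example. A minimal sketch, assuming it formats each per-word count as a "word: count" line so TextIO.write() can emit it:

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  @ProcessElement
  public void processElement(ProcessContext c) {
    // Render one "word: count" line per element.
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}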