/**
 * Expands a key pattern into a {@code PCollection} of key/value pairs read from Redis.
 *
 * <p>Requires a connection configuration; the pattern is first expanded to the
 * matching keys, which are then read in batches via {@code RedisIO.readAll()}.
 */
@Override
public PCollection<KV<String, String>> expand(PBegin input) {
  // Fail fast if the transform was never configured with a connection.
  checkArgument(connectionConfiguration() != null, "withConnectionConfiguration() is required");

  // Materialize the single key pattern as a PCollection, resolve it to
  // concrete keys, then fetch the values for those keys.
  PCollection<String> pattern = input.apply(Create.of(keyPattern()));
  PCollection<String> keys =
      pattern.apply(ParDo.of(new ReadKeysWithPattern(connectionConfiguration())));
  return keys.apply(
      RedisIO.readAll()
          .withConnectionConfiguration(connectionConfiguration())
          .withBatchSize(batchSize()));
}
}
@Test
@Category(NeedsRunner.class)
public void testParDoOutputWithTimestamp() {
  // Each element's value doubles as its (expected) timestamp.
  PCollection<Integer> source = pipeline.apply(Create.of(Arrays.asList(3, 42, 6)));

  // Stamp, shift by zero (a no-op shift), then render element + timestamp.
  PCollection<String> formatted =
      source
          .apply(ParDo.of(new TestOutputTimestampDoFn<>()))
          .apply(ParDo.of(new TestShiftTimestampDoFn<>(Duration.ZERO, Duration.ZERO)))
          .apply(ParDo.of(new TestFormatTimestampDoFn<>()));

  PAssert.that(formatted)
      .containsInAnyOrder(
          "processing: 3, timestamp: 3",
          "processing: 42, timestamp: 42",
          "processing: 6, timestamp: 6");
  pipeline.run();
}
// NOTE(review): this span looks like a truncated paste of a Spanner write
// example — both anonymous DoFn bodies are cut off mid-definition and the
// .apply(...) chains have no receiver, so it is not compilable as-is.
// TODO: recover the full example from its original source before editing.
.apply("ParseSingers", ParDo.of(new ParseSinger()))
.apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() {
  @ProcessElement
  public void processElement(ProcessContext c) {
.apply("WriteSingers", SpannerIO.write()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId));
.apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename()))
.apply("ParseAlbums", ParDo.of(new ParseAlbum()));
.apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() {
  @ProcessElement
  public void processElement(ProcessContext c) {
@Test
@Category(ValidatesRunner.class)
public void keyBasedOnCountFnManyElements() {
  // 10^10 elements with no extra shards should map to 10 shards.
  DoFn<Long, Integer> fn = new CalculateShardsFn(0);
  long elementCount = 10_000_000_000L; // same value as (long) Math.pow(10, 10)
  int expectedShards = 10;
  PAssert.that(p.apply(Create.of(elementCount)).apply(ParDo.of(fn)))
      .containsInAnyOrder(expectedShards);
  p.run().waitUntilFinish();
}
public static void runAvroToCsv(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert Avro To CSV pipeline.apply("Read Avro files", AvroIO.readGenericRecords(schemaJson).from(options.getInputFile())) .apply("Convert Avro to CSV formatted data", ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter()))) .apply("Write CSV formatted data", TextIO.write().to(options.getOutput()) .withSuffix(".csv")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
@Test
@Category(ValidatesRunner.class)
public void testParDo() {
  // The satisfies() matcher checks the DoFn's output against the same inputs.
  List<Integer> elements = Arrays.asList(3, -42, 666);
  PCollection<Integer> source = pipeline.apply(Create.of(elements));
  PCollection<String> output = source.apply(ParDo.of(new TestDoFn()));
  PAssert.that(output).satisfies(ParDoTest.HasExpectedOutput.forInput(elements));
  pipeline.run();
}
public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert CSV to Avro pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile())) .apply("Convert CSV to Avro formatted data", ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter()))) .setCoder(AvroCoder.of(GenericRecord.class, schema)) .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson) .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
@Test
@Category(ValidatesRunner.class)
public void keyBasedOnCountFnManyElementsExtraShards() {
  // 10^10 elements with 3 extra shards should map to 13 shards.
  CalculateShardsFn fn = new CalculateShardsFn(3);
  long elementCount = 10_000_000_000L; // same value as (long) Math.pow(10, 10)
  int expectedShards = 13;
  PAssert.that(p.apply(Create.of(elementCount)).apply(ParDo.of(fn)))
      .containsInAnyOrder(expectedShards);
  p.run().waitUntilFinish();
}
@Test @Category(NeedsRunner.class) public void testDefaultCoder() throws Exception { p.enableAbandonedNodeEnforcement(true); // Use MyRecord as input and output types without explicitly specifying // a coder (this uses the default coders, which may not be // SerializableCoder). PCollection<String> output = p.apply(Create.of("Hello", "World")) .apply(ParDo.of(new StringToRecord())) .apply(ParDo.of(new RecordToString())); PAssert.that(output).containsInAnyOrder("Hello", "World"); p.run(); }
@Test @Category(NeedsRunner.class) public void testParDoWritingToUndeclaredTag() { List<Integer> inputs = Arrays.asList(3, -42, 666); TupleTag<String> notOutputTag = new TupleTag<String>("additional") {}; pipeline .apply(Create.of(inputs)) .apply( ParDo.of(new TestDoFn(Arrays.asList(), Arrays.asList(notOutputTag))) // No call to .withOutputTags - should cause error ); thrown.expectMessage("additional"); pipeline.run(); }
@Test
@Category(ValidatesRunner.class)
public void testFilterSingleMonthDataFn() {
  // Filtering on month 7 should keep only outRow2 of the three rows.
  int month = 7;
  PCollection<TableRow> rows = p.apply(Create.of(outRow1, outRow2, outRow3));
  PCollection<TableRow> filtered = rows.apply(ParDo.of(new FilterSingleMonthDataFn(month)));
  PAssert.that(filtered).containsInAnyOrder(outRow2);
  p.run().waitUntilFinish();
}
}
@Test
@Category(ValidatesRunner.class)
public void testProjectionFn() {
  // Projecting each input row should yield the corresponding projected row.
  PCollection<TableRow> rows = p.apply(Create.of(row1, row2, row3));
  PCollection<TableRow> projected = rows.apply(ParDo.of(new ProjectionFn()));
  PAssert.that(projected).containsInAnyOrder(outRow1, outRow2, outRow3);
  p.run().waitUntilFinish();
}
@Test
@Category(ValidatesRunner.class)
public void keyBasedOnCountFnFewElementsExtraShards() {
  // Just past the logarithmic-sharding threshold, with 3 extra shards,
  // the expected shard count is 6.
  CalculateShardsFn fn = new CalculateShardsFn(3);
  long elementCount = (long) WriteWithShardingFactory.MIN_SHARDS_FOR_LOG + 3;
  int expectedShards = 6;
  PAssert.that(p.apply(Create.of(elementCount)).apply(ParDo.of(fn)))
      .containsInAnyOrder(expectedShards);
  p.run().waitUntilFinish();
}
@Test
@Category(NeedsRunner.class)
public void testParDoWithErrorInStartBatch() {
  pipeline
      .apply(Create.of(Arrays.asList(3, -42, 666)))
      .apply(ParDo.of(new TestStartBatchErrorDoFn()));

  // The failure surfaces when the pipeline executes, not at construction time.
  thrown.expect(RuntimeException.class);
  thrown.expectMessage("test error in initialize");
  pipeline.run();
}