public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withInstanceId(options.getInstanceId())
          .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords =
      p.apply(
              SpannerIO.read()
                  .withSpannerConfig(spannerConfig)
                  .withQuery(
                      "SELECT t.table_name FROM information_schema.tables AS t WHERE"
                          + " t.table_catalog = '' AND t.table_schema = ''"))
          .apply(
              MapElements.into(TypeDescriptor.of(ReadOperation.class))
                  .via(
                      (SerializableFunction<Struct, ReadOperation>)
                          input -> {
                            String tableName = input.getString(0);
                            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
                          }))
          .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize =
      allRecords.apply(EstimateSize.create()).apply(Sum.longsGlobally());

  dbEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
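// A minimal sketch of the Options interface assumed by the pipeline above; it is
// not part of the original snippet. The getter names match the calls made in
// main(), and the annotations follow the usual Beam PipelineOptions pattern.
public interface Options extends PipelineOptions {
  @Description("Cloud Spanner instance ID to query")
  @Validation.Required
  String getInstanceId();

  void setInstanceId(String value);

  @Description("Cloud Spanner database ID to query")
  @Validation.Required
  String getDatabaseId();

  void setDatabaseId(String value);

  @Description("Output file prefix for the estimated size")
  String getOutput();

  void setOutput(String value);
}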
@Override
public PCollection<String> expand(PCollection<Row> input) {
  return input.apply(
      "rowToCsv",
      MapElements.into(TypeDescriptors.strings()).via(row -> beamRow2CsvLine(row, csvFormat)));
}
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  List<PCollectionView<?>> views = Lists.newArrayList();
  for (int i = 0; i < signals.size(); ++i) {
    views.add(signals.get(i).apply("To wait view " + i, new ToWaitView()));
  }
  return input.apply(
      "Wait",
      MapElements.into(input.getCoder().getEncodedTypeDescriptor())
          .via(fn((t, c) -> t, requiresSideInputs(views))));
}
}
@Override
public PCollection<String> expand(PCollection<Row> input) {
  return input.apply(
      "rowsToLines",
      MapElements.into(TypeDescriptors.strings()).via((Row row) -> row.getString(0) + "\n"));
}
}
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));
  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via(MatchResult.Metadata::resourceId))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}
@Override
public PCollection<Row> expand(PCollection<String> input) {
  return input
      .apply(
          "linesToRows",
          MapElements.into(TypeDescriptors.rows())
              .via(s -> Row.withSchema(SCHEMA).addValue(s).build()))
      .setRowSchema(SCHEMA);
}
}
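// The SCHEMA constant is not shown in the snippet above. Since each Row is built
// from a single string value, a minimal sketch is a one-field schema; the field
// name "line" is an assumption, not taken from the original source.
private static final Schema SCHEMA = Schema.builder().addStringField("line").build();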
@Test
public void testSerializableFunctionDisplayData() {
  SerializableFunction<Integer, Integer> serializableFn = input -> input;
  MapElements<?, ?> serializableMap = MapElements.into(integers()).via(serializableFn);
  assertThat(
      DisplayData.from(serializableMap), hasDisplayItem("class", serializableFn.getClass()));
}
@Override
public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> gameInfo) {
  return gameInfo
      .apply(
          MapElements.into(
                  TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
              .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())))
      .apply(Sum.integersPerKey());
}
}
private static void runReadPipeline(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("Find files", FileIO.match().filepattern(options.getInput()))
      .apply("Read matched files", FileIO.readMatches())
      .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
      .apply("Map records to strings", MapElements.into(strings()).via(new GetRecordsFn()));

  pipeline.run();
}
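// A minimal sketch of the GetRecordsFn helper assumed above: ParquetIO.readFiles
// yields Avro GenericRecords, so the function only needs to render each record as
// a string. The toString() rendering is an assumption; the original may differ.
private static class GetRecordsFn implements SerializableFunction<GenericRecord, String> {
  @Override
  public String apply(GenericRecord record) {
    return record.toString();
  }
}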
/**
 * Basic test of {@link MapElements} with a {@link SerializableFunction}. This style is generally
 * discouraged in Java 7, in favor of {@link SimpleFunction}.
 */
@Test
@Category(NeedsRunner.class)
public void testMapBasicSerializableFunction() throws Exception {
  PCollection<Integer> output =
      pipeline.apply(Create.of(1, 2, 3)).apply(MapElements.into(integers()).via(input -> -input));

  PAssert.that(output).containsInAnyOrder(-2, -1, -3);
  pipeline.run();
}
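// For contrast with the Javadoc above: the same negation written in the
// SimpleFunction style it recommends. The anonymous subclass carries its own
// type information, so no .into(...) type descriptor is needed.
PCollection<Integer> output =
    pipeline
        .apply(Create.of(1, 2, 3))
        .apply(
            MapElements.via(
                new SimpleFunction<Integer, Integer>() {
                  @Override
                  public Integer apply(Integer input) {
                    return -input;
                  }
                }));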
/** A basic smoke test that ensures there is no crash at pipeline construction time. */
@Test
public void testMinimalWordCount() throws Exception {
  p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());

  p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
      .apply(
          FlatMapElements.into(TypeDescriptors.strings())
              .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))))
      .apply(Filter.by((String word) -> !word.isEmpty()))
      .apply(Count.perElement())
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<String, Long> wordCount) ->
                      wordCount.getKey() + ": " + wordCount.getValue()))
      .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix"));
}
/** Basic test of {@link MapElements} with a method reference. */
@Test
@Category(NeedsRunner.class)
public void testMapMethodReference() throws Exception {
  PCollection<Integer> output =
      pipeline
          .apply(Create.of(1, 2, 3))
          .apply(
              MapElements
                  // Note that the type annotation is required.
                  .into(TypeDescriptors.integers())
                  .via(new Doubler()::doubleIt));

  PAssert.that(output).containsInAnyOrder(6, 2, 4);
  pipeline.run();
}
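// A minimal sketch of the Doubler helper referenced above; it is not shown in
// the original snippet. It must be Serializable so the bound method reference
// new Doubler()::doubleIt can be shipped to workers.
private static class Doubler implements Serializable {
  public int doubleIt(int val) {
    return val * 2;
  }
}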
/**
 * Basic test of {@link MapElements} with a lambda (which is instantiated as a {@link
 * SerializableFunction}).
 */
@Test
@Category(NeedsRunner.class)
public void testMapLambda() throws Exception {
  PCollection<Integer> output =
      pipeline
          .apply(Create.of(1, 2, 3))
          .apply(
              MapElements
                  // Note that the type annotation is required.
                  .into(TypeDescriptors.integers())
                  .via((Integer i) -> i * 2));

  PAssert.that(output).containsInAnyOrder(6, 2, 4);
  pipeline.run();
}
/** Basic test of {@link MapElements} with a {@link Fn} and a side input. */
@Test
@Category(NeedsRunner.class)
public void testMapBasicWithSideInput() throws Exception {
  final PCollectionView<Integer> view =
      pipeline.apply("Create base", Create.of(40)).apply(View.asSingleton());
  PCollection<Integer> output =
      pipeline
          .apply(Create.of(0, 1, 2))
          .apply(
              MapElements.into(integers())
                  .via(fn((element, c) -> element + c.sideInput(view), requiresSideInputs(view))));

  PAssert.that(output).containsInAnyOrder(40, 41, 42);
  pipeline.run();
}
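// The unqualified fn(...) and requiresSideInputs(...) in the test above are
// static imports from the Beam SDK: fn builds a Contextful closure, and
// requiresSideInputs declares the side inputs that closure reads.
import static org.apache.beam.sdk.transforms.Contextful.fn;
import static org.apache.beam.sdk.transforms.Requirements.requiresSideInputs;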
@Test
@Category(NeedsRunner.class)
public void testComposeBasicSerializableFunction() throws Exception {
  PCollection<Integer> output =
      pipeline
          .apply(Create.of(1, 2, 3))
          .apply(
              PTransform.compose(
                  (PCollection<Integer> numbers) -> {
                    PCollection<Integer> inverted =
                        numbers.apply(MapElements.into(integers()).via(input -> -input));
                    return PCollectionList.of(numbers).and(inverted).apply(Flatten.pCollections());
                  }));

  PAssert.that(output).containsInAnyOrder(-2, -1, -3, 2, 1, 3);
  pipeline.run();
}
}
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> result =
      begin
          .apply(
              Create.of(DUMMY_ROW)
                  .withSchema(
                      DUMMY_SCHEMA,
                      SerializableFunctions.identity(),
                      SerializableFunctions.identity()))
          .apply(SqlTransform.query("SELECT " + expr))
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));

  PAssert.that(result)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });

  return PDone.in(begin.getPipeline());
}
}
/** Test that bad input data is dropped appropriately. */
@Test
@Category(ValidatesRunner.class)
public void testUserScoresBadInput() throws Exception {
  PCollection<String> input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));

  PCollection<KV<String, Integer>> extract =
      input
          .apply(ParDo.of(new ParseEventFn()))
          .apply(
              MapElements.into(
                      TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                  .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

  PAssert.that(extract).empty();

  p.run().waitUntilFinish();
}
}
@Test
public void testUnboundedSourceLogAppendTimestamps() {
  // LogAppendTime (server-side timestamp) for records is set based on the record index
  // in MockConsumer above. Ensure that those exact timestamps are set by the source.
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, null).withLogAppendTime().withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply(
              MapElements.into(TypeDescriptors.longs())
                  .via(t -> LOG_APPEND_START_TIME.plus(Duration.standardSeconds(t)).getMillis()))
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());

  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
/** Test the filtering. */
@Test
@Category(ValidatesRunner.class)
public void testUserScoresFilter() throws Exception {
  final Instant startMinTimestamp = new Instant(1447965680000L);

  PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));

  PCollection<KV<String, Integer>> output =
      input
          .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
          .apply(
              "FilterStartTime",
              Filter.by(
                  (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
          // Run a map to access the fields in the result.
          .apply(
              MapElements.into(
                      TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                  .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

  PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS);

  p.run().waitUntilFinish();
}
@Test
public void testReadAllRecordsInDb() throws Exception {
  SpannerConfig spannerConfig = createSpannerConfig();

  PCollectionView<Transaction> tx =
      p.apply(
          SpannerIO.createTransaction()
              .withSpannerConfig(spannerConfig)
              .withTimestampBound(TimestampBound.strong()));

  PCollection<Struct> allRecords =
      p.apply(
              SpannerIO.read()
                  .withSpannerConfig(spannerConfig)
                  .withBatching(false)
                  .withQuery(
                      "SELECT t.table_name FROM information_schema.tables AS t WHERE"
                          + " t.table_catalog = '' AND t.table_schema = ''"))
          .apply(
              MapElements.into(TypeDescriptor.of(ReadOperation.class))
                  .via(
                      (SerializableFunction<Struct, ReadOperation>)
                          input -> {
                            String tableName = input.getString(0);
                            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
                          }))
          .apply(SpannerIO.readAll().withTransaction(tx).withSpannerConfig(spannerConfig));

  PAssert.thatSingleton(allRecords.apply("Count rows", Count.globally())).isEqualTo(5L);

  p.run();
}