@Override
public PCollection<Long> expand(PCollection<Struct> input) {
  return input.apply(ParDo.of(new EstimateStructSizeFn()));
}
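// A minimal sketch of what EstimateStructSizeFn might look like: a DoFn that emits a
// rough byte-size estimate per com.google.cloud.spanner.Struct. The per-type size rules
// here are illustrative assumptions, not the sample's actual implementation.
static class EstimateStructSizeFn extends DoFn<Struct, Long> {
  @ProcessElement
  public void processElement(ProcessContext c) {
    Struct row = c.element();
    long sum = 0;
    for (int i = 0; i < row.getColumnCount(); i++) {
      if (row.isNull(i)) {
        continue;
      }
      switch (row.getColumnType(i).getCode()) {
        case INT64:
        case FLOAT64:
          sum += 8; // assumed fixed width for 64-bit numeric columns
          break;
        case BOOL:
          sum += 1;
          break;
        case STRING:
          sum += row.getString(i).length();
          break;
        case BYTES:
          sum += row.getBytes(i).length();
          break;
        default:
          break; // other types (arrays, timestamps, ...) ignored in this sketch
      }
    }
    c.output(sum);
  }
}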
@Override
public PCollection<T> expand(PBegin input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED, coder);
}
/**
 * Returns a singleton {@link PCollectionList} containing the given {@link PCollection}.
 *
 * <p>Longer {@link PCollectionList PCollectionLists} can be created by calling {@link #and} on
 * the result.
 */
public static <T> PCollectionList<T> of(PCollection<T> pc) {
  return new PCollectionList<T>(pc.getPipeline()).and(pc);
}
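// Usage sketch: building a three-element PCollectionList by chaining and(...) on the
// singleton returned by of(...). The PCollections first, second, and third are assumed
// to exist in the enclosing pipeline and share an element type.
PCollectionList<String> list = PCollectionList.of(first).and(second).and(third);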
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
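// Illustrative entry point for the conversion above, assuming SampleOptions is a
// PipelineOptions subinterface declaring getInputFile(), getOutput(), getAvroSchema(),
// and getCsvDelimiter().
public static void main(String[] args) throws IOException {
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runCsvToAvro(options);
}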
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withInstanceId(options.getInstanceId())
          .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords =
      p.apply(
              SpannerIO.read()
                  .withSpannerConfig(spannerConfig)
                  .withQuery(
                      "SELECT t.table_name FROM information_schema.tables AS t"
                          + " WHERE t.table_catalog = '' AND t.table_schema = ''"))
          .apply(
              MapElements.into(TypeDescriptor.of(ReadOperation.class))
                  .via(
                      (SerializableFunction<Struct, ReadOperation>)
                          input -> {
                            String tableName = input.getString(0);
                            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
                          }))
          .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize =
      allRecords.apply(EstimateSize.create()).apply(Sum.longsGlobally());

  dbEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  return PDone.in(input.getPipeline());
}
@Override
public final PCollection<ValueWithRecordId<T>> expand(PInput input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      IsBounded.UNBOUNDED,
      ValueWithRecordId.ValueWithRecordIdCoder.of(source.getOutputCoder()));
}
@Override
public <T> SerializableFunction<T, Row> toRowFunction(TypeDescriptor<T> typeDescriptor) {
  if (typeDescriptor.equals(TypeDescriptor.of(TestSchemaClass.class))) {
    return v -> Row.withSchema(EMPTY_SCHEMA).build();
  }
  return null;
}
@Test
public void testTypeDescriptorImmediate() throws Exception {
  assertEquals(Boolean.class, new TypeDescriptor<Boolean>() {}.getRawType());
  assertEquals(Double.class, new TypeDescriptor<Double>() {}.getRawType());
  assertEquals(Float.class, new TypeDescriptor<Float>() {}.getRawType());
  assertEquals(Integer.class, new TypeDescriptor<Integer>() {}.getRawType());
  assertEquals(Long.class, new TypeDescriptor<Long>() {}.getRawType());
  assertEquals(Short.class, new TypeDescriptor<Short>() {}.getRawType());
  assertEquals(String.class, new TypeDescriptor<String>() {}.getRawType());
}
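// Companion sketch (not from the original suite): the same anonymous-subclass trick also
// captures parameterized types, whose raw type resolves to the erased class. Assumes
// java.util.List is imported alongside the JUnit imports used above.
@Test
public void testTypeDescriptorParameterized() throws Exception {
  TypeDescriptor<List<String>> descriptor = new TypeDescriptor<List<String>>() {};
  assertEquals(List.class, descriptor.getRawType());
}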
/** The {@link TypeDescriptor} for {@link Map}. */
public static <K, V> TypeDescriptor<Map<K, V>> maps(
    TypeDescriptor<K> keyType, TypeDescriptor<V> valueType) {
  TypeDescriptor<Map<K, V>> typeDescriptor =
      new TypeDescriptor<Map<K, V>>() {}
          .where(new TypeParameter<K>() {}, keyType)
          .where(new TypeParameter<V>() {}, valueType);
  return typeDescriptor;
}
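// Usage sketch: composing a descriptor for Map<String, Integer> from the built-in
// string and integer descriptors in org.apache.beam.sdk.values.TypeDescriptors.
TypeDescriptor<Map<String, Integer>> mapType =
    TypeDescriptors.maps(TypeDescriptors.strings(), TypeDescriptors.integers());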
@Override
public PCollection<T> expand(PBegin input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.BOUNDED, coder);
}
@Override
public <T> SerializableFunction<T, Row> toRowFunction(TypeDescriptor<T> typeDescriptor) {
  if (typeDescriptor.equals(TypeDescriptor.of(TestDefaultSchemaClass.class))) {
    return v -> Row.withSchema(EMPTY_SCHEMA).build();
  }
  return null;
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records =
      p.apply(
          SpannerIO.read()
              .withInstanceId(instanceId)
              .withDatabaseId(databaseId)
              .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize =
      records
          // Estimate the size of every row
          .apply(EstimateSize.create())
          // Sum all the row sizes to get the total estimated size of the table
          .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
@Override
public PCollection<T> expand(PBegin input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      PCollection.IsBounded.UNBOUNDED,
      coder);
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro to CSV
  pipeline.apply("Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
@Override
public final PCollection<T> expand(PBegin input) {
  source.validate();
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      IsBounded.UNBOUNDED,
      source.getOutputCoder());
}
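// Usage sketch: this expand() runs when an unbounded Read is applied at the pipeline
// root via Read.from(...). MyEvent and MyUnboundedEventSource are hypothetical names,
// standing in for any UnboundedSource implementation.
PCollection<MyEvent> events = p.apply(Read.from(new MyUnboundedEventSource()));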
.apply("ParseSingers", ParDo.of(new ParseSinger())) .apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("ParseAlbums", ParDo.of(new ParseAlbum())); .apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteAlbums", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId));
singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
      @Override
      public String apply(Struct input) {
        return Joiner.on(DELIMITER)
            .join(input.getLong(0), input.getString(1), input.getString(2));
      }
    }))
    .apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
      @Override
      public String apply(Struct input) {
        return Joiner.on(DELIMITER)
            .join(input.getLong(0), input.getLong(1), input.getString(2));
      }
    }))
    .apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());