public static String getSchema(String schemaPath) throws IOException {
  ReadableByteChannel chan = FileSystems.open(FileSystems.matchNewResource(
      schemaPath, false));

  try (InputStream stream = Channels.newInputStream(chan)) {
    BufferedReader streamReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    StringBuilder dataBuilder = new StringBuilder();

    String line;
    while ((line = streamReader.readLine()) != null) {
      dataBuilder.append(line);
    }

    return dataBuilder.toString();
  }
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline.apply("Read Avro files",
      AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
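The ConvertAvroToCsv DoFn referenced above is defined elsewhere in the sample. A minimal sketch of what it might look like, assuming it simply joins the record's top-level fields with the configured delimiter; the real implementation may quote or escape values differently:

// Hedged sketch, not the sample's actual implementation.
static class ConvertAvroToCsv extends DoFn<GenericRecord, String> {
  private final String schemaJson;
  private final String delimiter;
  private transient Schema schema;

  ConvertAvroToCsv(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @Setup
  public void setup() {
    // Parse the schema once per DoFn instance rather than once per element.
    schema = new Schema.Parser().parse(schemaJson);
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    GenericRecord record = c.element();
    StringBuilder row = new StringBuilder();
    for (Schema.Field field : schema.getFields()) {
      if (row.length() > 0) {
        row.append(delimiter);
      }
      // Null fields become empty strings; values are not quoted or escaped.
      Object value = record.get(field.name());
      row.append(value == null ? "" : value.toString());
    }
    c.output(row.toString());
  }
}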
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson).to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
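A minimal sketch of a main method that could launch either pipeline. The option names are inferred from the SampleOptions getters via Beam's PipelineOptionsFactory naming convention; the dispatch itself is an assumption, not part of the original sample:

public static void main(String[] args) throws IOException {
  // Hypothetical entry point; the real sample may expose separate mains, e.g.
  //   --avroSchema=gs://<bucket>/schema.avsc --inputFile=... --output=... --csvDelimiter=,
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runCsvToAvro(options);  // or runAvroToCsv(options)
}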
SpannerConfig spannerConfig = SpannerConfig.create()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId);

PCollectionView<Transaction> tx = p.apply(
    SpannerIO.createTransaction()
        .withSpannerConfig(spannerConfig)
        .withTimestampBound(TimestampBound.strong()));

PCollection<Struct> singers = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerId, FirstName, LastName FROM Singers")
    .withTransaction(tx));

PCollection<Struct> albums = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerId, AlbumId, AlbumTitle FROM Albums")
    .withTransaction(tx));

singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    return Joiner.on(DELIMITER).join(input.getLong(0), input.getString(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    return Joiner.on(DELIMITER).join(input.getLong(0), input.getLong(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
          + ".table_catalog = '' AND t.table_schema = ''"))
      .apply(MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          }))
      .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create())
      .apply(Sum.longsGlobally());

  dbEstimatedSize.apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
PCollection<Struct> records = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withTable("Singers")
        .withColumns("singerId", "firstName", "lastName"));

records.apply(ToString.elements())
    .apply(TextIO.write().to(options.getOutput()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize = records
      // Estimate the size of every row
      .apply(EstimateSize.create())
      // Sum all the row sizes to get the total estimated size of the table
      .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
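Both read pipelines above rely on an EstimateSize transform that is defined elsewhere in the sample. A rough sketch of the idea, assuming a very crude per-column byte estimate; the real transform handles the full range of Spanner types more carefully:

// Hedged sketch of the EstimateSize transform referenced above.
static class EstimateSize extends PTransform<PCollection<Struct>, PCollection<Long>> {
  public static EstimateSize create() {
    return new EstimateSize();
  }

  @Override
  public PCollection<Long> expand(PCollection<Struct> input) {
    return input.apply(ParDo.of(new EstimateStructSizeFn()));
  }

  static class EstimateStructSizeFn extends DoFn<Struct, Long> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Struct row = c.element();
      long size = 0;
      for (int i = 0; i < row.getColumnCount(); i++) {
        if (row.isNull(i)) {
          continue;
        }
        // Crude estimate: string length for strings, 8 bytes for other scalars.
        size += row.getColumnType(i).getCode() == Type.Code.STRING
            ? row.getString(i).length()
            : 8;
      }
      c.output(size);
    }
  }
}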
PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

// 'mutations' is a PCollection<MutationGroup> built from suspiciousUserIds;
// the transformation is elided in this snippet (see the sketch below).
mutations.apply(SpannerIO.write()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId)
    .grouped());
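A minimal sketch of how suspiciousUserIds might be turned into the mutations collection. The Users table and its id/state columns are placeholders, not taken from the original snippet; each MutationGroup is committed atomically:

PCollection<MutationGroup> mutations = suspiciousUserIds.apply(
    MapElements.via(new SimpleFunction<String, MutationGroup>() {
      @Override
      public MutationGroup apply(String userId) {
        // Block the user; table and column names here are assumptions.
        Mutation blockUser = Mutation.newUpdateBuilder("Users")
            .set("id").to(userId)
            .set("state").to("BLOCKED")
            .build();
        return MutationGroup.create(blockUser);
      }
    }));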
@Override
protected FileBasedSource<String> getSource() {
  return CompressedSource.from(
          new TextSource(
              getFilepattern(),
              getMatchConfiguration().getEmptyMatchTreatment(),
              getDelimiter()))
      .withCompression(getCompression());
}
/** Reads Avro file(s) containing records of the specified schema. */
public static Read<GenericRecord> readGenericRecords(Schema schema) {
  return new AutoValue_AvroIO_Read.Builder<GenericRecord>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW))
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setHintMatchesManyFiles(false)
      .build();
}
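For context, a minimal usage sketch of this factory method; the schema variable and filepattern are placeholders:

PCollection<GenericRecord> records = p.apply(
    AvroIO.readGenericRecords(schema).from("gs://my-bucket/records-*.avro"));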
/**
 * Like {@link #parseGenericRecords(SerializableFunction)}, but reads each filepattern in the
 * input {@link PCollection}.
 */
public static <T> ParseAll<T> parseAllGenericRecords(
    SerializableFunction<GenericRecord, T> parseFn) {
  return new AutoValue_AvroIO_ParseAll.Builder<T>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD))
      .setParseFn(parseFn)
      .setDesiredBundleSizeBytes(64 * 1024 * 1024L)
      .build();
}
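A minimal usage sketch, assuming the records carry a string field named "name" (a placeholder); a coder must be supplied because the output type cannot be inferred from the parse function:

PCollection<String> filepatterns = p.apply(Create.of("gs://my-bucket/part-*.avro"));
PCollection<String> names = filepatterns.apply(
    AvroIO.parseAllGenericRecords(
            (GenericRecord record) -> record.get("name").toString())
        .withCoder(StringUtf8Coder.of()));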
@Override
public long getSplitPointsRemaining() {
  if (isStarted() && startOfNextRecord >= getCurrentSource().getEndOffset()) {
    return isDone() ? 0 : 1;
  }
  return super.getSplitPointsRemaining();
}
/**
 * Reads Avro file(s) containing records of an unspecified schema, converting each record to a
 * custom type.
 */
public static <T> Parse<T> parseGenericRecords(SerializableFunction<GenericRecord, T> parseFn) {
  return new AutoValue_AvroIO_Parse.Builder<T>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW))
      .setParseFn(parseFn)
      .setHintMatchesManyFiles(false)
      .build();
}
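A minimal usage sketch of this single-filepattern variant; the field name and path are placeholders:

PCollection<String> names = p.apply(
    AvroIO.parseGenericRecords((GenericRecord record) -> record.get("name").toString())
        .from("gs://my-bucket/records-*.avro")
        .withCoder(StringUtf8Coder.of()));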
/** Writes Avro records of the specified schema. */
public static Write<GenericRecord> writeGenericRecords(Schema schema) {
  return new Write<>(
      AvroIO.<GenericRecord, GenericRecord>defaultWriteBuilder()
          .setGenericRecords(true)
          .setSchema(schema)
          .build());
}
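A minimal usage sketch; the output prefix is a placeholder:

records.apply(
    AvroIO.writeGenericRecords(schema)
        .to("gs://my-bucket/out/records")
        .withSuffix(".avro"));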
/** Writes elements to files using a {@link Sink}. See class-level documentation. */
public static <InputT> Write<Void, InputT> write() {
  return new AutoValue_FileIO_Write.Builder<Void, InputT>()
      .setDynamic(false)
      .setCompression(Compression.UNCOMPRESSED)
      .setIgnoreWindowing(false)
      .build();
}
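A minimal usage sketch pairing this entry point with TextIO.sink(); the destination is a placeholder:

lines.apply(
    FileIO.<String>write()
        .via(TextIO.sink())
        .to("gs://my-bucket/out")
        .withSuffix(".txt"));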
/** Creates a {@link MatchConfiguration} with the given {@link EmptyMatchTreatment}. */
public static MatchConfiguration create(EmptyMatchTreatment emptyMatchTreatment) {
  return new AutoValue_FileIO_MatchConfiguration.Builder()
      .setEmptyMatchTreatment(emptyMatchTreatment)
      .build();
}
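In user code this configuration is usually reached through the public match API rather than constructed directly; a sketch with a placeholder filepattern:

PCollection<MatchResult.Metadata> matches = p.apply(
    FileIO.match()
        .filepattern("gs://my-bucket/*.csv")
        .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW));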
/**
 * Preserves windowing of input elements and writes them to files based on the element's window.
 *
 * <p>If using {@link #to(FileBasedSink.FilenamePolicy)}, filenames will be generated using
 * {@link FilenamePolicy#windowedFilename}. See also {@link WriteFiles#withWindowedWrites()}.
 */
public TypedWrite<UserT, DestinationT, OutputT> withWindowedWrites() {
  return toBuilder().setWindowedWrites(true).build();
}
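A minimal sketch of a windowed write, assuming five-minute fixed windows and an explicit shard count (which windowed/unbounded writes typically require); names and the destination are placeholders:

events
    .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(5))))
    .apply(TextIO.write()
        .to("gs://my-bucket/out/events")
        .withWindowedWrites()
        .withNumShards(1));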
/**
 * Uses the given shard name template.
 *
 * @see ShardNameTemplate
 */
public Write withShardNameTemplate(String shardTemplate) {
  return toBuilder().setShardTemplate(shardTemplate).build();
}
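A minimal usage sketch; "-SS-of-NN" is the standard index/max template from ShardNameTemplate, and the output prefix is a placeholder:

lines.apply(TextIO.write()
    .to("gs://my-bucket/out/part")
    .withShardNameTemplate("-SS-of-NN")
    .withSuffix(".txt"));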
p.apply("ReadSingers", TextIO.read().from(options.getSingersFilename())) .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("WriteAlbums", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId));