public static String getSchema(String schemaPath) throws IOException {
  ReadableByteChannel chan = FileSystems.open(FileSystems.matchNewResource(
      schemaPath, false));

  try (InputStream stream = Channels.newInputStream(chan)) {
    BufferedReader streamReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    StringBuilder dataBuilder = new StringBuilder();

    String line;
    while ((line = streamReader.readLine()) != null) {
      dataBuilder.append(line);
    }

    return dataBuilder.toString();
  }
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline.apply("Read Avro files",
      AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
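The ConvertAvroToCsv DoFn referenced above is defined elsewhere in the sample. A minimal sketch of what it might look like, assuming it simply joins the record's top-level fields with the configured delimiter; the real implementation may quote or escape values differently:

// Hedged sketch, not the sample's actual implementation.
static class ConvertAvroToCsv extends DoFn<GenericRecord, String> {
  private final String schemaJson;
  private final String delimiter;
  private transient Schema schema;

  ConvertAvroToCsv(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @Setup
  public void setup() {
    // Parse the schema once per DoFn instance rather than once per element.
    schema = new Schema.Parser().parse(schemaJson);
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    GenericRecord record = c.element();
    StringBuilder row = new StringBuilder();
    for (Schema.Field field : schema.getFields()) {
      if (row.length() > 0) {
        row.append(delimiter);
      }
      // Null fields become empty strings; values are not quoted or escaped.
      Object value = record.get(field.name());
      row.append(value == null ? "" : value.toString());
    }
    c.output(row.toString());
  }
}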
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson).to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
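A minimal sketch of a main method that could launch either pipeline. The option names are inferred from the SampleOptions getters via Beam's PipelineOptionsFactory naming convention; the dispatch itself is an assumption, not part of the original sample:

public static void main(String[] args) throws IOException {
  // Hypothetical entry point; the real sample may expose separate mains, e.g.
  //   --avroSchema=gs://<bucket>/schema.avsc --inputFile=... --output=... --csvDelimiter=,
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runCsvToAvro(options);  // or runAvroToCsv(options)
}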
SpannerConfig spannerConfig = SpannerConfig.create()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId);

PCollectionView<Transaction> tx = p.apply(
    SpannerIO.createTransaction()
        .withSpannerConfig(spannerConfig)
        .withTimestampBound(TimestampBound.strong()));

PCollection<Struct> singers = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerId, FirstName, LastName FROM Singers")
    .withTransaction(tx));

PCollection<Struct> albums = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerId, AlbumId, AlbumTitle FROM Albums")
    .withTransaction(tx));

singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    return Joiner.on(DELIMITER).join(input.getLong(0), input.getString(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    return Joiner.on(DELIMITER).join(input.getLong(0), input.getLong(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords = p.apply(SpannerIO.read()
      .withSpannerConfig(spannerConfig)
      .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
          + ".table_catalog = '' AND t.table_schema = ''"))
      .apply(MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          }))
      .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create())
      .apply(Sum.longsGlobally());

  dbEstimatedSize.apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
PCollection<Struct> records = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withTable("Singers")
        .withColumns("singerId", "firstName", "lastName"));

records.apply(ToString.elements())
    .apply(TextIO.write().to(options.getOutput()).withoutSharding());
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize = records
      // Estimate the size of every row
      .apply(EstimateSize.create())
      // Sum all the row sizes to get the total estimated size of the table
      .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
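Both read pipelines above rely on an EstimateSize transform that is defined elsewhere in the sample. A rough sketch of the idea, assuming a very crude per-column byte estimate; the real transform handles the full range of Spanner types more carefully:

// Hedged sketch of the EstimateSize transform referenced above.
static class EstimateSize extends PTransform<PCollection<Struct>, PCollection<Long>> {
  public static EstimateSize create() {
    return new EstimateSize();
  }

  @Override
  public PCollection<Long> expand(PCollection<Struct> input) {
    return input.apply(ParDo.of(new EstimateStructSizeFn()));
  }

  static class EstimateStructSizeFn extends DoFn<Struct, Long> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Struct row = c.element();
      long size = 0;
      for (int i = 0; i < row.getColumnCount(); i++) {
        if (row.isNull(i)) {
          continue;
        }
        // Crude estimate: string length for strings, 8 bytes for other scalars.
        size += row.getColumnType(i).getCode() == Type.Code.STRING
            ? row.getString(i).length()
            : 8;
      }
      c.output(size);
    }
  }
}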
PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

// 'mutations' is a PCollection<MutationGroup> built from suspiciousUserIds;
// the transformation is elided in this snippet (see the sketch below).
mutations.apply(SpannerIO.write()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId)
    .grouped());
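A minimal sketch of how suspiciousUserIds might be turned into the mutations collection. The Users table and its id/state columns are placeholders, not taken from the original snippet; each MutationGroup is committed atomically:

PCollection<MutationGroup> mutations = suspiciousUserIds.apply(
    MapElements.via(new SimpleFunction<String, MutationGroup>() {
      @Override
      public MutationGroup apply(String userId) {
        // Block the user; table and column names here are assumptions.
        Mutation blockUser = Mutation.newUpdateBuilder("Users")
            .set("id").to(userId)
            .set("state").to("BLOCKED")
            .build();
        return MutationGroup.create(blockUser);
      }
    }));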
@Override
protected FileBasedSource<String> getSource() {
  return CompressedSource.from(
          new TextSource(
              getFilepattern(),
              getMatchConfiguration().getEmptyMatchTreatment(),
              getDelimiter()))
      .withCompression(getCompression());
}
/** Reads Avro file(s) containing records of the specified schema. */
public static Read<GenericRecord> readGenericRecords(Schema schema) {
  return new AutoValue_AvroIO_Read.Builder<GenericRecord>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW))
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setHintMatchesManyFiles(false)
      .build();
}
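For context, a minimal usage sketch of this factory method; the schema variable and filepattern are placeholders:

PCollection<GenericRecord> records = p.apply(
    AvroIO.readGenericRecords(schema).from("gs://my-bucket/records-*.avro"));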
/**
 * Like {@link #parseGenericRecords(SerializableFunction)}, but reads each filepattern in the
 * input {@link PCollection}.
 */
public static <T> ParseAll<T> parseAllGenericRecords(
    SerializableFunction<GenericRecord, T> parseFn) {
  return new AutoValue_AvroIO_ParseAll.Builder<T>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD))
      .setParseFn(parseFn)
      .setDesiredBundleSizeBytes(64 * 1024 * 1024L)
      .build();
}
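A minimal usage sketch, assuming the records carry a string field named "name" (a placeholder); a coder must be supplied because the output type cannot be inferred from the parse function:

PCollection<String> filepatterns = p.apply(Create.of("gs://my-bucket/part-*.avro"));
PCollection<String> names = filepatterns.apply(
    AvroIO.parseAllGenericRecords(
            (GenericRecord record) -> record.get("name").toString())
        .withCoder(StringUtf8Coder.of()));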
@Override
public long getSplitPointsRemaining() {
  if (isStarted() && startOfNextRecord >= getCurrentSource().getEndOffset()) {
    return isDone() ? 0 : 1;
  }
  return super.getSplitPointsRemaining();
}
/**
 * Reads Avro file(s) containing records of an unspecified schema, converting each record to a
 * custom type.
 */
public static <T> Parse<T> parseGenericRecords(SerializableFunction<GenericRecord, T> parseFn) {
  return new AutoValue_AvroIO_Parse.Builder<T>()
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW))
      .setParseFn(parseFn)
      .setHintMatchesManyFiles(false)
      .build();
}
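A minimal usage sketch of this single-filepattern variant; the field name and path are placeholders:

PCollection<String> names = p.apply(
    AvroIO.parseGenericRecords((GenericRecord record) -> record.get("name").toString())
        .from("gs://my-bucket/records-*.avro")
        .withCoder(StringUtf8Coder.of()));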
/** Writes Avro records of the specified schema. */
public static Write<GenericRecord> writeGenericRecords(Schema schema) {
  return new Write<>(
      AvroIO.<GenericRecord, GenericRecord>defaultWriteBuilder()
          .setGenericRecords(true)
          .setSchema(schema)
          .build());
}
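A minimal usage sketch; the output prefix is a placeholder:

records.apply(
    AvroIO.writeGenericRecords(schema)
        .to("gs://my-bucket/out/records")
        .withSuffix(".avro"));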
/** Writes elements to files using a {@link Sink}. See class-level documentation. */
public static <InputT> Write<Void, InputT> write() {
  return new AutoValue_FileIO_Write.Builder<Void, InputT>()
      .setDynamic(false)
      .setCompression(Compression.UNCOMPRESSED)
      .setIgnoreWindowing(false)
      .build();
}
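A minimal usage sketch pairing this entry point with TextIO.sink(); the destination is a placeholder:

lines.apply(
    FileIO.<String>write()
        .via(TextIO.sink())
        .to("gs://my-bucket/out")
        .withSuffix(".txt"));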
/** Creates a {@link MatchConfiguration} with the given {@link EmptyMatchTreatment}. */
public static MatchConfiguration create(EmptyMatchTreatment emptyMatchTreatment) {
  return new AutoValue_FileIO_MatchConfiguration.Builder()
      .setEmptyMatchTreatment(emptyMatchTreatment)
      .build();
}
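In user code this configuration is usually reached through the public match API rather than constructed directly; a sketch with a placeholder filepattern:

PCollection<MatchResult.Metadata> matches = p.apply(
    FileIO.match()
        .filepattern("gs://my-bucket/*.csv")
        .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW));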
/**
 * Preserves windowing of input elements and writes them to files based on the element's window.
 *
 * <p>If using {@link #to(FileBasedSink.FilenamePolicy)}, filenames will be generated using
 * {@link FilenamePolicy#windowedFilename}. See also {@link WriteFiles#withWindowedWrites()}.
 */
public TypedWrite<UserT, DestinationT, OutputT> withWindowedWrites() {
  return toBuilder().setWindowedWrites(true).build();
}
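A minimal sketch of a windowed write, assuming five-minute fixed windows and an explicit shard count (which windowed/unbounded writes typically require); names and the destination are placeholders:

events
    .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(5))))
    .apply(TextIO.write()
        .to("gs://my-bucket/out/events")
        .withWindowedWrites()
        .withNumShards(1));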
/**
 * Uses the given shard name template.
 *
 * @see ShardNameTemplate
 */
public Write withShardNameTemplate(String shardTemplate) {
  return toBuilder().setShardTemplate(shardTemplate).build();
}
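A minimal usage sketch; "-SS-of-NN" is the standard index/max template from ShardNameTemplate, and the output prefix is a placeholder:

lines.apply(TextIO.write()
    .to("gs://my-bucket/out/part")
    .withShardNameTemplate("-SS-of-NN")
    .withSuffix(".txt"));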
p.apply("ReadSingers", TextIO.read().from(options.getSingersFilename())) .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("WriteAlbums", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId));