org.apache.beam.sdk.transforms.ParDo java code examples

/**
 * Runs {@code EstimateStructSizeFn} over every {@code Struct} of the input in a
 * single {@code ParDo}.
 *
 * @param input the structs to process
 * @return the {@code Long} values emitted by {@code EstimateStructSizeFn}
 */
@Override
public PCollection<Long> expand(PCollection<Struct> input) {
 PCollection<Long> emittedValues = input.apply(ParDo.of(new EstimateStructSizeFn()));
 return emittedValues;
}

/**
 * Builds and runs a pipeline that reads Avro generic records, passes them through
 * {@code ConvertAvroToCsv}, and writes the resulting text with a {@code .csv} suffix.
 * The call blocks until the pipeline has finished.
 *
 * @param options supplies the Avro schema location, input file, CSV delimiter and
 *     output location
 * @throws IOException declared by this method; presumably raised while loading the
 *     schema via {@code getSchema} (confirm against that helper)
 * @throws IllegalArgumentException declared by this method; presumably raised by
 *     {@code checkFieldTypes} for unsupported field types (confirm against that helper)
 */
public static void runAvroToCsv(SampleOptions options)
  throws IOException, IllegalArgumentException {
 FileSystems.setDefaultPipelineOptions(options);

 // Load the schema text, then validate its field types up front so that a bad
 // schema is rejected before any pipeline is created.
 String schemaJson = getSchema(options.getAvroSchema());
 checkFieldTypes(new Schema.Parser().parse(schemaJson));

 Pipeline pipeline = Pipeline.create(options);

 // Avro records in -> CSV text out.
 pipeline
   .apply(
     "Read Avro files",
     AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
   .apply(
     "Convert Avro to CSV formatted data",
     ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
   .apply(
     "Write CSV formatted data",
     TextIO.write().to(options.getOutput()).withSuffix(".csv"));

 // Execute and wait for completion.
 pipeline.run().waitUntilFinish();
}

.apply("ParseSingers", ParDo.of(new ParseSinger()))
.apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() {
 @ProcessElement
 public void processElement(ProcessContext c) {
.apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename()))
.apply("ParseAlbums", ParDo.of(new ParseAlbum()));
.apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() {
 @ProcessElement
 public void processElement(ProcessContext c) {

/**
 * Builds and runs a pipeline that reads text lines, passes them through
 * {@code ConvertCsvToAvro}, and writes the resulting generic records as
 * Snappy-compressed Avro files with an {@code .avro} suffix. The call blocks until
 * the pipeline has finished.
 *
 * @param options supplies the Avro schema location, input file, CSV delimiter and
 *     output location
 * @throws IOException declared by this method; presumably raised while loading the
 *     schema via {@code getSchema} (confirm against that helper)
 * @throws IllegalArgumentException declared by this method; presumably raised by
 *     {@code checkFieldTypes} for unsupported field types (confirm against that helper)
 */
public static void runCsvToAvro(SampleOptions options)
  throws IOException, IllegalArgumentException {
 FileSystems.setDefaultPipelineOptions(options);

 // Load and parse the schema; field types are validated before any pipeline exists.
 String schemaJson = getSchema(options.getAvroSchema());
 Schema schema = new Schema.Parser().parse(schemaJson);
 checkFieldTypes(schema);

 Pipeline pipeline = Pipeline.create(options);

 // The converter's output coder is set explicitly from the parsed schema.
 AvroCoder<GenericRecord> recordCoder = AvroCoder.of(GenericRecord.class, schema);

 // CSV text in -> Avro records out.
 pipeline
   .apply("Read CSV files", TextIO.read().from(options.getInputFile()))
   .apply(
     "Convert CSV to Avro formatted data",
     ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
   .setCoder(recordCoder)
   .apply(
     "Write Avro formatted data",
     AvroIO.writeGenericRecords(schemaJson)
       .to(options.getOutput())
       .withCodec(CodecFactory.snappyCodec())
       .withSuffix(".avro"));

 // Execute and wait for completion.
 pipeline.run().waitUntilFinish();
}

 /**
  * Re-emits every element of {@code input} unchanged, so that a
  * {@code PCollection<? extends T>} is returned as a {@code PCollection<T>}.
  *
  * @param input the collection whose elements are forwarded
  * @return a collection containing exactly the elements that were output
  */
 @Override
 public PCollection<T> expand(PCollection<? extends T> input) {
  // Pass-through function: each incoming element is output exactly once.
  DoFn<T, T> passThrough =
    new DoFn<T, T>() {
     @ProcessElement
     public void process(@Element T element, OutputReceiver<T> receiver) {
      receiver.output(element);
     }
    };
  return input.apply(ParDo.of(passThrough));
 }
}

/**
 * Applies an {@code AddTimestampsDoFn}, built from this transform's {@code fn} and
 * {@code allowedTimestampSkew}, to the input under the step name
 * {@code "AddTimestamps"}.
 *
 * @param input the elements handed to the {@code AddTimestampsDoFn}
 * @return the collection produced by that step
 */
@Override
public PCollection<T> expand(PCollection<T> input) {
 PCollection<T> stamped =
   input.apply(
     "AddTimestamps", ParDo.of(new AddTimestampsDoFn<>(fn, allowedTimestampSkew)));
 return stamped;
}

 /**
  * Runs {@code DefaultConcludeFn} over every {@code SuccessOrFailure} of the input.
  *
  * @param input the results to hand to {@code DefaultConcludeFn}
  * @return the {@code Void}-typed collection produced by that function
  */
 @Override
 public PCollection<Void> expand(PCollection<SuccessOrFailure> input) {
  PCollection<Void> concluded = input.apply(ParDo.of(new DefaultConcludeFn()));
  return concluded;
 }
}

/**
 * Chains two {@code ParDo} steps over the incoming requests: first
 * {@code RetrieveReads}, then {@code ConvergeReadsList} on its output.
 *
 * @param input the stream-reads requests to process
 * @return the {@code Read} elements emitted by the second step
 */
@Override
public PCollection<Read> expand(PCollection<StreamReadsRequest> input) {
 RetrieveReads retrieveFn = new RetrieveReads();
 return input
   .apply(ParDo.of(retrieveFn))
   .apply(ParDo.of(new ConvergeReadsList()));
}

/**
 * Chains two {@code ParDo} steps over the incoming requests: first a
 * {@code RetrieveFn} built from {@code auth} and {@code fields}, then
 * {@code CombineVariantsFn} on its output.
 *
 * @param input the stream-variants requests to process
 * @return the {@code Variant} elements emitted by the second step
 */
@Override
public PCollection<Variant> expand(PCollection<StreamVariantsRequest> input) {
 RetrieveFn retrieveFn = new RetrieveFn(auth, fields);
 return input
   .apply(ParDo.of(retrieveFn))
   .apply(ParDo.of(new CombineVariantsFn()));
}

  /**
   * Feeds a single element through {@code LongTearDownFn}, runs the pipeline to
   * completion, and prints how long the calling thread spent doing so.
   *
   * <p>Elapsed time is measured with {@link System#nanoTime()}, which is monotonic.
   * {@link System#currentTimeMillis()} follows the wall clock and can jump (NTP
   * correction, manual clock change), which would yield a wrong or even negative
   * duration.
   *
   * @param p the pipeline to extend with the test steps and then run
   */
  public static void runPipeline(Pipeline p) {
    System.out.println("Sleep time: " + TearDown.SLEEP_TIME + " ms");

    long tId = Thread.currentThread().getId();
    long beginNs = System.nanoTime();

    p.apply(Create.of("value"))
      .apply(ParDo.of(new LongTearDownFn()));
    p.run().waitUntilFinish();

    // Convert to whole milliseconds so the printed unit stays the same as before.
    long elapsedMs = (System.nanoTime() - beginNs) / 1_000_000L;

    System.out.println("Thread #" + tId +  ", run for " + elapsedMs + " ms");
  }
}

/**
 * Produces a collection in which every element of {@code pc} is output {@code n}
 * times. Nothing is output for an element when {@code n} is zero or negative.
 *
 * @param pc the collection whose elements are repeated
 * @param n how many times each element is output
 * @return the collection of repeated elements
 */
protected static <T> PCollection<T> copy(PCollection<T> pc, final int n) {
 DoFn<T, T> repeatFn =
   new DoFn<T, T>() {
    @ProcessElement
    public void processElement(ProcessContext c) throws Exception {
     T current = c.element();
     for (int remaining = n; remaining > 0; remaining--) {
      c.output(current);
     }
    }
   };
 return pc.apply(ParDo.of(repeatFn));
}

  /**
   * Applies a {@code LimitDoFn}, configured with this component's
   * {@code properties}, to the incoming records.
   *
   * @param inputPCollection the records handed to the {@code LimitDoFn}
   * @return the (raw-typed) collection produced by that function
   */
  @Override
  public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
    return inputPCollection.apply(
        ParDo.of(new LimitDoFn().withProperties(properties)));
  }
}

  /**
   * Applies a {@code NormalizeDoFn}, configured with this component's
   * {@code properties}, to the incoming records.
   *
   * @param inputPCollection the records handed to the {@code NormalizeDoFn}
   * @return the (raw-typed) collection produced by that function
   */
  @Override
  public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
    NormalizeDoFn normalizer = new NormalizeDoFn().withProperties(properties);
    return inputPCollection.apply(ParDo.of(normalizer));
  }
}

/**
 * Applies a {@code FieldSelectorDoFn}, configured with this component's
 * {@code properties}, to the incoming records.
 *
 * @param inputPCollection the records handed to the {@code FieldSelectorDoFn}
 * @return the (raw-typed) collection produced by that function
 */
@Override
public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
  return inputPCollection.apply(
      ParDo.of(new FieldSelectorDoFn().withProperties(properties)));
}

 /**
  * Three-stage transform: {@code ExtractTornadoesFn} turns rows into integers
  * (months), {@code Count.perElement()} counts each distinct value, and
  * {@code FormatCountsFn} turns every (month, count) pair back into a row.
  *
  * @param rows the input table rows
  * @return one row per (month, count) pair, as emitted by {@code FormatCountsFn}
  */
 @Override
 public PCollection<TableRow> expand(PCollection<TableRow> rows) {
  // Stage 1: rows -> months.
  PCollection<Integer> months = rows.apply(ParDo.of(new ExtractTornadoesFn()));
  // Stage 2: months -> (month, occurrences).
  PCollection<KV<Integer, Long>> perMonthCounts = months.apply(Count.perElement());
  // Stage 3: (month, occurrences) -> rows.
  return perMonthCounts.apply(ParDo.of(new FormatCountsFn()));
 }
}

  /**
   * Applies a {@code LimitDoFn}, configured with this component's
   * {@code properties}, to the incoming records.
   *
   * @param inputPCollection the records handed to the {@code LimitDoFn}
   * @return the (raw-typed) collection produced by that function
   */
  @Override
  public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
    return inputPCollection.apply(
        ParDo.of(new LimitDoFn().withProperties(properties)));
  }
}

/**
 * Applies a {@code FieldSelectorDoFn}, configured with this component's
 * {@code properties}, to the incoming records.
 *
 * @param inputPCollection the records handed to the {@code FieldSelectorDoFn}
 * @return the (raw-typed) collection produced by that function
 */
@Override
public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
  return inputPCollection.apply(
      ParDo.of(new FieldSelectorDoFn().withProperties(properties)));
}

/**
 * Applies a {@code ParDo} built from the {@code dofn} reference (defined outside
 * this method) to {@code input} and signals completion with a {@code PDone}.
 *
 * @param input the mutations handed to {@code dofn}
 * @return a {@code PDone} created in the pipeline that owns {@code input}
 */
@Override
public PDone expand(PCollection<RedisMutation> input) {
 // The ParDo's own result is not used; only the terminal PDone is returned.
 input.apply(ParDo.of(dofn));
 return PDone.in(input.getPipeline());
}

/**
 * Processes the input files in three named steps: "Split into ranges"
 * ({@code SplitIntoRangesFn} built from {@code desiredBundleSizeBytes}),
 * "Reshuffle" ({@code Reshuffle.viaRandomKey()}), and "Read ranges"
 * ({@code ReadFileRangesFn} built from {@code createSource}). The result's coder
 * is set explicitly to {@code coder}.
 *
 * @param input the files to read
 * @return the elements emitted by the "Read ranges" step
 */
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
 PCollection<T> records =
   input
     .apply("Split into ranges", ParDo.of(new SplitIntoRangesFn(desiredBundleSizeBytes)))
     .apply("Reshuffle", Reshuffle.viaRandomKey())
     .apply("Read ranges", ParDo.of(new ReadFileRangesFn<>(createSource)));
 return records.setCoder(coder);
}

 /**
  * Chains three steps over the incoming reads: a {@code CoverageCounts} function
  * built from {@code bucketWidth}, a per-key combine using {@code SumCounts}, and a
  * final {@code CoverageMeans} function.
  *
  * @param input the reads to process
  * @return the key/value pairs emitted by {@code CoverageMeans}
  */
 @Override
 public PCollection<KV<PosRgsMq, Double>> expand(PCollection<Read> input) {
  CoverageCounts countsFn = new CoverageCounts(bucketWidth);
  return input
    .apply(ParDo.of(countsFn))
    .apply(Combine.<PosRgsMq, Long>perKey(new SumCounts()))
    .apply(ParDo.of(new CoverageMeans()));
 }
}

Javadoc

ParDo is the core element-wise transform in Apache Beam, invoking a user-specified function on each of the elements of the input PCollection to produce zero or more output elements, all of which are collected into the output PCollection.

Elements are processed independently, and possibly in parallel across distributed cloud resources.

The ParDo processing style is similar to what happens inside the "Mapper" or "Reducer" class of a MapReduce-style algorithm.

DoFn

The function to use to process each element is specified by a DoFn, primarily via its DoFn.ProcessElement method. The DoFn may also provide a DoFn.StartBundle and DoFn.FinishBundle method.

Conceptually, when a ParDo transform is executed, the elements of the input PCollection are first divided up into some number of "bundles". These are farmed off to distributed worker machines (or run locally, if using the DirectRunner). For each bundle of input elements processing proceeds as follows:

If required, a fresh instance of the argument DoFn is created on a worker, and the DoFn.Setup method is called on this instance. This may be through deserialization or other means. A PipelineRunner may reuse DoFn instances for multiple bundles. A DoFn that has terminated abnormally (by throwing an Exception) will never be reused.
The DoFn DoFn.StartBundle method, if provided, is called to initialize it.
The DoFn DoFn.ProcessElement method is called on each of the input elements in the bundle.
The DoFn DoFn.FinishBundle method, if provided, is called to complete its work. After DoFn.FinishBundle is called, the framework will not again invoke DoFn.ProcessElement or DoFn.FinishBundle until a new call to DoFn.StartBundle has occurred.
If any of DoFn.Setup, DoFn.StartBundle, DoFn.ProcessElement or DoFn.FinishBundle methods throw an exception, the DoFn.Teardown method, if provided, will be called on the DoFn instance.
If a runner will no longer use a DoFn, the DoFn.Teardown method, if provided, will be called on the discarded instance.

Note also that calls to DoFn.Teardown are best effort, and may not be called before a DoFn is discarded in the general case. As a result, use of the DoFn.Teardown method to perform side effects is not appropriate, because the elements that produced the side effect will not be replayed in case of failure, and those side effects are permanently lost.

Each of the calls to any of the DoFn processing methods can produce zero or more output elements. All of the output elements from all of the DoFn instances are included in an output PCollection.

For example:

 
PCollection<String> lines = ...;
PCollection<String> words =
    lines.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(@Element String line,
            OutputReceiver<String> r) {
          for (String word : line.split("[^a-zA-Z']+")) {
            r.output(word);
          }
        }}));
PCollection<Integer> wordLengths =
    words.apply(ParDo.of(new DoFn<String, Integer>() {
        @ProcessElement
        public void processElement(@Element String word,
            OutputReceiver<Integer> r) {
          Integer length = word.length();
          r.output(length);
        }}));

Each output element has the same timestamp and is in the same windows as its corresponding input element, and the output PCollection has the same WindowFn associated with it as the input.

Naming ParDo transforms

The name of a transform is used to provide a name for any node in the Pipeline graph resulting from application of the transform. It is best practice to provide a name at the time of application, via PCollection#apply(String,PTransform). Otherwise, a unique name - which may not be stable across pipeline revision - will be generated, based on the transform name.

For example:

 
PCollection<String> words =
    lines.apply("ExtractWords", ParDo.of(new DoFn<String, String>() { ... }));
PCollection<Integer> wordLengths =
    words.apply("ComputeWordLengths", ParDo.of(new DoFn<String, Integer>() { ... }));

Side Inputs

While a ParDo processes elements from a single "main input" PCollection, it can take additional "side input" PCollectionView. These side input PCollectionView express styles of accessing PCollection computed by earlier pipeline operations, passed in to the ParDo transform using SingleOutput#withSideInputs, and their contents accessible to each of the DoFn operations via DoFn.ProcessContext#sideInput. For example:

 
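PCollection<String> words = ...;
PCollection<Integer> maxWordLengthCutOff = ...; // Singleton PCollection
final PCollectionView<Integer> maxWordLengthCutOffView =
    maxWordLengthCutOff.apply(View.<Integer>asSingleton());
PCollection<String> wordsBelowCutOff =
    words.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          String word = c.element();
          int lengthCutOff = c.sideInput(maxWordLengthCutOffView);
          if (word.length() <= lengthCutOff) {
            c.output(word);
          }
        }}).withSideInputs(maxWordLengthCutOffView));
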
Additional Outputs

Optionally, a ParDo transform can produce multiple output PCollections, both a "main output" PCollection plus any number of additional output PCollections, each keyed by a distinct TupleTag, and bundled in a PCollectionTuple. The TupleTags to be used for the output PCollectionTuple are specified by invoking SingleOutput#withOutputTags. Unconsumed outputs do not necessarily need to be explicitly specified, even if the DoFn generates them. Within the DoFn, an element is added to the main output PCollection as normal, using WindowedContext#output(Object), while an element is added to any additional output PCollection using WindowedContext#output(TupleTag,Object). For example:
 
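PCollection<String> words = ...;
// Select words whose length is below a cut off,
// plus the lengths of words that are above the cut off.
// Also select words starting with "MARKER".
final int wordLengthCutOff = 10;
// Create tags to use for the main and additional outputs.
final TupleTag<String> wordsBelowCutOffTag =
    new TupleTag<String>(){};
final TupleTag<Integer> wordLengthsAboveCutOffTag =
    new TupleTag<Integer>(){};
final TupleTag<String> markedWordsTag =
    new TupleTag<String>(){};
PCollectionTuple results =
    words.apply(
        ParDo
        .of(new DoFn<String, String>() {
            // Create a tag for the unconsumed output.
            final TupleTag<String> specialWordsTag =
                new TupleTag<String>(){};
            @ProcessElement
            public void processElement(@Element String word, MultiOutputReceiver r) {
              if (word.length() <= wordLengthCutOff) {
                // Emit this short word to the main output.
                r.get(wordsBelowCutOffTag).output(word);
              } else {
                // Emit this long word's length to a specified output.
                r.get(wordLengthsAboveCutOffTag).output(word.length());
              }
              if (word.startsWith("MARKER")) {
                // Emit this word to a different specified output.
                r.get(markedWordsTag).output(word);
              }
              if (word.startsWith("SPECIAL")) {
                // Emit this word to the unconsumed output.
                r.get(specialWordsTag).output(word);
              }
            }})
            // Specify the main and consumed output tags of the
            // PCollectionTuple result:
        .withOutputTags(wordsBelowCutOffTag,
            TupleTagList.of(wordLengthsAboveCutOffTag)
                        .and(markedWordsTag)));
// Extract the PCollection results, by tag.
PCollection<String> wordsBelowCutOff =
    results.get(wordsBelowCutOffTag);
PCollection<Integer> wordLengthsAboveCutOff =
    results.get(wordLengthsAboveCutOffTag);
PCollection<String> markedWords =
    results.get(markedWordsTag);
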
Output Coders

By default, the Coder<OutputT> for the elements of the main output PCollection<OutputT> is inferred from the concrete type of the DoFn<InputT, OutputT>.

By default, the Coder<AdditionalOutputT> for the elements of an output PCollection<AdditionalOutputT> is inferred from the concrete type of the corresponding TupleTag<AdditionalOutputT>. To be successful, the TupleTag should be created as an instance of a trivial anonymous subclass, with {} suffixed to the constructor call. Such uses block Java's generic type parameter inference, so the <V> argument must be provided explicitly. For example:
 
// A TupleTag to use for a side input can be written concisely:
final TupleTag<Integer> sideInputTag = new TupleTag<>();
// A TupleTag to use for an output should be written with "{}",
// and explicit generic parameter type:
final TupleTag<String> additionalOutputTag = new TupleTag<String>(){};

This style of  
 TupleTag instantiation is used in the example of  
ParDo that 
produce multiple outputs, above. 
Serializability of  
DoFn 
A  
DoFn passed to a  
ParDo transform must be  
Serializable. This allows 
the  
DoFn instance created in this "main program" to be sent (in serialized form) to 
remote worker machines and reconstituted for bundles of elements of the input  
PCollection being processed. A  
DoFn can have instance variable state, and non-transient instance 
variable state will be serialized in the main program and then deserialized on remote worker 
machines for some number of bundles of elements to process. 
 
DoFn expressed as anonymous inner classes can be convenient, but due to a quirk 
in Java's rules for serializability, non-static inner or nested classes (including anonymous 
inner classes) automatically capture their enclosing class's instance in their serialized state. 
This can lead to including much more than intended in the serialized state of a  
DoFn, or 
even things that aren't  
Serializable. 
There are two ways to avoid unintended serialized state in a  
DoFn: 
 
Define the  
DoFn as a named, static class. 
Define the  
DoFn as an anonymous inner class inside of a static method. 
 
Both of these approaches ensure that there is no implicit enclosing instance serialized along 
with the  
DoFn instance. 
Prior to Java 8, any local variables of the enclosing method referenced from within an 
anonymous inner class need to be marked as  
 final. If defining the  
DoFn as a named 
static class, such variables would be passed as explicit constructor arguments and stored in 
explicit instance variables. 
There are three main ways to initialize the state of a  
DoFn instance processing a 
bundle: 
 
Define instance variable state (including implicit instance variables holding final 
variables captured by an anonymous inner class), initialized by the  
DoFn's 
constructor (which is implicit for an anonymous inner class). This state will be 
automatically serialized and then deserialized in the  
DoFn instances created for 
bundles. This method is good for state known when the original  
DoFn is created in 
the main program, if it's not overly large. This is not suitable for any state which must 
only be used for a single bundle, as  
DoFn may be used to process multiple 
bundles. 
Compute the state as a singleton  
PCollection and pass it in as a side input to the 
DoFn. This is good if the state needs to be computed by the pipeline, or if the 
state is very large and so is best read from file(s) rather than sent as part of the  
DoFn serialized state. 
Initialize the state in each  
DoFn instance, in a  
DoFn.StartBundle method. 
This is good if the initialization doesn't depend on any information known only by the main 
program or computed by earlier pipeline operations, but is the same for all instances of 
this  
DoFn for all program executions, say setting up empty caches or initializing 
constant data. 
 
No Global Shared State 
 
ParDo operations are intended to be able to run in parallel across multiple worker 
machines. This precludes easy sharing and updating mutable state across those machines. There is 
no support in the Beam model for communicating and synchronizing updates to shared state across 
worker machines, so programs should not access any mutable static variable state in their  
DoFn, without understanding that the Java processes for the main program and workers will each 
have its own independent copy of such state, and there won't be any automatic copying of that 
state across Java processes. All information should be communicated to  
DoFn instances via 
main and side inputs and serialized state, and all output should be communicated from a  
DoFn instance via output  
PCollection, in the absence of external 
communication mechanisms written by user code. 
Fault Tolerance 
In a distributed system, things can fail: machines can crash, machines can be unable to 
communicate across the network, etc. While individual failures are rare, the larger the job, the 
greater the chance that something, somewhere, will fail. Beam runners may strive to mask such 
failures by retrying failed  
DoFn bundle. This means that a  
DoFn instance might 
process a bundle partially, then crash for some reason, then be rerun (often in a new JVM) on 
that same bundle and on the same elements as before. Sometimes two or more  
DoFn instances 
will be running on the same bundle simultaneously, with the system taking the results of the 
first instance to complete successfully. Consequently, the code in a  
DoFn needs to be 
written such that these duplicate (sequential or concurrent) executions do not cause problems. If 
the outputs of a  
DoFn are a pure function of its inputs, then this requirement is 
satisfied. However, if a  
DoFn execution has external side-effects, such as 
performing updates to external HTTP services, then the  
DoFn code needs to take 
care to ensure that those updates are idempotent and that concurrent updates are acceptable. This 
property can be difficult to achieve, so it is advisable to strive to keep  
DoFn as 
pure functions as much as possible. 
Optimization 
Beam runners may choose to apply optimizations to a pipeline before it is executed. A key 
optimization, fusion, relates to  
ParDo operations. If one  
ParDo operation 
produces a  
PCollection that is then consumed as the main input of another  
ParDo operation, the two  
ParDo operations will be fused together into a single ParDo 
operation and run in a single pass; this is "producer-consumer fusion". Similarly, if two or more 
ParDo operations have the same  
PCollection main input, they will be fused into a single 
ParDo that makes just one pass over the input  
PCollection; this is "sibling 
fusion". 
If after fusion there are no more unfused references to a  
PCollection (e.g., one 
between a producer ParDo and a consumer  
ParDo), the  
PCollection itself is "fused 
away" and won't ever be written to disk, saving all the I/O and space expense of constructing it. 
When Beam runners apply fusion optimization, it is essentially "free" to write  
ParDo operations in a very modular, composable style, each  
ParDo operation doing one clear 
task, and stringing together sequences of  
ParDo operations to get the desired overall 
effect. Such programs can be easier to understand, easier to unit-test, easier to extend and 
evolve, and easier to reuse in new programs. The predefined library of PTransforms that come with 
Beam makes heavy use of this modular, composable style, trusting to the runner to "flatten out" 
all the compositions into highly optimized stages.

Most used methods

of
Creates a ParDo PTransform that will invoke the given DoFn function. The resulting PTransform is read
codersForStateSpecTypes
Try to provide coders for as many of the type arguments of given DoFnSignature.StateDeclaration as p
displayDataForFn
validate
Perform common validations of the DoFn, for example ensuring that state is used correctly and that i
validateWindowTypeForMethod
withSideInputs

Popular in Java

Creating JSON documents from java classes using gson
scheduleAtFixedRate (ScheduledExecutorService)
setRequestProperty (URLConnection)
setScale (BigDecimal)
FileNotFoundException (java.io)
Thrown when a file specified by a program cannot be found.
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
Dictionary (java.util)
Note: Do not use this class since it is obsolete. Please use the Map interface for new implementatio
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
CountDownLatch (java.util.concurrent)
A synchronization aid that allows one or more threads to wait until a set of operations being perfor
JarFile (java.util.jar)
JarFile is used to read jar entries and their associated data from jar files.
Github Copilot alternatives

How to use ParDo in org.apache.beam.sdk.transforms

Best Java code snippets using org.apache.beam.sdk.transforms.ParDo (Showing top 20 results out of 783)

How to use
ParDo
in
org.apache.beam.sdk.transforms