public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline
      .apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
/**
 * Returns a UnionTable for the given input PCollection, using the given union index and the
 * given unionTableEncoder.
 */
private <V> PCollection<KV<K, RawUnionValue>> makeUnionTable(
    final int index,
    PCollection<KV<K, V>> pCollection,
    KvCoder<K, RawUnionValue> unionTableEncoder) {
  return pCollection
      .apply("MakeUnionTable" + index, ParDo.of(new ConstructUnionTableFn<>(index)))
      .setCoder(unionTableEncoder);
}
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
  // Split each file into byte ranges, redistribute the ranges across workers, then read each range.
  return input
      .apply("Split into ranges", ParDo.of(new SplitIntoRangesFn(desiredBundleSizeBytes)))
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      .apply("Read ranges", ParDo.of(new ReadFileRangesFn<>(createSource)))
      .setCoder(coder);
}
@Override
public PDone expand(PCollection<V> input) {
  // Pair every value with a null key so the KV-based Kafka write transform can be reused.
  return input
      .apply(
          "Kafka values with default key",
          MapElements.via(
              new SimpleFunction<V, KV<K, V>>() {
                @Override
                public KV<K, V> apply(V element) {
                  return KV.of(null, element);
                }
              }))
      .setCoder(KvCoder.of(new NullOnlyCoder<>(), input.getCoder()))
      .apply(kvWriteTransform);
}
private PCollection<Result<DestinationT>> writeShardedRecords(
    PCollection<KV<ShardedKey<DestinationT>, TableRow>> shardedRecords,
    PCollectionView<String> tempFilePrefix) {
  // Group rows by destination shard, then write each group to files under the temp file prefix.
  return shardedRecords
      .apply("GroupByDestination", GroupByKey.create())
      .apply(
          "WriteGroupedRecords",
          ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
              .withSideInputs(tempFilePrefix))
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
@Override
public PCollection<T> expand(PBegin input) {
  // Split the bounded source into bundles, reshuffle to rebalance them, then read each bundle.
  return input
      .apply(Impulse.create())
      .apply(ParDo.of(new SplitBoundedSourceFn<>(source, DEFAULT_BUNDLE_SIZE)))
      .setCoder(new BoundedSourceCoder<>())
      .apply(Reshuffle.viaRandomKey())
      .apply(ParDo.of(new ReadFromBoundedSourceFn<>()))
      .setCoder(source.getOutputCoder());
}
/** Extract timestamps from the windowFieldIndex, then window into windowFns. */
private PCollection<Row> assignTimestampsAndWindow(PCollection<Row> upstream) {
  PCollection<Row> windowedStream =
      upstream
          .apply(
              "assignEventTimestamp",
              WithTimestamps.<Row>of(row -> row.getDateTime(windowFieldIndex).toInstant())
                  .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)))
          .setCoder(upstream.getCoder())
          .apply(Window.into(windowFn));
  return windowedStream;
}
@Override
public PCollection<KV<K, V>> expand(PCollection<K> input) {
  // Pair each input key with the value of the side-input view.
  return input
      .apply(
          ParDo.of(
                  new DoFn<K, KV<K, V>>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      c.output(KV.of(c.element(), c.sideInput(view)));
                    }
                  })
              .withSideInputs(view))
      .setCoder(KvCoder.of(input.getCoder(), coder));
}
@Override
public PDone expand(PCollection<Iterable<T>> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new SingletonCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());

  return PDone.in(input.getPipeline());
}
@Override
public PDone expand(PCollection<T> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new GroupedValuesCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());

  return PDone.in(input.getPipeline());
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  // Reusable coder.
  LazyAvroCoder<Object> lac = LazyAvroCoder.of();
  AvroHdfsFileSource source = AvroHdfsFileSource.of(doAs, path, lac);
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  source.setLimit(limit);
  PCollection<KV<AvroKey, NullWritable>> read =
      in.apply(Read.from(source)).setCoder(source.getDefaultOutputCoder());
  PCollection<AvroKey> pc1 = read.apply(Keys.<AvroKey> create());
  PCollection<Object> pc2 = pc1.apply(ParDo.of(new ExtractRecordFromAvroKey()));
  pc2 = pc2.setCoder(lac);
  PCollection<IndexedRecord> pc3 = pc2.apply(ConvertToIndexedRecord.<Object> of());
  return pc3;
}
@Override
public PCollection<KV<PrimaryKeyT, Iterable<KV<SecondaryKeyT, ValueT>>>> expand(
    PCollection<KV<PrimaryKeyT, Iterable<KV<SecondaryKeyT, ValueT>>>> input) {
  // Sort the values of each primary key by secondary key, preserving the input coder.
  return input
      .apply(
          ParDo.of(
              new SortValuesDoFn<>(
                  sorterOptions,
                  getSecondaryKeyCoder(input.getCoder()),
                  getValueCoder(input.getCoder()))))
      .setCoder(input.getCoder());
}
@Override
public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
  // Group the keyed element/restriction pairs into keyed work items, then run ProcessElements.
  return input
      .apply(new GBKIntoKeyedWorkItems<>())
      .setCoder(
          KeyedWorkItemCoder.of(
              ByteArrayCoder.of(),
              ((KvCoder<byte[], KV<InputT, RestrictionT>>) input.getCoder()).getValueCoder(),
              input.getWindowingStrategy().getWindowFn().windowCoder()))
      .apply(new ProcessElements<>(original));
}
@Override
public PDone expand(PCollection<KV<K, V>> in) {
  // Make sure that a window has been applied.
  in = ofDefaultWindow(in);

  // Add an artificial GroupByKey to collect the window results together.
  PCollection<KV<Instant, KV<K, V>>> pc2 =
      in.apply("GroupToOneShard", ParDo.of(new GroupToOneShard<KV<K, V>>()))
          .setCoder(KvCoder.of(InstantCoder.of(), in.getCoder()));
  PCollection<KV<Instant, Iterable<KV<K, V>>>> pc3 =
      pc2.apply(GroupByKey.<Instant, KV<K, V>> create());
  pc3.apply("UnboundedWrite", ParDo.of(new UnboundedWriteToFile<K, V>(sink)));
  return PDone.in(in.getPipeline());
}
@Override
public PDone write(PCollection<IndexedRecord> in) {
  ParquetHdfsFileSink sink = new ParquetHdfsFileSink(doAs, path, overwrite, mergeOutput);
  sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

  PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(ParDo.of(new FormatParquet()));
  pc1 = pc1.setCoder(KvCoder.of(VoidCoder.of(), LazyAvroCoder.of()));
  if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
    return pc1.apply(Write.to(sink));
  } else {
    return pc1.apply(UnboundedWrite.of(sink));
  }
}