@Override
public PDone expand(PCollection<RedisMutation> input) {
  input.apply(ParDo.of(dofn));
  return PDone.in(input.getPipeline());
}
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline
      .apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply(
          "Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply(
          "Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
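// A minimal invocation sketch (not part of the original sample), assuming SampleOptions is a
// PipelineOptions subinterface declaring getAvroSchema(), getInputFile(), getCsvDelimiter(),
// and getOutput(). PipelineOptionsFactory is the standard Beam entry point for parsing args.
public static void main(String[] args) throws IOException {
  SampleOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SampleOptions.class);
  runCsvToAvro(options);
}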
@Override
public PCollection<List<ElemT>> expand(PCollection<List<ElemT>> input) {
  // Produce a primitive output that mirrors the input's windowing, boundedness, and coder.
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(), input.getWindowingStrategy(), input.isBounded(), input.getCoder());
}
@Override
public PCollection<Long> expand(PCollection<Struct> input) {
  return input.apply(ParDo.of(new EstimateStructSizeFn()));
}
@Override
public PDone expand(PCollection<Iterable<T>> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new SingletonCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());
  return PDone.in(input.getPipeline());
}
@Override
public PCollection<KV<K, Long>> expand(PCollection<KV<K, V>> input) {
  Coder<KV<K, V>> inputCoder = input.getCoder();
  if (!(inputCoder instanceof KvCoder)) {
    throw new IllegalStateException(
        "ApproximateUnique.PerKey requires its input to use KvCoder");
  }
  @SuppressWarnings("unchecked")
  final Coder<V> coder = ((KvCoder<K, V>) inputCoder).getValueCoder();

  return input.apply(Combine.perKey(new ApproximateUniqueCombineFn<>(sampleSize, coder)));
}
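// Usage sketch for the transform backed by the expand above: in the Beam SDK this is exposed as
// ApproximateUnique.perKey(sampleSize), which estimates the number of distinct values per key.
// "events" is a hypothetical PCollection<KV<String, String>>; 10000 is the sample size that
// bounds the estimation error.
PCollection<KV<String, Long>> distinctPerKey =
    events.apply(ApproximateUnique.<String, String>perKey(10000));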
@Override
public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
  return input
      .apply(new GBKIntoKeyedWorkItems<>())
      .setCoder(
          KeyedWorkItemCoder.of(
              ByteArrayCoder.of(),
              ((KvCoder<byte[], KV<InputT, RestrictionT>>) input.getCoder()).getValueCoder(),
              input.getWindowingStrategy().getWindowFn().windowCoder()))
      .apply(new ProcessElements<>(original));
}
@Override
public PDone write(PCollection<IndexedRecord> in) {
  ParquetHdfsFileSink sink = new ParquetHdfsFileSink(doAs, path, overwrite, mergeOutput);
  sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

  PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(ParDo.of(new FormatParquet()));
  pc1 = pc1.setCoder(KvCoder.of(VoidCoder.of(), LazyAvroCoder.of()));

  if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
    return pc1.apply(Write.to(sink));
  } else {
    return pc1.apply(UnboundedWrite.of(sink));
  }
}
/**
 * Applies a window to the input collection if one hasn't already been specified.
 *
 * @return the input collection if it has already been windowed, otherwise the same collection
 *     inside a default window.
 */
public static <T> PCollection<T> ofDefaultWindow(PCollection<T> in) {
  if (in.getWindowingStrategy() != WindowingStrategy.globalDefault()
      && in.getWindowingStrategy() != null) {
    return in;
  }
  return in.apply("ApplyDefaultWindow", Window.<T>into(FixedWindows.of(DEFAULT_WINDOW_SIZE)));
}
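// Illustrative call (with a hypothetical "records" PCollection<String>): ofDefaultWindow is a
// no-op for input that is already windowed, so it can be applied defensively before grouping.
PCollection<String> windowed = ofDefaultWindow(records);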
@Override
public PCollection<KV<K, Iterable<InputT>>> expand(PCollection<KV<K, InputT>> input) {
  Duration allowedLateness = input.getWindowingStrategy().getAllowedLateness();

  checkArgument(
      input.getCoder() instanceof KvCoder,
      "coder specified in the input PCollection is not a KvCoder");
  KvCoder inputCoder = (KvCoder) input.getCoder();
  Coder<K> keyCoder = (Coder<K>) inputCoder.getCoderArguments().get(0);
  Coder<InputT> valueCoder = (Coder<InputT>) inputCoder.getCoderArguments().get(1);

  return input.apply(
      ParDo.of(new GroupIntoBatchesDoFn<>(batchSize, allowedLateness, keyCoder, valueCoder)));
}
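// Sketch of the public entry point this DoFn typically backs: the Beam SDK exposes it as
// GroupIntoBatches.ofSize(batchSize). "keyedInput" is a hypothetical
// PCollection<KV<String, String>> windowed into non-global windows with finite allowed lateness.
PCollection<KV<String, Iterable<String>>> batched =
    keyedInput.apply(GroupIntoBatches.<String, String>ofSize(100));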
public static RunnerApi.PCollection toProto(PCollection<?> pCollection, SdkComponents components)
    throws IOException {
  String coderId = components.registerCoder(pCollection.getCoder());
  String windowingStrategyId =
      components.registerWindowingStrategy(pCollection.getWindowingStrategy());
  // TODO: Display Data
  return RunnerApi.PCollection.newBuilder()
      .setUniqueName(pCollection.getName())
      .setCoderId(coderId)
      .setIsBounded(toProto(pCollection.isBounded()))
      .setWindowingStrategyId(windowingStrategyId)
      .build();
}
public <T> Coder<WindowedValue<T>> getWindowedInputCoder(PCollection<T> collection) {
  Coder<T> valueCoder = collection.getCoder();
  return WindowedValue.getFullCoder(
      valueCoder, collection.getWindowingStrategy().getWindowFn().windowCoder());
}
/** Returns whether this {@link PCollection} has an attached schema. */
@Experimental(Kind.SCHEMAS)
public boolean hasSchema() {
  return getCoder() instanceof SchemaCoder;
}
@Override
public void finishSpecifyingOutput(
    String transformName, PInput input, PTransform<?, ?> transform) {
  // Infer the coder now (or record the failure) so later stages see a resolved coder.
  this.coderOrFailure =
      inferCoderOrFail(
          input, transform, getPipeline().getCoderRegistry(), getPipeline().getSchemaRegistry());
  super.finishSpecifyingOutput(transformName, input, transform);
}
/**
 * Returns a singleton {@link PCollectionList} containing the given {@link PCollection}.
 *
 * <p>Longer {@link PCollectionList PCollectionLists} can be created by calling {@link #and} on
 * the result.
 */
public static <T> PCollectionList<T> of(PCollection<T> pc) {
  return new PCollectionList<T>(pc.getPipeline()).and(pc);
}
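// Usage sketch, assuming "first" and "second" are PCollection<String> instances from the same
// pipeline: build a longer list with and(), then merge it back with Flatten.pCollections().
PCollectionList<String> pcs = PCollectionList.of(first).and(second);
PCollection<String> merged = pcs.apply(Flatten.pCollections());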
private void verifySupportedTrigger(PCollection<Row> pCollection) {
  WindowingStrategy windowingStrategy = pCollection.getWindowingStrategy();

  if (UNBOUNDED.equals(pCollection.isBounded()) && !triggersOncePerWindow(windowingStrategy)) {
    throw new UnsupportedOperationException(
        "Joining unbounded PCollections is currently only supported for "
            + "non-global windows with triggers that are known to produce output once per window, "
            + "such as the default trigger with zero allowed lateness. "
            + "In these cases Beam can guarantee it joins all input elements once per window. "
            + windowingStrategy
            + " is not supported");
  }
}
/** Returns the attached schema's toRowFunction. */
@Experimental(Kind.SCHEMAS)
public SerializableFunction<T, Row> getToRowFunction() {
  if (!hasSchema()) {
    throw new IllegalStateException("Cannot call getToRowFunction when there is no schema");
  }
  return ((SchemaCoder<T>) getCoder()).getToRowFunction();
}
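// Illustrative guard pattern: check hasSchema() before calling getToRowFunction(), which throws
// IllegalStateException otherwise. "people" is a hypothetical PCollection<Person>.
if (people.hasSchema()) {
  SerializableFunction<Person, Row> toRow = people.getToRowFunction();
  Row row = toRow.apply(somePerson); // somePerson: a hypothetical Person instance
}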
@Override
public void visitValue(PValue value, Node producer) {
  if (translationMode.equals(TranslationMode.BATCH)) {
    if (value instanceof PCollection
        && ((PCollection) value).isBounded() == IsBounded.UNBOUNDED) {
      LOG.info(
          "Found unbounded PCollection {}. Switching to streaming execution.", value.getName());
      translationMode = TranslationMode.STREAMING;
    }
  }
}