  @Override
  public PCollection<Iterable<InputT>> expand(PCollection<InputT> input) {
    return input
        .apply(WithKeys.of((Void) null))
        .apply(GroupByKey.create())
        .apply(Values.create());
  }
}
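For context, a minimal self-contained sketch of the same pattern used inline (the pipeline and element values are illustrative, not from the source, and a runner such as the DirectRunner is assumed on the classpath): keying every element to a single null key yields one group per window, so the output is all elements as a single Iterable.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.PCollection;

public class GatherAllExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> words = p.apply(Create.of("a", "b", "c"));

    // Same shape as the expand() above: one shared (null) key, one group,
    // and Values.create() drops the key to leave a single Iterable<String>.
    PCollection<Iterable<String>> all =
        words
            .apply(WithKeys.of((Void) null))
            .apply(GroupByKey.create())
            .apply(Values.create());

    p.run().waitUntilFinish();
  }
}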
/**
 * Returns a {@code Values<V>} {@code PTransform}.
 *
 * @param <V> the type of the values in the input {@code PCollection}, and the type of the
 *     elements in the output {@code PCollection}
 */
public static <V> Values<V> create() {
  return new Values<>();
}
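A minimal usage sketch of this factory (the pipeline and data are assumed for illustration): Values.create() keeps only the value side of each KV element.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class ValuesExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<KV<String, Integer>> scores =
        p.apply(Create.of(KV.of("alice", 3), KV.of("bob", 5)));

    // Values.create() drops the keys, leaving only the Integer values.
    PCollection<Integer> values = scores.apply(Values.<Integer>create());

    p.run().waitUntilFinish();
  }
}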
@Override
public PCollection<T> expand(PCollection<T> input) {
  return input
      .apply("Pair with random key", ParDo.of(new AssignShardFn<>()))
      .apply(Reshuffle.of())
      .apply(Values.create());
}
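This expand() is the classic pair-with-random-key reshuffle shape; recent Beam releases expose the same pattern directly as Reshuffle.viaRandomKey(). A hedged sketch (pipeline and data are illustrative):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Reshuffle;
import org.apache.beam.sdk.values.PCollection;

public class ReshuffleExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> input = p.apply(Create.of("a", "b", "c"));

    // Equivalent public entry point for the pair-with-random-key /
    // Reshuffle / Values pattern shown in the expand() above.
    PCollection<String> reshuffled = input.apply(Reshuffle.viaRandomKey());

    p.run().waitUntilFinish();
  }
}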
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  boolean isGSFileSystem = false;
  PCollection<?> pc2;
  if (path.startsWith("gs://")) {
    isGSFileSystem = true;
    pc2 = in.apply(TextIO.read().from(path));
  } else {
    CsvHdfsFileSource source =
        CsvHdfsFileSource.of(doAs, path, recordDelimiter, encoding, header, textEnclosure, escapeChar);
    source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
    source.setLimit(limit);
    PCollection<KV<org.apache.hadoop.io.LongWritable, BytesWritable>> pc1 = in.apply(Read.from(source));
    pc2 = pc1.apply(Values.<BytesWritable>create());
  }
  Character te = null;
  if (this.textEnclosure != null && !this.textEnclosure.isEmpty()) {
    te = this.textEnclosure.charAt(0);
  }
  Character ec = null;
  if (this.escapeChar != null && !this.escapeChar.isEmpty()) {
    ec = this.escapeChar.charAt(0);
  }
  PCollection<IndexedRecord> pc3 =
      pc2.apply(ParDo.of(new ExtractCsvRecord<>(fieldDelimiter.charAt(0), isGSFileSystem, encoding, te, ec)));
  return pc3;
}
  @Override
  public PCollection<V> expand(PBegin input) {
    return input
        .apply(Create.of((Void) null).withCoder(VoidCoder.of()))
        .apply(Reify.viewAsValues(view, coder))
        .apply(Values.create());
  }
}
  @Override
  public PCollection<ElemT> expand(final PCollection<ElemT> input) {
    input
        .apply(WithKeys.of((Void) null))
        .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
        .apply(GroupByKey.create())
        .apply(Values.create())
        .apply(new WriteView<>(view));
    return input;
  }
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  LazyAvroCoder<IndexedRecord> lac = LazyAvroCoder.of();
  ParquetHdfsFileSource source = ParquetHdfsFileSource.of(doAs, path, lac);
  source.setLimit(limit);
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  PCollection<KV<Void, IndexedRecord>> read =
      in.apply(Read.from(source)).setCoder(source.getDefaultOutputCoder());
  PCollection<IndexedRecord> pc1 = read.apply(Values.<IndexedRecord>create());
  return pc1;
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  LazyAvroCoder<IndexedRecord> lac = LazyAvroCoder.of();
  ExcelHdfsFileSource source =
      ExcelHdfsFileSource.of(doAs, path, lac, limit, encoding, sheetName, header, footer, excelFormat.name());
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  source.setLimit(limit);
  PCollection<KV<Void, IndexedRecord>> pc1 =
      in.apply(Read.from(source)).setCoder(source.getDefaultOutputCoder());
  PCollection<IndexedRecord> pc2 = pc1.apply(Values.<IndexedRecord>create());
  return pc2;
}
@Override
public PCollection<OutputT> expand(PCollection<InputT> input) {
  // Key every element to a single Void key so the combine runs globally.
  PCollection<KV<Void, InputT>> withKeys =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()));

  Combine.PerKey<Void, InputT, OutputT> combine = Combine.fewKeys(fn, fnDisplayData);
  if (!sideInputs.isEmpty()) {
    combine = combine.withSideInputs(sideInputs);
  }

  // With fanout >= 2, hot-key fanout spreads the single key across shards
  // before the final merge.
  PCollection<KV<Void, OutputT>> combined;
  if (fanout >= 2) {
    combined = withKeys.apply(combine.withHotKeyFanout(fanout));
  } else {
    combined = withKeys.apply(combine);
  }

  PCollection<OutputT> output = combined.apply(Values.create());

  if (insertDefault) {
    if (!output.getWindowingStrategy().getWindowFn().isCompatible(new GlobalWindows())) {
      throw new IllegalStateException(fn.getIncompatibleGlobalWindowErrorMessage());
    }
    return insertDefaultValueIfEmpty(output);
  } else {
    return output;
  }
}
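For reference, a caller-side sketch of the Combine.globally path this expand() implements (the pipeline and combine fn are illustrative, not from the source):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.PCollection;

public class GloballyExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<Integer> nums = p.apply(Create.of(1, 2, 3, 4));

    // Internally this keys all elements to a single Void key, runs
    // Combine.fewKeys, and strips the key back off with Values.create().
    PCollection<Integer> total = nums.apply(Combine.globally(Sum.ofIntegers()));

    p.run().waitUntilFinish();
  }
}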
  @Override
  public PDone expand(PCollection<KV<DestinationT, String>> input) {
    input
        .apply(Values.create())
        .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
    return PDone.in(input.getPipeline());
  }
}
  @Override
  public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
    return input
        .apply("Drop key", Values.create())
        .apply("Reshuffle", Reshuffle.of())
        .apply(
            "NaiveProcess",
            ParDo.of(new NaiveProcessFn<InputT, OutputT, RestrictionT, TrackerT>(original.getFn()))
                .withSideInputs(original.getSideInputs())
                .withOutputTags(original.getMainOutputTag(), original.getAdditionalOutputTags()));
  }
}
  @Override
  public PCollection<T> expand(PCollection<T> input) {
    return input
        .apply(WithKeys.of(""))
        .apply(GroupByKey.create())
        .apply(Values.create())
        .apply(Flatten.iterables());
  }
}
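A hedged inline sketch of this redistribution pattern (names and data are illustrative): the round trip through a single-key GroupByKey forces a materialization point, and Flatten.iterables() restores one element per record.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.PCollection;

public class FusionBreakExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> input = p.apply(Create.of("a", "b", "c"));

    // Round-trips every element through a single-key GroupByKey, drops the
    // key, then flattens the grouped iterables back into individual elements.
    PCollection<String> redistributed =
        input
            .apply(WithKeys.of(""))
            .apply(GroupByKey.create())
            .apply(Values.create())
            .apply(Flatten.iterables());

    p.run().waitUntilFinish();
  }
}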
sumScores.apply(Values.create()).apply(Mean.<Integer>globally().asSingletonView());
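The fragment above appears to build a singleton side input from per-key score sums; a speculative reconstruction of the surrounding context (sumScores, its contents, and the pipeline setup are all assumptions):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Mean;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class MeanViewExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Assumed shape: per-user score totals.
    PCollection<KV<String, Integer>> sumScores =
        p.apply(Create.of(KV.of("alice", 3), KV.of("bob", 5)));

    // Drop the keys, average the totals, and expose the mean as a
    // singleton side input (a PCollectionView<Double>).
    PCollectionView<Double> meanScore =
        sumScores.apply(Values.create()).apply(Mean.<Integer>globally().asSingletonView());

    p.run().waitUntilFinish();
  }
}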
@Test
public void testUnboundedSource() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
            .via(input -> KV.of(input.getKey(), input.getValue().hashCode())))
        .apply(GroupByKey.create())
        .apply(Values.create());