@Override
public PCollection<Iterable<InputT>> expand(PCollection<InputT> input) {
  // Key every element with the same (null) key, group them all together,
  // then keep only the grouped values as a single Iterable.
  return input
      .apply(WithKeys.of((Void) null))
      .apply(GroupByKey.create())
      .apply(Values.create());
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Assign a random shard key to each element, reshuffle to redistribute
  // the elements, then drop the temporary keys again.
  return input
      .apply("Pair with random key", ParDo.of(new AssignShardFn<>()))
      .apply(Reshuffle.of())
      .apply(Values.create());
}
@Test
public void testValuesGetName() {
  assertEquals("Values", Values.<Integer>create().getName());
}
@Override
public PCollection<V> expand(PBegin input) {
  // Start from a single placeholder element, reify the contents of the
  // side-input view, and keep only the values.
  return input
      .apply(Create.of((Void) null).withCoder(VoidCoder.of()))
      .apply(Reify.viewAsValues(view, coder))
      .apply(Values.create());
}
@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  // Group the entire collection under a single void key, extract the grouped
  // values, and write them to the view; the input is returned unchanged.
  input
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(new WriteView<>(view));
  return input;
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  LazyAvroCoder<IndexedRecord> lac = LazyAvroCoder.of();
  ParquetHdfsFileSource source = ParquetHdfsFileSource.of(doAs, path, lac);
  source.setLimit(limit);
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  PCollection<KV<Void, IndexedRecord>> read = in.apply(Read.from(source)) //
      .setCoder(source.getDefaultOutputCoder());
  PCollection<IndexedRecord> pc1 = read.apply(Values.<IndexedRecord>create());
  return pc1;
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  LazyAvroCoder<IndexedRecord> lac = LazyAvroCoder.of();
  ExcelHdfsFileSource source =
      ExcelHdfsFileSource.of(doAs, path, lac, limit, encoding, sheetName, header, footer, excelFormat.name());
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  source.setLimit(limit);
  PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(Read.from(source)).setCoder(source.getDefaultOutputCoder());
  PCollection<IndexedRecord> pc2 = pc1.apply(Values.<IndexedRecord>create());
  return pc2;
}
@Override
public PDone expand(PCollection<KV<DestinationT, String>> input) {
  input
      .apply(Values.create())
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
  return PDone.in(input.getPipeline());
}
@Override
public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
  // Drop the random key, reshuffle to redistribute the work, then run the
  // naive (non-splittable) processing of each element/restriction pair.
  return input
      .apply("Drop key", Values.create())
      .apply("Reshuffle", Reshuffle.of())
      .apply(
          "NaiveProcess",
          ParDo.of(
                  new NaiveProcessFn<InputT, OutputT, RestrictionT, TrackerT>(original.getFn()))
              .withSideInputs(original.getSideInputs())
              .withOutputTags(original.getMainOutputTag(), original.getAdditionalOutputTags()));
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Key every element with the same (empty string) key, group, then flatten
  // the grouped iterables back into individual elements.
  return input
      .apply(WithKeys.of(""))
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(Flatten.iterables());
}
@Test
public void testUnboundedSource() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testValues() {
  PCollection<KV<String, Integer>> input =
      p.apply(
          Create.of(Arrays.asList(TABLE))
              .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));

  PCollection<Integer> output = input.apply(Values.create());

  PAssert.that(output).containsInAnyOrder(1, 2, 3, 4, 4);
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testValuesEmpty() {
  PCollection<KV<String, Integer>> input =
      p.apply(
          Create.of(Arrays.asList(EMPTY_TABLE))
              .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));

  PCollection<Integer> output = input.apply(Values.create());

  PAssert.that(output).empty();
  p.run();
}
protected void runTestSimpleCombine(
    List<KV<String, Integer>> table, int globalSum, List<KV<String, String>> perKeyCombines) {
  PCollection<KV<String, Integer>> input = createInput(pipeline, table);

  PCollection<Integer> sum = input.apply(Values.create()).apply(Combine.globally(new SumInts()));
  PCollection<KV<String, String>> sumPerKey = input.apply(Combine.perKey(new TestCombineFn()));

  PAssert.that(sum).containsInAnyOrder(globalSum);
  PAssert.that(sumPerKey).containsInAnyOrder(perKeyCombines);
  pipeline.run();
}
protected void runTestAccumulatingCombine(
    List<KV<String, Integer>> table, Double globalMean, List<KV<String, Double>> perKeyMeans) {
  PCollection<KV<String, Integer>> input = createInput(pipeline, table);

  PCollection<Double> mean =
      input.apply(Values.create()).apply(Combine.globally(new MeanInts()));
  PCollection<KV<String, Double>> meanPerKey = input.apply(Combine.perKey(new MeanInts()));

  PAssert.that(mean).containsInAnyOrder(globalMean);
  PAssert.that(meanPerKey).containsInAnyOrder(perKeyMeans);
  pipeline.run();
}
@SuppressWarnings("unchecked") protected void runTestBasicCombine( List<KV<String, Integer>> table, Set<Integer> globalUnique, List<KV<String, Set<Integer>>> perKeyUnique) { PCollection<KV<String, Integer>> input = createInput(pipeline, table); PCollection<Set<Integer>> unique = input.apply(Values.create()).apply(Combine.globally(new UniqueInts())); PCollection<KV<String, Set<Integer>>> uniquePerKey = input.apply(Combine.perKey(new UniqueInts())); PAssert.that(unique).containsInAnyOrder(globalUnique); PAssert.that(uniquePerKey).containsInAnyOrder(perKeyUnique); pipeline.run(); }
@Test
public void testUnboundedSourceStartReadTimeException() {
  assumeTrue(new ConsumerSpEL().hasOffsetsForTimes());

  noMessagesException.expect(RuntimeException.class);

  int numElements = 1000;
  // In this MockConsumer the timestamp and offset of each element are equal and there
  // are 20 partitions, so no element can be read with this startTime.
  int startTime = numElements / 20;

  p.apply(
          mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn())
              .withStartReadTime(new Instant(startTime))
              .withoutMetadata())
      .apply(Values.create());

  p.run();
}
@Test
public void testUnboundedSourceTimestamps() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());
  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
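All of the snippets above rely on the same core behavior: Values.create() takes a PCollection of KV pairs and keeps only the values, discarding the keys. A minimal, self-contained sketch of that pattern is shown below; the class name ValuesSketch and the literal data are hypothetical, and running it on the default DirectRunner assumes the beam-runners-direct-java dependency is on the classpath.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class ValuesSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Build a small keyed collection, then drop the keys with Values.create().
    PCollection<Integer> values =
        p.apply(Create.of(Arrays.asList(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3))))
            .apply(Values.<Integer>create());

    // Only the values remain; the keys "a", "b", "c" are discarded.
    PAssert.that(values).containsInAnyOrder(1, 2, 3);

    p.run().waitUntilFinish();
  }
}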