/**
 * Key a PCollection of Avro records by a String field name. This is less safe than writing a custom MapFn, but it
 * could significantly reduce code volume in cases that need a lot of disparate collections to be joined or processed
 * according to key values.
 *
 * @param collection PCollection of Avro records to process
 * @param fieldPath The Avro schema field name of the field to key on. Use . separated names for nested records
 * @param fieldType PType of the field you wish to extract from the Avro record.
 * @param <T> record type
 * @param <F> type of the extracted key field
 * @return supplied collection keyed by the field named by {@code fieldPath}
 */
public static <T extends SpecificRecord, F> PTable<F, T> keyByAvroField(PCollection<T> collection, String fieldPath,
    PType<F> fieldType) {
  // The record class is recovered from the collection's PType so callers need not pass it explicitly.
  Class<T> recordType = collection.getPType().getTypeClass();
  return collection.by(new AvroExtractMapFn<T, F>(recordType, fieldPath), fieldType);
}
/**
 * Shuffles the elements of the given collection into a random order.
 * Each element is tagged with a random long key, the keyed table is
 * sorted ascending by that key via a MapReduce job, and the values are
 * then stripped back out.
 */
private static <T> PCollection<T> randomize(PCollection<T> items) {
  PTable<Long, T> keyed = items.by("randomize", new RandomizeFn<T>(), Writables.longs());
  return Sort.sort(keyed, Sort.Order.ASCENDING).values();
}
/**
 * Creates a {@code PCollection<T>} that has the same contents as its input argument but will
 * be written to a fixed number of output files. This is useful for map-only jobs that process
 * lots of input files but only write out a small amount of output per task.
 *
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
  // ShardFn assigns each element a partition key; grouping with numPartitions
  // reducers forces the data through exactly that many output files.
  return pc.by(new ShardFn<T>(), pc.getTypeFamily().ints())
      .groupByKey(numPartitions)
      .ungroup()
      .values();
}
.by(new GetSessionKey(), Avros.strings()) .groupByKey() .parallelDo(new MakeSession(), Avros.specifics(Session.class));
/**
 * Sorts the {@code PCollection} of {@link TupleN}s using the specified column
 * ordering and a client-specified number of reducers.
 *
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
    ColumnOrder... columnOrders) {
  Configuration conf = collection.getPipeline().getConfiguration();
  // Derive the composite sort key from the requested column orderings.
  SortFns.KeyExtraction<T> extraction =
      new SortFns.KeyExtraction<T>(collection.getPType(), columnOrders);
  PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());
  GroupingOptions options = buildGroupingOptions(keyed, conf, numReducers, columnOrders);
  return keyed.groupByKey(options).ungroup().values();
}
GetStorageKey<E> getKey = new GetStorageKey<E>(view, numPartitionWriters); PTable<Pair<GenericData.Record, Integer>, E> table = collection .by(getKey, Avros.pairs(Avros.generics(getKey.schema()), Avros.ints())); PGroupedTable<Pair<GenericData.Record, Integer>, E> grouped = numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
PTable<ByteBuffer, C> cellsByRow = cells.by(new ExtractRowFn<C>(), bytes()); final int versions = scan.getMaxVersions(); return cellsByRow.groupByKey().parallelDo("CombineKeyValueIntoRow",
PTable<ByteBuffer, C> cellsByRow = cells.by(new ExtractRowFn<C>(), bytes()); final int versions = scan.getMaxVersions(); return cellsByRow.groupByKey().parallelDo("CombineKeyValueIntoRow",