/**
 * Converts every line of a {@code PCollection<String>} into a key/value pair and collects the
 * pairs into a {@code PTable<K, V>}.
 *
 * <p>This is a convenience overload: it forwards to the four-argument {@code parseTable},
 * supplying the type family reported by {@code input} itself.
 *
 * @param groupName A label to use for tracking errors related to the parsing process
 * @param input The {@code PCollection<String>} whose lines are parsed
 * @param extractor The {@code Extractor<Pair<K, V>>} applied to each line
 * @return The resulting {@code PTable<K, V>}
 */
public static <K, V> PTable<K, V> parseTable(
    String groupName,
    PCollection<String> input,
    Extractor<Pair<K, V>> extractor) {
  return parseTable(
      groupName,
      input,
      input.getTypeFamily(),
      extractor);
}
/**
 * Converts every line of a {@code PCollection<String>} into a {@code T} and collects the
 * results into a {@code PCollection<T>}.
 *
 * <p>This is a convenience overload: it forwards to the four-argument {@code parse},
 * supplying the type family reported by {@code input} itself.
 *
 * @param groupName A label to use for tracking errors related to the parsing process
 * @param input The {@code PCollection<String>} whose lines are parsed
 * @param extractor The {@code Extractor<T>} applied to each line
 * @return The resulting {@code PCollection<T>}
 */
public static <T> PCollection<T> parse(
    String groupName,
    PCollection<String> input,
    Extractor<T> extractor) {
  return parse(
      groupName,
      input,
      input.getTypeFamily(),
      extractor);
}
/**
 * Turns a collection into a table keyed by its own elements, pairing every element with
 * {@code Boolean.TRUE}.
 *
 * @param coll the collection whose elements become the table keys
 * @return a table whose key type is the collection's {@code PType} and whose values are booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  PTypeFamily family = coll.getTypeFamily();
  DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
      Pair<T, Boolean> marked = Pair.of(input, Boolean.TRUE);
      emitter.emit(marked);
    }
  };
  return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
/**
 * Turns a collection into a table keyed by its own elements, pairing every element with
 * {@code Boolean.TRUE}.
 *
 * @param coll the collection whose elements become the table keys
 * @return a table whose key type is the collection's {@code PType} and whose values are booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  PTypeFamily family = coll.getTypeFamily();
  DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
      Pair<T, Boolean> marked = Pair.of(input, Boolean.TRUE);
      emitter.emit(marked);
    }
  };
  return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
/**
 * Turns a collection into a table keyed by its own elements, pairing every element with
 * {@code Boolean.TRUE}.
 *
 * @param coll the collection whose elements become the table keys
 * @return a table whose key type is the collection's {@code PType} and whose values are booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  PTypeFamily family = coll.getTypeFamily();
  DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
      Pair<T, Boolean> marked = Pair.of(input, Boolean.TRUE);
      emitter.emit(marked);
    }
  };
  return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
/**
 * Keys each trace by a tuple of header values and groups the traces by that tuple.
 *
 * <p>Every entry of {@code keys} names a sort field that is resolved through
 * {@code Fields.getSortField}. A leading {@code '-'} sets the negate flag for that position and
 * is stripped before the lookup. The resolved fields and flags are handed to a
 * {@code HeaderExtractor}, whose output is a table of int-tuples to byte buffers that is then
 * grouped using the partitioner class reported by {@code JoinUtils} for the input's type family.
 *
 * @param traces the traces to key and group
 * @param keys the sort key names, each optionally prefixed with {@code '-'}
 * @return the traces grouped by their extracted header tuple
 * @throws IllegalArgumentException if a key is null, empty, or not resolved to a sort field
 */
public static PGroupedTable<TupleN, ByteBuffer> apply(PCollection<ByteBuffer> traces, List<String> keys) {
  int keyCount = keys.size();
  Field[] fields = new Field[keyCount];
  boolean[] negate = new boolean[keyCount];
  for (int i = 0; i < keyCount; i++) {
    String key = keys.get(i);
    // Reject null/empty keys up front: inspecting the first character of an empty key would
    // otherwise fail with an index error instead of the documented argument error.
    if (key == null || key.isEmpty()) {
      throw new IllegalArgumentException("Unrecognized susort key: " + key);
    }
    if (key.startsWith("-")) {
      negate[i] = true;
      key = key.substring(1);
    }
    fields[i] = Fields.getSortField(key);
    if (fields[i] == null) {
      // Report the key as the caller wrote it, including any '-' prefix.
      throw new IllegalArgumentException("Unrecognized susort key: " + keys.get(i));
    }
  }

  PTypeFamily tf = traces.getTypeFamily();
  // One int-typed tuple slot per sort key.
  PType[] headerTypes = new PType[keyCount];
  for (int i = 0; i < keyCount; i++) {
    headerTypes[i] = tf.ints();
  }
  GroupingOptions options = GroupingOptions.builder()
      .partitionerClass(JoinUtils.getPartitionerClass(tf))
      .build();
  return traces
      .parallelDo("gethw", new HeaderExtractor(fields, negate),
          tf.tableOf(tf.tuples(headerTypes), tf.bytes()))
      .groupByKey(options);
}
}
/**
 * Produces a {@code PCollection<T>} holding the same elements as {@code pc}, redistributed
 * through a group-by-key step that uses {@code numPartitions} partitions. Intended for map-only
 * jobs that read many input files but write only a small amount of output per task, so that the
 * result lands in a fixed number of output files.
 *
 * <p>Each element is keyed with an int produced by {@code ShardFn}, grouped by that key,
 * ungrouped again, and finally stripped of its key.
 *
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
  ShardFn<T> shardKeyFn = new ShardFn<T>();
  return pc
      .by(shardKeyFn, pc.getTypeFamily().ints())
      .groupByKey(numPartitions)
      .ungroup()
      .values();
}
/**
 * Reservoir sampling with an explicit seed, primarily for testing purposes.
 *
 * <p>Every input element is paired with the weight {@code 1} and the weighted collection is
 * passed on to {@code weightedReservoirSample}.
 *
 * @param input The input data
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A {@code PCollection} made up of the sampled elements
 */
public static <T> PCollection<T> reservoirSample(
    PCollection<T> input,
    int sampleSize,
    Long seed) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<T, Integer>> weightedType = family.pairs(input.getPType(), family.ints());
  MapFn<T, Pair<T, Integer>> attachUnitWeight = new MapFn<T, Pair<T, Integer>>() {
    @Override
    public Pair<T, Integer> map(T element) {
      return Pair.of(element, 1);
    }
  };
  PCollection<Pair<T, Integer>> unitWeighted =
      input.parallelDo("Map to pairs for reservoir sampling", attachUnitWeight, weightedType);
  return weightedReservoirSample(unitWeighted, sampleSize, seed);
}
/**
 * Combines all elements of a collection with the given aggregator.
 *
 * <p>Every element is keyed with the same constant ({@code false}), the resulting table is
 * grouped by key with a parallelism argument of 1, the grouped values are combined with
 * {@code aggregator}, and the values of the combined table are returned.
 *
 * @param collect the collection to aggregate
 * @param aggregator the aggregator applied to the grouped values
 * @return the values remaining after the combine step
 */
public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<Boolean, S>> keyByConstant = new MapFn<S, Pair<Boolean, S>>() {
    @Override
    public Pair<Boolean, S> map(S input) {
      return Pair.of(false, input);
    }
  };
  return collect
      .parallelDo("Aggregate.aggregator", keyByConstant,
          family.tableOf(family.booleans(), collect.getPType()))
      .groupByKey(1)
      .combineValues(aggregator)
      .values();
}
}
/**
 * Builds a table of bloom filters from a collection.
 *
 * <p>{@code filterFn} is applied to the collection to produce string-keyed {@code BloomFilter}
 * entries; the entries are then grouped by key (parallelism argument 1) and the filters sharing
 * a key are merged with a {@code BloomFilterAggregator}.
 *
 * @param collection the elements to feed into the filter function
 * @param filterFn the function that emits keyed bloom filters
 * @return the combined table of bloom filters
 */
private static <T> PTable<String, BloomFilter> createFilterTable(
    PCollection<T> collection,
    BloomFilterFn<T> filterFn) {
  PTypeFamily family = collection.getTypeFamily();
  PTable<String, BloomFilter> partialFilters = collection.parallelDo(
      filterFn,
      family.tableOf(family.strings(), Writables.writables(BloomFilter.class)));
  BloomFilterAggregator merger = new BloomFilterAggregator();
  return partialFilters.groupByKey(1).combineValues(merger);
}
/**
 * Counts how often each distinct element occurs in a collection.
 *
 * <p>Each element is mapped to the pair {@code (element, 1L)}; the pairs are grouped by element
 * and the ones are summed with {@code Aggregators.SUM_LONGS()}.
 *
 * @param collect the collection whose elements are counted
 * @return a {@code PTable} from each unique element to its number of occurrences
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
    @Override
    public Pair<S, Long> map(S input) {
      return Pair.of(input, 1L);
    }
  };
  PTable<S, Long> ones = collect.parallelDo(
      "Aggregate.count",
      pairWithOne,
      family.tableOf(collect.getPType(), family.longs()));
  return ones.groupByKey().combineValues(Aggregators.SUM_LONGS());
}
/**
 * Counts how often each distinct element occurs in a collection.
 *
 * <p>Each element is mapped to the pair {@code (element, 1L)}; the pairs are grouped by element
 * and the ones are summed with {@code CombineFn.SUM_LONGS()}.
 *
 * @param collect the collection whose elements are counted
 * @return a {@code PTable} from each unique element to its number of occurrences
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
    @Override
    public Pair<S, Long> map(S input) {
      return Pair.of(input, 1L);
    }
  };
  PTable<S, Long> ones = collect.parallelDo(
      "Aggregate.count",
      pairWithOne,
      family.tableOf(collect.getPType(), family.longs()));
  return ones.groupByKey().combineValues(CombineFn.<S> SUM_LONGS());
}
/**
 * Counts how often each distinct element occurs in a collection, passing an explicit
 * partition count to the group-by-key step.
 *
 * <p>Each element is mapped to the pair {@code (element, 1L)}; the pairs are grouped by element
 * using {@code numPartitions} and the ones are summed with {@code Aggregators.SUM_LONGS()}.
 *
 * @param collect the collection whose elements are counted
 * @param numPartitions the partition count handed to {@code groupByKey}
 * @return a {@code PTable} from each unique element to its number of occurrences
 */
public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
    @Override
    public Pair<S, Long> map(S input) {
      return Pair.of(input, 1L);
    }
  };
  PTable<S, Long> ones = collect.parallelDo(
      "Aggregate.count",
      pairWithOne,
      family.tableOf(collect.getPType(), family.longs()));
  return ones.groupByKey(numPartitions).combineValues(Aggregators.SUM_LONGS());
}
/**
 * Weighted reservoir sampling with the seed exposed, for testing purposes.
 *
 * <p>All weighted observations are placed under the single group key {@code 0}, sampled via
 * {@code groupedWeightedReservoirSample} with a one-element sample-size array, and then the
 * group key is dropped again so that only the sampled values remain.
 *
 * @param input the weighted observations
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A random sample of the given size that respects the weighting values
 */
public static <T, N extends Number> PCollection<T> weightedReservoirSample(
    PCollection<Pair<T, N>> input,
    int sampleSize,
    Long seed) {
  PTypeFamily family = input.getTypeFamily();
  MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>> assignSingleGroup =
      new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
        @Override
        public Pair<Integer, Pair<T, N>> map(Pair<T, N> observation) {
          return Pair.of(0, observation);
        }
      };
  PTable<Integer, Pair<T, N>> singleGroup = input.parallelDo(
      assignSingleGroup,
      family.tableOf(family.ints(), input.getPType()));

  int[] sampleSizes = new int[] { sampleSize };
  PCollection<Pair<Integer, T>> sampled =
      groupedWeightedReservoirSample(singleGroup, sampleSizes, seed);

  MapFn<Pair<Integer, T>, T> dropGroupKey = new MapFn<Pair<Integer, T>, T>() {
    @Override
    public T map(Pair<Integer, T> groupAndValue) {
      return groupAndValue.second();
    }
  };
  // The first sub-type of the input's pair type is used as the element type of the result.
  @SuppressWarnings("unchecked")
  PType<T> valueType = (PType<T>) input.getPType().getSubTypes().get(0);
  return sampled.parallelDo("Extract sampled value from pair", dropGroupKey, valueType);
}
/**
 * Sorts a {@code PCollection} by the natural ordering of its elements, in the requested
 * direction, using the given number of reducers.
 *
 * <p>Elements are turned into keys of a table with null values, the table is grouped with
 * options built by {@code buildGroupingOptions}, and the keys of the ungrouped result are
 * returned.
 *
 * @param collection the collection to sort
 * @param numReducers the number of reducers passed to {@code buildGroupingOptions}
 * @param order the sort direction
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
  Configuration conf = collection.getPipeline().getConfiguration();
  DoFn<T, Pair<T, Void>> elementToKey = new DoFn<T, Pair<T, Void>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Void>> emitter) {
      Pair<T, Void> keyed = Pair.of(input, (Void) null);
      emitter.emit(keyed);
    }
  };
  PTable<T, Void> keyedTable = collection.parallelDo("sort-pre", elementToKey, keyOnlyType);
  GroupingOptions sortOptions = buildGroupingOptions(keyedTable, conf, numReducers, order);
  return keyedTable
      .groupByKey(sortOptions)
      .ungroup()
      .keys();
}
/**
 * Sorts a {@link PCollection} of {@link TupleN}s according to the supplied column orders.
 *
 * <p>Tuples are turned into keys of a table with null values, the table is grouped with options
 * built by {@code buildGroupingOptions}, and the keys are extracted again after ungrouping.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering passed to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection,
    ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<TupleN> tupleType = collection.getPType();
  PTableType<TupleN, Void> keyOnlyType = family.tableOf(
      family.tuples(tupleType.getSubTypes().toArray(new PType[0])),
      family.nulls());

  DoFn<TupleN, Pair<TupleN, Void>> tupleToKey = new DoFn<TupleN, Pair<TupleN, Void>>() {
    @Override
    public void process(TupleN input, Emitter<Pair<TupleN, Void>> emitter) {
      Pair<TupleN, Void> keyed = Pair.of(input, (Void) null);
      emitter.emit(keyed);
    }
  };
  PTable<TupleN, Void> keyedTable = collection.parallelDo(tupleToKey, keyOnlyType);

  Configuration conf = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions = buildGroupingOptions(conf, family, tupleType, columnOrders);
  PTable<TupleN, Void> sortedTable = keyedTable.groupByKey(sortOptions).ungroup();

  DoFn<Pair<TupleN, Void>, TupleN> keyToTuple = new DoFn<Pair<TupleN, Void>, TupleN>() {
    @Override
    public void process(Pair<TupleN, Void> input, Emitter<TupleN> emitter) {
      TupleN tuple = input.first();
      emitter.emit(tuple);
    }
  };
  return sortedTable.parallelDo(keyToTuple, collection.getPType());
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * <p>Each element is mapped to the pair {@code (1, 1L)}; the pairs are grouped under their
 * shared key with a single reducer and the longs are summed. In addition, the mapping function
 * emits one {@code (1, 0L)} pair during cleanup and the returned {@code PObject} is given
 * {@code 0L} as its second constructor argument, so that an empty input still has a count
 * entry to report instead of an empty result collection.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  PTable<Integer, Long> countTable = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        @Override
        public Pair<Integer, Long> map(S input) {
          return Pair.of(1, 1L);
        }

        // Contribute a zero so the sum exists even when no element was mapped.
        @Override
        public void cleanup(Emitter<Pair<Integer, Long>> e) {
          e.emit(Pair.of(1, 0L));
        }
      }, tf.tableOf(tf.ints(), tf.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  PCollection<Long> count = countTable.values();
  return new FirstElementPObject<Long>(count, 0L);
}
/**
 * Counts the elements of the provided PCollection.
 *
 * <p>Each element is mapped to the pair {@code (1, 1L)} and one extra {@code (1, 0L)} pair is
 * emitted during cleanup. The pairs are grouped under their shared key with a single reducer,
 * the longs are summed, and the first value of the result is exposed through a
 * {@code FirstElementPObject} constructed with {@code 0L} as its second argument.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<Integer, Long>> oneByConstantKey = new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
      return Pair.of(1, 1L);
    }

    @Override
    public void cleanup(Emitter<Pair<Integer, Long>> e) {
      Pair<Integer, Long> zero = Pair.of(1, 0L);
      e.emit(zero);
    }
  };
  PTable<Integer, Long> ones = collect.parallelDo(
      "Aggregate.count",
      oneByConstantKey,
      family.tableOf(family.ints(), family.longs()));
  GroupingOptions singleReducer = GroupingOptions.builder().numReducers(1).build();
  PTable<Integer, Long> totals =
      ones.groupByKey(singleReducer).combineValues(Aggregators.SUM_LONGS());
  return new FirstElementPObject<Long>(totals.values(), 0L);
}
/**
 * Sorts a {@link PCollection} by the natural ordering of its elements, in the requested
 * direction.
 *
 * <p>Elements are turned into keys of a table with null values ("sort-pre"), the table is
 * grouped with options built by {@code buildGroupingOptions}, and the keys are extracted again
 * after ungrouping ("sort-post").
 *
 * @param collection the collection to sort
 * @param order the sort direction
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
  Configuration conf = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions = buildGroupingOptions(conf, family, collection.getPType(), order);

  DoFn<T, Pair<T, Void>> elementToKey = new DoFn<T, Pair<T, Void>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Void>> emitter) {
      Pair<T, Void> keyed = Pair.of(input, (Void) null);
      emitter.emit(keyed);
    }
  };
  PTable<T, Void> keyedTable = collection.parallelDo("sort-pre", elementToKey, keyOnlyType);
  PTable<T, Void> sortedTable = keyedTable.groupByKey(sortOptions).ungroup();

  DoFn<Pair<T, Void>, T> keyToElement = new DoFn<Pair<T, Void>, T>() {
    @Override
    public void process(Pair<T, Void> input, Emitter<T> emitter) {
      T element = input.first();
      emitter.emit(element);
    }
  };
  return sortedTable.parallelDo("sort-post", keyToElement, collection.getPType());
}
/**
 * Sorts a {@link PCollection} by the natural ordering of its elements, in the requested
 * direction.
 *
 * <p>Elements are turned into keys of a table with null values ("sort-pre"), the table is
 * grouped with options built by {@code buildGroupingOptions}, and the keys are extracted again
 * after ungrouping ("sort-post").
 *
 * @param collection the collection to sort
 * @param order the sort direction
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
  Configuration conf = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions = buildGroupingOptions(conf, family, collection.getPType(), order);

  DoFn<T, Pair<T, Void>> elementToKey = new DoFn<T, Pair<T, Void>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Void>> emitter) {
      Pair<T, Void> keyed = Pair.of(input, (Void) null);
      emitter.emit(keyed);
    }
  };
  PTable<T, Void> keyedTable = collection.parallelDo("sort-pre", elementToKey, keyOnlyType);
  PTable<T, Void> sortedTable = keyedTable.groupByKey(sortOptions).ungroup();

  DoFn<Pair<T, Void>, T> keyToElement = new DoFn<Pair<T, Void>, T>() {
    @Override
    public void process(Pair<T, Void> input, Emitter<T> emitter) {
      T element = input.first();
      emitter.emit(element);
    }
  };
  return sortedTable.parallelDo("sort-post", keyToElement, collection.getPType());
}