/**
 * Convert this LGroupedTable back into an {@link LTable} by ungrouping it. A "reduce" operation is
 * still triggered, so this is usually reserved for special cases such as producing a globally-ordered
 * list by feeding everything through a single reducer.
 *
 * @return the result of ungrouping the underlying grouped table, wrapped by this table's factory
 */
default LTable<K, V> ungroup() {
  final LTable<K, V> flattened = factory().wrap(underlying().ungroup());
  return flattened;
}
/**
 * Combine the values of this grouped table with the supplied Crunch {@link Aggregator}. In the
 * MapReduce implementation this is optimised into both a combine and a reduce step, and other
 * implementations have similar optimisations available.
 *
 * @param aggregator the aggregator applied to the values
 * @return the combined underlying table, wrapped by this table's factory
 */
default LTable<K, V> combineValues(Aggregator<V> aggregator) {
  final LTable<K, V> combined = factory().wrap(underlying().combineValues(aggregator));
  return combined;
}
/**
 * Runs a {@code Reducer} class over a grouped table by wrapping it in a {@code ReducerFn} and
 * handing that function to {@code parallelDo}.
 *
 * @param input the grouped table to process
 * @param reducerClass the reducer class passed to the {@code ReducerFn}
 * @param keyClass the class of the output keys, used to build the output table type
 * @param valueClass the class of the output values, used to build the output table type
 * @return the table returned by {@code parallelDo}
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> reduce(
    PGroupedTable<K1, V1> input,
    Class<? extends Reducer<K1, V1, K2, V2>> reducerClass,
    Class<K2> keyClass,
    Class<V2> valueClass) {
  ReducerFn<K1, V1, K2, V2> reducerFn = new ReducerFn<K1, V1, K2, V2>(reducerClass);
  return input.parallelDo(reducerFn, tableOf(keyClass, valueClass));
}
/**
 * The {@code mapValues} counterpart for {@code PGroupedTable<K, U>} collections: keys are passed
 * through an identity function while each key's {@code Iterable<U>} of values is handed to
 * {@code mapFn}.
 *
 * @param name The name of the operation
 * @param ptable The {@code PGroupedTable} to be mapped
 * @param mapFn The mapping function applied to each group of values
 * @param ptype The PType for the returned values
 * @return A new {@code PTable<K, V>} instance
 */
public static <K, U, V> PTable<K, V> mapValues(String name, PGroupedTable<K, U> ptable,
    MapFn<Iterable<U>, V> mapFn, PType<V> ptype) {
  PTypeFamily family = ptable.getTypeFamily();
  PairMapFn<K, Iterable<U>, K, V> keyPreservingFn =
      new PairMapFn<K, Iterable<U>, K, V>(IdentityFn.<K>getInstance(), mapFn);
  // The first sub-type of the grouped table's PType is used as the key type of the output table.
  PType<K> keyType = (PType<K>) ptable.getPType().getSubTypes().get(0);
  return ptable.parallelDo(name, keyPreservingFn, family.tableOf(keyType, ptype));
}
/**
 * One-to-many join variant that lets the caller choose the number of reducers. The two tables are
 * first passed to {@code DefaultJoinStrategy.preJoin}, and the grouped result is then processed by
 * a {@code OneToManyJoinFn} wrapping {@code postProcessFn}.
 *
 * @param left left-side table to join
 * @param right right-side table to join
 * @param postProcessFn DoFn to process the results of the join
 * @param ptype type of the output of the postProcessFn
 * @param numReducers The number of reducers to use
 * @return the post-processed output of the join
 */
public static <K, U, V, T> PCollection<T> oneToManyJoin(PTable<K, U> left, PTable<K, V> right,
    DoFn<Pair<U, Iterable<V>>, T> postProcessFn, PType<T> ptype, int numReducers) {
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined =
      DefaultJoinStrategy.preJoin(left, right, numReducers);
  String stageName = "One to many join " + preJoined.getName();
  OneToManyJoinFn<K, U, V, T> joinFn =
      new OneToManyJoinFn<K, U, V, T>(left.getValueType(), postProcessFn);
  return preJoined.parallelDo(stageName, joinFn, ptype);
}
// NOTE(review): this is an excerpt of a larger method; the braces are unbalanced, so statements may
// have been elided between the ones shown. As written, `sorted` is set to null immediately before
// `sorted.ungroup()` is dereferenced, which would throw a NullPointerException if these statements
// really are adjacent in the same branch -- TODO confirm against the full source.
input = input.parallelDo(pieces[0], new SUDoFn(proc), ptf.bytes()); } else { input = sorted.parallelDo(pieces[0], new SUPostGroupFn(proc), ptf.bytes()); sorted = null; input = PTables.values(sorted.ungroup());
/**
 * Look up the {@link PType} that can be used to serialize the value part of this grouped table.
 *
 * @return the value type of the table type behind the underlying grouped table type
 */
default PType<V> valueType() {
  final PType<V> valuePType = underlying().getGroupedTableType().getTableType().getValueType();
  return valuePType;
}
/**
 * Joins two tables with the supplied {@code JoinFn}. The inputs are first passed to
 * {@code preJoin}, and the grouped result is processed by {@code joinFn} under an operation name
 * built from the join type followed by the grouped table's name.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join function applied to the grouped data
 * @return a table from the left table's key type to pairs of the left and right value types
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> tagged = preJoin(left, right);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String opName = joinFn.getJoinType() + tagged.getName();
  return tagged.parallelDo(opName, joinFn, joinedType);
}
/**
 * Look up the {@link PType} that can be used to serialize the key part of this grouped table.
 *
 * @return the key type of the table type behind the underlying grouped table type
 */
default PType<K> keyType() {
  final PType<K> keyPType = underlying().getGroupedTableType().getTableType().getKeyType();
  return keyPType;
}
/**
 * Build a table of the distinct items in the input collection with their counts, sorted
 * descending by frequency.
 *
 * <p>The counts are passed through {@code negateCounts}, grouped with {@code groupByKey(1)} and
 * ungrouped, then passed through {@code negateCounts} a second time.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  PTable<X, Long> counts = input.count();
  return negateCounts(negateCounts(counts).groupByKey(1).ungroup());
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println(); System.err.println("Two and only two arguments are accepted."); System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); System.err.println(); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Create an object to coordinate pipeline creation and execution. Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf()); // Reference a given text file as a collection of Strings. PCollection<String> lines = pipeline.readTextFile(args[0]); // Aggregator used for summing up response size Aggregator<Long> agg = Aggregators.SUM_LONGS(); // Table of (ip, sum(response size)) PTable<String, Long> ipAddrResponseSize = lines .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey() .combineValues(agg); pipeline.writeTextFile(ipAddrResponseSize, args[1]); // Execute the pipeline as a MapReduce. PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Secondary-sorts the given {@code PTable} and then runs a {@code DoFn} over the sorted data,
 * producing an output {@code PTable<U, V>}.
 *
 * @param input the table to sort; its values are pairs
 * @param doFn the function applied to each key together with its iterable of value pairs
 * @param ptype the table type of the output
 * @return the table produced by applying the wrapped {@code doFn} to the prepared input
 */
public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn, PTableType<U, V> ptype) {
  SSWrapFn<K, V1, V2, Pair<U, V>> wrappedFn = new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn);
  return prepare(input).parallelDo("SecondarySort.apply", wrappedFn, ptype);
}
/**
 * Joins two tables using the given {@code JoinFn}: the tables go through {@code preJoin}, and the
 * grouped output is handed to {@code joinFn} in a {@code parallelDo} named after the join type
 * and the grouped table's name.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join function applied to the grouped data
 * @return a table from the left table's key type to pairs of the left and right value types
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily typeFamily = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right);
  PTableType<K, Pair<U, V>> resultType = typeFamily.tableOf(
      left.getKeyType(),
      typeFamily.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + preJoined.getName();
  return preJoined.parallelDo(stageName, joinFn, resultType);
}
/**
 * Sorts the {@code PTable} by the natural ordering of its keys, in the requested order and with a
 * client-specified number of reducers.
 *
 * @param table the table to sort
 * @param numReducers the number of reducers passed to the grouping options
 * @param key the order in which the keys should be sorted
 * @return a {@code PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, int numReducers, Order key) {
  GroupingOptions sortOptions =
      buildGroupingOptions(table, table.getPipeline().getConfiguration(), numReducers, key);
  return table.groupByKey(sortOptions).ungroup();
}
.groupByKey(1).combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Applies a {@code Reducer} class to a grouped table: the class is wrapped in a {@code ReducerFn},
 * which is then passed to {@code parallelDo} together with a table type built from the key and
 * value classes.
 *
 * @param input the grouped table to process
 * @param reducerClass the reducer class passed to the {@code ReducerFn}
 * @param keyClass the class of the output keys
 * @param valueClass the class of the output values
 * @return the table returned by {@code parallelDo}
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> reduce(
    PGroupedTable<K1, V1> input,
    Class<? extends Reducer<K1, V1, K2, V2>> reducerClass,
    Class<K2> keyClass,
    Class<V2> valueClass) {
  ReducerFn<K1, V1, K2, V2> wrappedReducer = new ReducerFn<K1, V1, K2, V2>(reducerClass);
  return input.parallelDo(wrappedReducer, tableOf(keyClass, valueClass));
}
/**
 * Performs a default join on the given {@code PTable} instances with a user-specified
 * {@code JoinFn}. The tables are passed to {@code preJoin} along with this instance's
 * {@code numReducers}, and the grouped result is processed by {@code joinFn}.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn The user-specified implementation of the {@code JoinFn} class
 * @return joined tables
 */
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right, numReducers);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + preJoined.getName();
  return preJoined.parallelDo(stageName, joinFn, joinedType);
}
/**
 * Build a table of the distinct items in the input collection with their counts, sorted
 * descending by frequency.
 *
 * <p>The counts are passed through {@code SPTables.negateCounts}, grouped with
 * {@code groupByKey(1)} and ungrouped, then passed through {@code SPTables.negateCounts} again.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  PTable<X, Long> counts = input.count();
  return SPTables.negateCounts(SPTables.negateCounts(counts).groupByKey(1).ungroup());
}
}
.groupByKey().combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Secondary-sorts the given {@code PTable} and then runs a {@code DoFn} over the sorted data,
 * producing an output {@code PCollection<T>}.
 *
 * @param input the table to sort; its values are pairs
 * @param doFn the function applied to each key together with its iterable of value pairs
 * @param ptype the type of the output elements
 * @return the collection produced by applying the wrapped {@code doFn} to the prepared input
 */
public static <K, V1, V2, T> PCollection<T> sortAndApply(PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn, PType<T> ptype) {
  SSWrapFn<K, V1, V2, T> wrappedFn = new SSWrapFn<K, V1, V2, T>(doFn);
  return prepare(input).parallelDo("SecondarySort.apply", wrappedFn, ptype);
}