/**
 * Builds a global toplist: every distinct element of the input collection paired
 * with its occurrence count, ordered descending by frequency.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  // Negate the counts so that a natural ascending sort yields descending
  // frequency, force everything through a single reducer for a total order,
  // then negate again to restore the original count values.
  PTable<X, Long> negated = negateCounts(input.count());
  return negateCounts(negated.groupByKey(1).ungroup());
}
/**
 * Builds a global toplist: every distinct element of the input collection paired
 * with its occurrence count, ordered descending by frequency.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  // Negate the counts so that a natural ascending sort yields descending
  // frequency, force everything through a single reducer for a total order,
  // then negate again to restore the original count values.
  PTable<X, Long> negated = SPTables.negateCounts(input.count());
  return SPTables.negateCounts(negated.groupByKey(1).ungroup());
}
}
// Count the number of occurrences of each distinct word.
// NOTE(review): "words" is a PCollection<String> declared outside this view — confirm.
PTable<String, Long> counts = words.count();
/**
 * Calculates a set of percentiles for each key in a numerically-valued table.
 *
 * Percentiles are computed per key by counting, joining and sorting. This is highly
 * scalable, but takes 2 more map-reduce cycles than an in-memory computation. Prefer
 * inMemory if each key has fewer than on the order of 10M values.
 *
 * The percentile definition used is "nearest rank":
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 first percentile (in the range 0.0 - 1.0)
 * @param pn additional percentiles (in the range 0.0 - 1.0)
 * @param <K> key type of the table
 * @param <V> value type of the table (must extend java.lang.Number)
 * @return PTable mapping each key to pairs of the requested percentiles and their results
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> percentiles = createListFromVarargs(p1, pn);

  PTypeFamily tf = table.getTypeFamily();
  // Pair every record with the total value count for its key, then swap the
  // pair so the numeric value leads — that is the secondary-sort ordering key.
  PTable<K, Pair<V, Long>> sortable = table.keys().count()
      .join(table)
      .mapValues(new SwapPairComponents<Long, V>(),
          tf.pairs(table.getValueType(), tf.longs()));

  return SecondarySort.sortAndApply(
      sortable,
      new DistributedPercentiles<K, V>(percentiles),
      tf.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
/**
 * Calculates a set of quantiles for each key in a numerically-valued table.
 *
 * Quantiles are computed per key by counting, joining and sorting. This is highly
 * scalable, but takes 2 more map-reduce cycles than an in-memory computation. Prefer
 * inMemory if each key has fewer than on the order of 10M values.
 *
 * The quantile definition used is "nearest rank":
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 first quantile (in the range 0.0 - 1.0)
 * @param pn additional quantiles (in the range 0.0 - 1.0)
 * @param <K> key type of the table
 * @param <V> value type of the table (must extend java.lang.Number)
 * @return PTable mapping each key to pairs of the requested quantiles and their results
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> quantiles = createListFromVarargs(p1, pn);

  PTypeFamily tf = table.getTypeFamily();
  // Pair every record with the total value count for its key, then swap the
  // pair so the numeric value leads — that is the secondary-sort ordering key.
  PTable<K, Pair<V, Long>> sortable = table.keys().count()
      .join(table)
      .mapValues(new SwapPairComponents<Long, V>(),
          tf.pairs(table.getValueType(), tf.longs()));

  return SecondarySort.sortAndApply(
      sortable,
      new DistributedQuantiles<K, V>(quantiles),
      tf.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}