org.apache.crunch.PTable.values java code examples

/**
 * Drop the keys of this table and keep only its values.
 *
 * @return an {@link LCollection} containing just the values from this table, produced by
 *         wrapping the values of the underlying table with this table's factory
 */
default LCollection<V> values() {
  return factory()
      .wrap(underlying().values());
}

/**
 * Builds a {@code BloomFilter} for the given collection and exposes it as a {@code PObject}.
 *
 * <p>As a side effect, the collection's name is stored in the pipeline configuration under
 * {@code BloomFilterFn.CRUNCH_FILTER_NAME} before the filter table is created.
 *
 * @param collection the collection the filter is built from
 * @param filterFn the function handed to {@code createFilterTable} together with the collection
 * @return a {@code FirstElementPObject} over the values of the filter table
 */
public static <T> PObject<BloomFilter> createFilter(PCollection<T> collection, BloomFilterFn<T> filterFn) {
  String filterName = collection.getName();
  collection.getPipeline().getConfiguration().set(BloomFilterFn.CRUNCH_FILTER_NAME, filterName);
  PCollection<BloomFilter> filters = createFilterTable(collection, filterFn).values();
  return new FirstElementPObject<BloomFilter>(filters);
}

/**
 * Randomizes the order of the items in the collection via a MapReduce job.
 *
 * <p>Every item is keyed by a {@code Long} produced by {@code RandomizeFn}; the keyed table is
 * sorted in ascending key order and the keys are then discarded.
 */
private static <T> PCollection<T> randomize(PCollection<T> items) {
  PTable<Long, T> keyed = items.by("randomize", new RandomizeFn<T>(), Writables.longs());
  return Sort.sort(keyed, Sort.Order.ASCENDING).values();
}

/**
 * Rebalances a {@code PCollection<T>} so that it is written to a fixed number of output files
 * while keeping the same contents. This helps map-only jobs that read many input files but
 * only write a small amount of output per task.
 *
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
  return pc
      // Key every element with an int produced by ShardFn.
      .by(new ShardFn<T>(), pc.getTypeFamily().ints())
      // Group by that key, requesting numPartitions partitions.
      .groupByKey(numPartitions)
      // Flatten the groups again and keep only the original elements.
      .ungroup()
      .values();
}

 /**
  * Combines the elements of a collection with the given {@code Aggregator}.
  *
  * <p>Each element is paired with the constant key {@code false}, the resulting table is
  * grouped with {@code groupByKey(1)}, the grouped values are combined with
  * {@code aggregator}, and the keys are dropped.
  *
  * @param collect the collection whose elements are aggregated
  * @param aggregator the aggregator applied to the grouped values
  * @return the values that remain after combining
  */
 public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Put every element under the same key so that all of them end up in one group.
  MapFn<S, Pair<Boolean, S>> keyByFalse = new MapFn<S, Pair<Boolean, S>>() {
   @Override
   public Pair<Boolean, S> map(S input) {
    return Pair.of(false, input);
   }
  };
  return collect
    .parallelDo("Aggregate.aggregator", keyByFalse,
      typeFamily.tableOf(typeFamily.booleans(), collect.getPType()))
    .groupByKey(1)
    .combineValues(aggregator)
    .values();
 }
}

// Read the given source through the pipeline and return only the values of the result.
return pipeline.read(source).values();

/**
 * Sorts a {@code PCollection} of tuples using the specified column ordering and a
 * client-specified number of reducers.
 *
 * @param collection the tuples to sort
 * @param numReducers the number of reducers passed on to {@code buildGroupingOptions}
 * @param columnOrders the column ordering used both to extract the sort key and to build the
 *        grouping options
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
    ColumnOrder... columnOrders) {
  SortFns.KeyExtraction<T> extraction =
      new SortFns.KeyExtraction<T>(collection.getPType(), columnOrders);
  // Key each tuple by the extracted sort key.
  PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());
  GroupingOptions sortOptions = buildGroupingOptions(
      keyed, collection.getPipeline().getConfiguration(), numReducers, columnOrders);
  // Group with the sort options, then flatten and drop the keys.
  return keyed.groupByKey(sortOptions).ungroup().values();
}

/**
 * Returns the number of elements in the provided PCollection.
 *
 * <p>Every element is mapped to the pair {@code (1, 1L)}; the pairs are grouped with a single
 * reducer and their values summed. A {@code (1, 0L)} pair is additionally emitted during
 * cleanup and {@code 0L} is passed to {@code FirstElementPObject}, so that an empty
 * collection yields a count of zero rather than no value at all.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 PTable<Integer, Long> countTable = collect
   .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
     return Pair.of(1, 1L);
    }

    @Override
    public void cleanup(org.apache.crunch.Emitter<Pair<Integer, Long>> emitter) {
     // Contributes nothing to the sum, but makes sure the key is emitted even when
     // map() was never called.
     emitter.emit(Pair.of(1, 0L));
    }
   }, tf.tableOf(tf.ints(), tf.longs()))
   .groupByKey(GroupingOptions.builder().numReducers(1).build())
   .combineValues(Aggregators.SUM_LONGS());
 PCollection<Long> count = countTable.values();
 // 0L mirrors the sibling implementation of length() in this file; presumably it is the
 // value used when the collection has no first element.
 return new FirstElementPObject<Long>(count, 0L);
}

/**
 * Counts the elements of the provided PCollection.
 *
 * <p>Each element contributes the pair {@code (1, 1L)} and a {@code (1, 0L)} pair is emitted
 * during cleanup; the pairs are grouped with a single reducer and the values are summed.
 * {@code 0L} is handed to {@code FirstElementPObject} as its second argument (presumably the
 * value used when no element is available).
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<Integer, Long>> countFn = new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
      return Pair.of(1, 1L);
    }

    @Override
    public void cleanup(Emitter<Pair<Integer, Long>> emitter) {
      // Adds nothing to the sum; emitted so the key is present even without any input.
      emitter.emit(Pair.of(1, 0L));
    }
  };
  PTable<Integer, Long> summed = collect
      .parallelDo("Aggregate.count", countFn, family.tableOf(family.ints(), family.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  return new FirstElementPObject<Long>(summed.values(), 0L);
}

 PGroupedTable<Pair<GenericData.Record, Integer>, E> grouped =
   numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
 return grouped.ungroup().values();
} else {
 return partition(collection, numWriters);

Javadoc

Returns a PCollection made up of the values in this PTable.

Popular methods of PTable

groupByKey
Performs a grouping operation on the keys of this table, using the additional GroupingOptions to control how the grouping is executed.
keys
Returns a PCollection made up of the keys in this PTable.
parallelDo
getKeyType
Returns the PType of the key.
getPTableType
Returns the PTableType of this PTable.
getValueType
Returns the PType of the value.
union
Returns a PTable instance that acts as the union of this PTable and the input PTables.
getTypeFamily
join
Perform an inner join on this table and the one passed in as an argument on their common keys.
cogroup
Co-group operation with the given table. Note: If the given table contains keys that are not present…
count
getPipeline

Popular in Java

Creating JSON documents from java classes using gson
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided supplier.
findViewById (Activity)
notifyDataSetChanged (ArrayAdapter)
Thread (java.lang)
A thread is a thread of execution in a program. The Java Virtual Machine allows an application to have multiple threads of execution running concurrently.
List (java.util)
An ordered collection (also known as a sequence). The user of this interface has precise control over where in the list each element is inserted.
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
Container (java.awt)
A generic Abstract Window Toolkit(AWT) container object is a component that can contain other AWT co
Filter (javax.servlet)
A filter is an object that performs filtering tasks on either the request to a resource (a servlet o
JList (javax.swing)
Top 12 Jupyter Notebook extensions

How to use the values method in org.apache.crunch.PTable

Best Java code snippets using org.apache.crunch.PTable.values (Showing top 10 results out of 315)

How to use the values method in org.apache.crunch.PTable