org.apache.crunch.PCollection.getTypeFamily java code examples

/**
 * Turns every line of {@code input} into a key-value pair via {@code extractor} and collects
 * the pairs in a {@code PTable<K, V>}.
 *
 * <p>Convenience overload: it delegates to the four-argument {@code parseTable}, supplying the
 * type family reported by {@code input}.
 *
 * @param groupName label under which errors from the parsing process are tracked
 * @param input the {@code PCollection<String>} whose lines are parsed
 * @param extractor the {@code Extractor<Pair<K, V>>} applied to each line
 * @return the resulting {@code PTable<K, V>}
 */
public static <K, V> PTable<K, V> parseTable(
  String groupName,
  PCollection<String> input,
  Extractor<Pair<K, V>> extractor) {
 return parseTable(groupName, input, input.getTypeFamily(), extractor);
}

/**
 * Turns every line of {@code input} into a {@code T} via {@code extractor} and collects the
 * results in a {@code PCollection<T>}.
 *
 * <p>Convenience overload: it delegates to the four-argument {@code parse}, supplying the
 * type family reported by {@code input}.
 *
 * @param groupName label under which errors from the parsing process are tracked
 * @param input the {@code PCollection<String>} whose lines are parsed
 * @param extractor the {@code Extractor<T>} applied to each line
 * @return the resulting {@code PCollection<T>}
 */
public static <T> PCollection<T> parse(
  String groupName,
  PCollection<String> input,
  Extractor<T> extractor) {
 return parse(groupName, input, input.getTypeFamily(), extractor);
}

/**
 * Converts a collection into a table keyed by its elements, pairing every element with
 * {@code Boolean.TRUE}.
 *
 * @param coll the collection to convert
 * @return a table whose keys are the elements of {@code coll} and whose values are all true
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 final PTypeFamily family = coll.getTypeFamily();
 DoFn<T, Pair<T, Boolean>> tagWithTrue = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Boolean>> out) {
   out.emit(Pair.of(element, Boolean.TRUE));
  }
 };
 return coll.parallelDo(tagWithTrue, family.tableOf(coll.getPType(), family.booleans()));
}

/**
 * Builds a {@code PTable} from {@code coll} in which each element becomes a key mapped to
 * {@code Boolean.TRUE}. The table type is derived from the collection's own type family
 * and element type.
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 return coll.parallelDo(
   new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T value, Emitter<Pair<T, Boolean>> sink) {
     Pair<T, Boolean> marked = Pair.of(value, Boolean.TRUE);
     sink.emit(marked);
    }
   },
   coll.getTypeFamily().tableOf(coll.getPType(), coll.getTypeFamily().booleans()));
}

/**
 * Re-keys a collection as a table: each input element is emitted once as
 * {@code (element, Boolean.TRUE)}.
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 PTypeFamily ptf = coll.getTypeFamily();
 DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T item, Emitter<Pair<T, Boolean>> target) {
   Pair<T, Boolean> entry = Pair.of(item, Boolean.TRUE);
   target.emit(entry);
  }
 };
 return coll.parallelDo(
   markPresent,
   ptf.tableOf(coll.getPType(), ptf.booleans()));
}

 /**
  * Keys each trace by a tuple of integer header values, one per requested sort key, and
  * groups the traces by that tuple.
  *
  * <p>A key with a leading {@code '-'} has the dash stripped before lookup and its
  * {@code negate} flag set; the flags are handed to {@code HeaderExtractor}
  * (presumably to reverse the ordering for that key; confirm against {@code HeaderExtractor}).
  *
  * @param traces the traces to group
  * @param keys the sort key names, each optionally prefixed with {@code '-'}
  * @return the traces grouped by their extracted header tuple
  * @throws IllegalArgumentException if a key is null, empty, or not recognized by
  *     {@code Fields.getSortField}
  */
 public static PGroupedTable<TupleN, ByteBuffer> apply(PCollection<ByteBuffer> traces, List<String> keys) {
  final int numKeys = keys.size();
  Field[] fields = new Field[numKeys];
  boolean[] negate = new boolean[numKeys];
  for (int i = 0; i < numKeys; i++) {
   String key = keys.get(i);
   // Reject null/empty keys up front: charAt(0) would otherwise fail with an
   // unhelpful NullPointerException or StringIndexOutOfBoundsException.
   if (key == null || key.isEmpty()) {
    throw new IllegalArgumentException("Unrecognized susort key: " + key);
   }
   if (key.charAt(0) == '-') {
    negate[i] = true;
    key = key.substring(1);
   }
   fields[i] = Fields.getSortField(key);
   if (fields[i] == null) {
    // Report the key as the caller supplied it, including any leading dash.
    throw new IllegalArgumentException("Unrecognized susort key: " + keys.get(i));
   }
  }
  
  PTypeFamily tf = traces.getTypeFamily();
  // Every header component is typed as an int; the array stays raw because Java
  // does not allow creating arrays of a parameterized type.
  PType[] headerTypes = new PType[numKeys];
  for (int i = 0; i < numKeys; i++) {
   headerTypes[i] = tf.ints();
  }
  
  GroupingOptions options = GroupingOptions.builder()
    .partitionerClass(JoinUtils.getPartitionerClass(tf)).build();
  return traces.parallelDo("gethw", new HeaderExtractor(fields, negate),
     tf.tableOf(tf.tuples(headerTypes), tf.bytes())).groupByKey(options);
 }
}

/**
 * Produces a {@code PCollection<T>} with the same contents as {@code pc} that will be written
 * to a fixed number of output files. Handy for map-only jobs that read many input files but
 * emit only a small amount of data per task.
 *
 * <p>The elements are keyed by a {@code ShardFn}, grouped into {@code numPartitions}
 * partitions, ungrouped again, and the keys are dropped.
 *
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
 ShardFn<T> shardKeyFn = new ShardFn<T>();
 return pc
   .by(shardKeyFn, pc.getTypeFamily().ints())
   .groupByKey(numPartitions)
   .ungroup()
   .values();
}

/**
 * Reservoir sampling with an explicit seed, intended mainly for tests.
 *
 * <p>Each element is paired with a weight of 1 and the work is handed to
 * {@code weightedReservoirSample}.
 *
 * @param input The input data
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A {@code PCollection} made up of the sampled elements
 */
public static <T> PCollection<T> reservoirSample(
  PCollection<T> input,
  int sampleSize,
  Long seed) {
 PTypeFamily family = input.getTypeFamily();
 PType<Pair<T, Integer>> weightedType = family.pairs(input.getPType(), family.ints());
 MapFn<T, Pair<T, Integer>> withUnitWeight = new MapFn<T, Pair<T, Integer>>() {
  @Override
  public Pair<T, Integer> map(T element) {
   return Pair.of(element, 1);
  }
 };
 PCollection<Pair<T, Integer>> weighted =
   input.parallelDo("Map to pairs for reservoir sampling", withUnitWeight, weightedType);
 return weightedReservoirSample(weighted, sampleSize, seed);
}

 /**
  * Combines all values of {@code collect} with the given {@code aggregator}.
  *
  * <p>Every element is placed under the same key ({@code false}), the table is grouped with
  * a group-by argument of 1, the aggregator is applied to the grouped values, and the key is
  * dropped again.
  *
  * @param collect the collection to aggregate
  * @param aggregator the aggregator applied to the grouped values
  * @return the values produced by the aggregator
  */
 public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  MapFn<S, Pair<Boolean, S>> keyByFalse = new MapFn<S, Pair<Boolean, S>>() {
   @Override
   public Pair<Boolean, S> map(S value) {
    return Pair.of(false, value);
   }
  };
  PTypeFamily family = collect.getTypeFamily();
  return collect
    .parallelDo("Aggregate.aggregator", keyByFalse,
      family.tableOf(family.booleans(), collect.getPType()))
    .groupByKey(1)
    .combineValues(aggregator)
    .values();
 }
}

/**
 * Runs {@code filterFn} over {@code collection} to produce string-keyed {@code BloomFilter}
 * values, then groups them (group-by argument of 1) and merges the filters under each key
 * with a {@code BloomFilterAggregator}.
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection, BloomFilterFn<T> filterFn) {
 PTypeFamily family = collection.getTypeFamily();
 return collection
   .parallelDo(filterFn, family.tableOf(family.strings(), Writables.writables(BloomFilter.class)))
   .groupByKey(1)
   .combineValues(new BloomFilterAggregator());
}

/**
 * Counts how often each distinct element occurs in {@code collect}.
 *
 * <p>Each element is mapped to {@code (element, 1L)}; the pairs are grouped by element and
 * the ones are summed.
 *
 * @return a {@code PTable} from each unique element to its number of occurrences
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
 MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
  @Override
  public Pair<S, Long> map(S element) {
   return Pair.of(element, 1L);
  }
 };
 PTypeFamily family = collect.getTypeFamily();
 PTable<S, Long> ones = collect.parallelDo(
   "Aggregate.count", pairWithOne, family.tableOf(collect.getPType(), family.longs()));
 return ones.groupByKey().combineValues(Aggregators.SUM_LONGS());
}

/**
 * Tallies the occurrences of every distinct element of {@code collect}: elements are paired
 * with {@code 1L}, grouped by element, and the grouped values are combined with
 * {@code CombineFn.SUM_LONGS()}.
 *
 * @return a {@code PTable} mapping each unique element to its occurrence count
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
 PTypeFamily family = collect.getTypeFamily();
 MapFn<S, Pair<S, Long>> emitOne = new MapFn<S, Pair<S, Long>>() {
  @Override
  public Pair<S, Long> map(S item) {
   return Pair.of(item, 1L);
  }
 };
 PTable<S, Long> perElementOnes =
   collect.parallelDo("Aggregate.count", emitOne, family.tableOf(collect.getPType(), family.longs()));
 return perElementOnes.groupByKey().combineValues(CombineFn.<S> SUM_LONGS());
}

/**
 * Counts how often each distinct element occurs in {@code collect}, passing
 * {@code numPartitions} to the group-by step.
 *
 * @param collect the collection whose elements are counted
 * @param numPartitions the value handed to {@code groupByKey}
 * @return a {@code PTable} from each unique element to its number of occurrences
 */
public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
 MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
  @Override
  public Pair<S, Long> map(S element) {
   return Pair.of(element, 1L);
  }
 };
 PTypeFamily family = collect.getTypeFamily();
 PTable<S, Long> ones = collect.parallelDo(
   "Aggregate.count", pairWithOne, family.tableOf(collect.getPType(), family.longs()));
 return ones
   .groupByKey(numPartitions)
   .combineValues(Aggregators.SUM_LONGS());
}

/**
 * Weighted reservoir sampling with the seed exposed so tests can make it repeatable.
 *
 * <p>All weighted observations are placed in a single group (key 0), sampled through
 * {@code groupedWeightedReservoirSample} with a one-element sample-size array, and the
 * group key is stripped from the result.
 *
 * @param input the weighted observations
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A random sample of the given size that respects the weighting values
 */
public static <T, N extends Number> PCollection<T> weightedReservoirSample(
  PCollection<Pair<T, N>> input,
  int sampleSize,
  Long seed) {
 PTypeFamily family = input.getTypeFamily();

 MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>> toSingleGroup =
   new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
    @Override
    public Pair<Integer, Pair<T, N>> map(Pair<T, N> weighted) {
     return Pair.of(0, weighted);
    }
   };
 PTable<Integer, Pair<T, N>> singleGroup =
   input.parallelDo(toSingleGroup, family.tableOf(family.ints(), input.getPType()));

 MapFn<Pair<Integer, T>, T> dropGroupKey = new MapFn<Pair<Integer, T>, T>() {
  @Override
  public T map(Pair<Integer, T> sampled) {
   return sampled.second();
  }
 };
 // The first sub-type of the pair type is the element type; the cast is unchecked.
 PType<T> elementType = (PType<T>) input.getPType().getSubTypes().get(0);

 return groupedWeightedReservoirSample(singleGroup, new int[] { sampleSize }, seed)
   .parallelDo("Extract sampled value from pair", dropGroupKey, elementType);
}

/**
 * Sorts {@code collection} by the natural ordering of its elements, in the requested
 * {@code order}, using {@code numReducers} reducers.
 *
 * <p>Each element becomes the key of a pair with a null value; the pairs are grouped with
 * options from {@code buildGroupingOptions} and the keys are read back out.
 *
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
 PTypeFamily family = collection.getTypeFamily();
 PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
 Configuration conf = collection.getPipeline().getConfiguration();
 DoFn<T, Pair<T, Void>> toKeyOnlyPair = new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Void>> out) {
   out.emit(Pair.of(element, (Void) null));
  }
 };
 PTable<T, Void> keyed = collection.parallelDo("sort-pre", toKeyOnlyPair, keyOnlyType);
 GroupingOptions sortOptions = buildGroupingOptions(keyed, conf, numReducers, order);
 return keyed
   .groupByKey(sortOptions)
   .ungroup()
   .keys();
}

/**
 * Sorts a {@link PCollection} of {@link TupleN}s according to the given column ordering.
 *
 * <p>Each tuple becomes the key of a pair with a null value; the pairs are grouped with
 * options from {@code buildGroupingOptions} and the tuples are then extracted again.
 *
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
 PTypeFamily family = collection.getTypeFamily();
 PType<TupleN> tupleType = collection.getPType();
 // Rebuild the tuple type from its column types within the collection's type family.
 PType[] columnTypes = tupleType.getSubTypes().toArray(new PType[0]);
 PTableType<TupleN, Void> keyOnlyType = family.tableOf(family.tuples(columnTypes), family.nulls());

 DoFn<TupleN, Pair<TupleN, Void>> toKeyOnlyPair = new DoFn<TupleN, Pair<TupleN, Void>>() {
  @Override
  public void process(TupleN tuple, Emitter<Pair<TupleN, Void>> out) {
   out.emit(Pair.of(tuple, (Void) null));
  }
 };
 PTable<TupleN, Void> keyed = collection.parallelDo(toKeyOnlyPair, keyOnlyType);

 Configuration conf = collection.getPipeline().getConfiguration();
 GroupingOptions sortOptions = buildGroupingOptions(conf, family, tupleType, columnOrders);
 PTable<TupleN, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

 DoFn<Pair<TupleN, Void>, TupleN> toTuple = new DoFn<Pair<TupleN, Void>, TupleN>() {
  @Override
  public void process(Pair<TupleN, Void> entry, Emitter<TupleN> out) {
   out.emit(entry.first());
  }
 };
 return sorted.parallelDo(toTuple, collection.getPType());
}

/**
 * Returns the number of elements in the provided PCollection.
 *
 * <p>Every element contributes {@code 1L} under a single key; the contributions are summed
 * using one reducer and the first (only) value is exposed as a {@code PObject}. An empty
 * collection yields {@code 0L}: a zero is emitted during cleanup and {@code 0L} is also
 * passed to {@code FirstElementPObject} as the default value.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 PTable<Integer, Long> countTable = collect
   .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
     return Pair.of(1, 1L);
    }

    // Contribute a zero so the count key is emitted even when no element was mapped.
    @Override
    public void cleanup(Emitter<Pair<Integer, Long>> emitter) {
     emitter.emit(Pair.of(1, 0L));
    }
   }, tf.tableOf(tf.ints(), tf.longs()))
   .groupByKey(GroupingOptions.builder().numReducers(1).build())
   .combineValues(Aggregators.SUM_LONGS());
 PCollection<Long> count = countTable.values();
 // Fall back to 0 when the count collection has no first element.
 return new FirstElementPObject<Long>(count, 0L);
}

/**
 * Computes the size of the provided PCollection.
 *
 * <p>Each element is mapped to {@code (1, 1L)} and a {@code (1, 0L)} pair is emitted during
 * cleanup; the values are summed using a single reducer. The resulting {@code PObject} is
 * built with {@code 0L} as its default value.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
 MapFn<S, Pair<Integer, Long>> oneForEachElement = new MapFn<S, Pair<Integer, Long>>() {
  @Override
  public Pair<Integer, Long> map(S element) {
   return Pair.of(1, 1L);
  }

  @Override
  public void cleanup(Emitter<Pair<Integer, Long>> out) {
   out.emit(Pair.of(1, 0L));
  }
 };
 PTypeFamily family = collect.getTypeFamily();
 GroupingOptions singleReducer = GroupingOptions.builder().numReducers(1).build();
 PTable<Integer, Long> totals = collect
   .parallelDo("Aggregate.count", oneForEachElement, family.tableOf(family.ints(), family.longs()))
   .groupByKey(singleReducer)
   .combineValues(Aggregators.SUM_LONGS());
 return new FirstElementPObject<Long>(totals.values(), 0L);
}

/**
 * Sorts the {@link PCollection} by the natural ordering of its elements, in the given
 * {@code order}.
 *
 * <p>Elements are turned into key-only pairs ("sort-pre"), grouped with options from
 * {@code buildGroupingOptions}, ungrouped, and unwrapped again ("sort-post").
 *
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
 PTypeFamily family = collection.getTypeFamily();
 PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
 Configuration conf = collection.getPipeline().getConfiguration();
 GroupingOptions sortOptions = buildGroupingOptions(conf, family, collection.getPType(), order);

 DoFn<T, Pair<T, Void>> toKeyOnlyPair = new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Void>> out) {
   out.emit(Pair.of(element, (Void) null));
  }
 };
 DoFn<Pair<T, Void>, T> toKey = new DoFn<Pair<T, Void>, T>() {
  @Override
  public void process(Pair<T, Void> entry, Emitter<T> out) {
   out.emit(entry.first());
  }
 };

 return collection
   .parallelDo("sort-pre", toKeyOnlyPair, keyOnlyType)
   .groupByKey(sortOptions)
   .ungroup()
   .parallelDo("sort-post", toKey, collection.getPType());
}

/**
 * Orders the elements of the {@link PCollection} by their natural ordering, ascending or
 * descending as selected by {@code order}.
 *
 * <p>Implemented in three steps: wrap each element as the key of a pair with a null value,
 * group and ungroup the pairs using options from {@code buildGroupingOptions}, then
 * unwrap the keys.
 *
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
 PTypeFamily ptf = collection.getTypeFamily();
 PTableType<T, Void> wrappedType = ptf.tableOf(collection.getPType(), ptf.nulls());
 Configuration configuration = collection.getPipeline().getConfiguration();
 GroupingOptions grouping = buildGroupingOptions(configuration, ptf, collection.getPType(), order);

 // Step 1: element -> (element, null)
 PTable<T, Void> wrapped = collection.parallelDo("sort-pre", new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T item, Emitter<Pair<T, Void>> sink) {
   Pair<T, Void> keyOnly = Pair.of(item, (Void) null);
   sink.emit(keyOnly);
  }
 }, wrappedType);

 // Step 2: group by key with the sort options, then flatten again
 PTable<T, Void> ordered = wrapped.groupByKey(grouping).ungroup();

 // Step 3: (element, null) -> element
 return ordered.parallelDo("sort-post", new DoFn<Pair<T, Void>, T>() {
  @Override
  public void process(Pair<T, Void> pair, Emitter<T> sink) {
   T key = pair.first();
   sink.emit(key);
  }
 }, collection.getPType());
}

Javadoc

Returns the PTypeFamily of this PCollection.

Popular methods of PCollection

parallelDo
Applies the given doFn to the elements of this PCollection and returns a new PCollection that is the
getPType
Returns the PType of this PCollection.
by
Apply the given map function to each element of this instance in order to create a PTable.
write
Write the contents of this PCollection to the given Target, using the given Target.WriteMode to hand
materialize
Returns a reference to the data set represented by this PCollection that may be used by the client t
getPipeline
Returns the Pipeline associated with this PCollection.
count
Returns a PTable instance that contains the counts of each unique element of this PCollection.
aggregate
Returns a PCollection that contains the result of aggregating all values in this instance.
asReadable
cache
Marks this data as cached using the given CachingOptions. Cached PCollections will only be processed
filter
Apply the given filter function to this instance and return the resulting PCollection.
first

Popular in Java

Start an intent from android
compareTo (BigDecimal)
getSharedPreferences (Context)
getContentResolver (Context)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
SortedMap (java.util)
A map that has its keys ordered. The sorting is according to either the natural ordering of its keys
SAXParseException (org.xml.sax)
Encapsulate an XML parse error or warning. This module, both source code and documentation, is in t
Scheduler (org.quartz)
This is the main interface of a Quartz Scheduler. A Scheduler maintains a registry of org.quartz.Job
Top PhpStorm plugins

How to use the getTypeFamily method in org.apache.crunch.PCollection

Best Java code snippets using org.apache.crunch.PCollection.getTypeFamily (Showing top 20 results out of 315)

How to use
getTypeFamily
method
in
org.apache.crunch.PCollection