PCollection.getTypeFamily

How to use the getTypeFamily method in org.apache.crunch.PCollection

Best Java code snippets using org.apache.crunch.PCollection.getTypeFamily (Showing top 20 results out of 315)

origin: apache/crunch

/**
 * Parses the lines of the input {@code PCollection<String>} and returns a {@code PTable<K, V>} using
 * the given {@code Extractor<Pair<K, V>>}.
 * 
 * @param groupName A label to use for tracking errors related to the parsing process
 * @param input The input {@code PCollection<String>} to convert
 * @param extractor The {@code Extractor<Pair<K, V>>} that converts each line
 * @return A {@code PTable<K, V>}
 */
public static <K, V> PTable<K, V> parseTable(String groupName, PCollection<String> input,
  Extractor<Pair<K, V>> extractor) {
 return parseTable(groupName, input, input.getTypeFamily(), extractor);
}

origin: apache/crunch

/**
 * Parses the lines of the input {@code PCollection<String>} and returns a {@code PCollection<T>} using
 * the given {@code Extractor<T>}.
 * 
 * @param groupName A label to use for tracking errors related to the parsing process
 * @param input The input {@code PCollection<String>} to convert
 * @param extractor The {@code Extractor<T>} that converts each line
 * @return A {@code PCollection<T>}
 */
public static <T> PCollection<T> parse(String groupName, PCollection<String> input,
  Extractor<T> extractor) {
 return parse(groupName, input, input.getTypeFamily(), extractor);
}
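
A minimal usage sketch for the parse helpers above, assuming you already have an Extractor for your line format. The keyValueExtractor parameter and the "kv-parse" group name are illustrative; concrete extractors live in the crunch-contrib text package.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.contrib.text.Extractor;
import org.apache.crunch.contrib.text.Parse;

public final class ParseExample {
  // Turn a PCollection of raw text lines into a typed PTable; parse failures
  // are tracked under the "kv-parse" group name.
  public static PTable<String, Long> toTable(PCollection<String> lines,
      Extractor<Pair<String, Long>> keyValueExtractor) {
    return Parse.parseTable("kv-parse", lines, keyValueExtractor);
  }
}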

origin: cloudera/crunch

private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 PTypeFamily typeFamily = coll.getTypeFamily();
 return coll.parallelDo(new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
   emitter.emit(Pair.of(input, Boolean.TRUE));
  }
 }, typeFamily.tableOf(coll.getPType(), typeFamily.booleans()));
}
origin: cloudera/seismichadoop

 public static PGroupedTable<TupleN, ByteBuffer> apply(PCollection<ByteBuffer> traces, List<String> keys) {
  Field[] fields = new Field[keys.size()];
  boolean[] negate = new boolean[keys.size()];
  for (int i = 0; i < keys.size(); i++) {
   String key = keys.get(i);
   if (key.charAt(0) == '-') {
    negate[i] = true;
    key = key.substring(1);
   }
   fields[i] = Fields.getSortField(key);
   if (fields[i] == null) {
    throw new IllegalArgumentException("Unrecognized susort key: " + keys.get(i));
   }
  }
  
  PTypeFamily tf = traces.getTypeFamily();
  PType[] headerTypes = new PType[keys.size()];
  for (int i = 0; i < keys.size(); i++) {
   headerTypes[i] = tf.ints();
  }
  
  GroupingOptions options = GroupingOptions.builder()
    .partitionerClass(JoinUtils.getPartitionerClass(tf)).build();
  return traces.parallelDo("gethw", new HeaderExtractor(fields, negate),
     tf.tableOf(tf.tuples(headerTypes), tf.bytes())).groupByKey(options);
 }
}
origin: org.apache.crunch/crunch-core

/**
 * Creates a {@code PCollection<T>} that has the same contents as its input argument but will
 * be written to a fixed number of output files. This is useful for map-only jobs that process
 * lots of input files but only write out a small amount of output per task.
 * 
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
 return pc.by(new ShardFn<T>(), pc.getTypeFamily().ints())
   .groupByKey(numPartitions)
   .ungroup()
   .values();
}
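
A short usage sketch for the helper above, assuming it is the one exposed as Shard.shard in org.apache.crunch.lib; the partition count is illustrative.

import org.apache.crunch.PCollection;
import org.apache.crunch.lib.Shard;

public final class ShardExample {
  // Rebalance a collection so it is written out as roughly ten files instead
  // of one tiny file per map task.
  public static PCollection<String> rebalance(PCollection<String> lines) {
    return Shard.shard(lines, 10);
  }
}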

origin: org.apache.crunch/crunch-core

/**
 * A version of the reservoir sampling algorithm that uses a given seed, primarily for
 * testing purposes.
 * 
 * @param input The input data
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A {@code PCollection} made up of the sampled elements
 */
public static <T> PCollection<T> reservoirSample(
  PCollection<T> input,
  int sampleSize,
  Long seed) {
 PTypeFamily ptf = input.getTypeFamily();
 PType<Pair<T, Integer>> ptype = ptf.pairs(input.getPType(), ptf.ints());
 return weightedReservoirSample(
   input.parallelDo("Map to pairs for reservoir sampling", new MapFn<T, Pair<T, Integer>>() {
    @Override
    public Pair<T, Integer> map(T t) { return Pair.of(t, 1); }
   }, ptype),
   sampleSize,
   seed);
}
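
The seeded variant above backs the public sampling entry point; a hedged usage sketch, assuming org.apache.crunch.lib.Sample.reservoirSample is the non-seeded wrapper.

import org.apache.crunch.PCollection;
import org.apache.crunch.lib.Sample;

public final class SampleExample {
  // Pick 100 elements uniformly at random, independent of the input size.
  public static PCollection<String> pick100(PCollection<String> lines) {
    return Sample.reservoirSample(lines, 100);
  }
}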

origin: org.apache.crunch/crunch-core

 public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily tf = collect.getTypeFamily();
  return collect.parallelDo("Aggregate.aggregator", new MapFn<S, Pair<Boolean, S>>() {
   public Pair<Boolean, S> map(S input) {
    return Pair.of(false, input);
   }
  }, tf.tableOf(tf.booleans(), collect.getPType()))
  .groupByKey(1)
  .combineValues(aggregator)
  .values();
 }
}
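
The helper above funnels every element to a single reducer and applies the given Aggregator; a short usage sketch, assuming it is the method exposed as Aggregate.aggregate in org.apache.crunch.lib.

import org.apache.crunch.PCollection;
import org.apache.crunch.fn.Aggregators;
import org.apache.crunch.lib.Aggregate;

public final class AggregateExample {
  // Sum a collection of longs down to a single-element PCollection.
  public static PCollection<Long> total(PCollection<Long> values) {
    return Aggregate.aggregate(values, Aggregators.SUM_LONGS());
  }
}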
origin: apache/crunch

private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection, BloomFilterFn<T> filterFn) {
 PTypeFamily tf = collection.getTypeFamily();
 PTable<String, BloomFilter> table = collection.parallelDo(filterFn,
   tf.tableOf(tf.strings(), Writables.writables(BloomFilter.class)));
 return table.groupByKey(1).combineValues(new BloomFilterAggregator());
}
origin: org.apache.crunch/crunch

/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
  public Pair<S, Long> map(S input) {
   return Pair.of(input, 1L);
  }
 }, tf.tableOf(collect.getPType(), tf.longs())).groupByKey()
   .combineValues(Aggregators.SUM_LONGS());
}
origin: cloudera/crunch

/**
 * Returns a {@code PTable} that contains the unique elements of this
 * collection mapped to a count of their occurrences.
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {      
  public Pair<S, Long> map(S input) {
   return Pair.of(input, 1L);
  }
 }, tf.tableOf(collect.getPType(), tf.longs()))
 .groupByKey()
 .combineValues(CombineFn.<S> SUM_LONGS());
}

origin: org.apache.crunch/crunch-core

/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 */
public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
 PTypeFamily tf = collect.getTypeFamily();
 return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
  public Pair<S, Long> map(S input) {
   return Pair.of(input, 1L);
  }
 }, tf.tableOf(collect.getPType(), tf.longs()))
   .groupByKey(numPartitions)
   .combineValues(Aggregators.SUM_LONGS());
}
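
The count variants above are also reachable directly from the collection itself; a brief sketch using the PCollection.count() convenience method listed further down this page.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;

public final class CountExample {
  // Count how many times each distinct word occurs; equivalent to calling
  // the Aggregate.count helper shown above.
  public static PTable<String, Long> wordCounts(PCollection<String> words) {
    return words.count();
  }
}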

origin: org.apache.crunch/crunch-core

/**
 * The weighted reservoir sampling function with the seed term exposed for testing purposes.
 * 
 * @param input the weighted observations
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A random sample of the given size that respects the weighting values
 */
public static <T, N extends Number> PCollection<T> weightedReservoirSample(
  PCollection<Pair<T, N>> input,
  int sampleSize,
  Long seed) {
 PTypeFamily ptf = input.getTypeFamily();
 PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(
   new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
    @Override
    public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
     return Pair.of(0, p);
    }
   }, ptf.tableOf(ptf.ints(), input.getPType()));
 int[] ss = { sampleSize };
 return groupedWeightedReservoirSample(groupedIn, ss, seed)
   .parallelDo("Extract sampled value from pair", new MapFn<Pair<Integer, T>, T>() {
    @Override
    public T map(Pair<Integer, T> p) {
     return p.second();
    }
   }, (PType<T>) input.getPType().getSubTypes().get(0));
}

origin: org.apache.crunch/crunch-core

/**
 * Sorts the {@code PCollection} using the natural ordering of its elements,
 * in the specified order, using the given number of reducers.
 * 
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
 PTypeFamily tf = collection.getTypeFamily();
 PTableType<T, Void> type = tf.tableOf(collection.getPType(), tf.nulls());
 Configuration conf = collection.getPipeline().getConfiguration();
 PTable<T, Void> pt = collection.parallelDo("sort-pre", new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T input, Emitter<Pair<T, Void>> emitter) {
   emitter.emit(Pair.of(input, (Void) null));
  }
 }, type);
 GroupingOptions options = buildGroupingOptions(pt, conf, numReducers, order);
 return pt.groupByKey(options).ungroup().keys();
}
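
A usage sketch for the sort helper above, assuming it is the overload exposed as Sort.sort in org.apache.crunch.lib; the reducer count is illustrative.

import org.apache.crunch.PCollection;
import org.apache.crunch.lib.Sort;

public final class SortExample {
  // Sort lines in descending natural order across five reducers.
  public static PCollection<String> sortDescending(PCollection<String> lines) {
    return Sort.sort(lines, 5, Sort.Order.DESCENDING);
  }
}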
origin: org.apache.crunch/crunch

/**
 * Sorts the {@link PCollection} of {@link TupleN}s using the specified column
 * ordering.
 * 
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
 PTypeFamily tf = collection.getTypeFamily();
 PType<TupleN> pType = collection.getPType();
 PTableType<TupleN, Void> type = tf.tableOf(tf.tuples(pType.getSubTypes().toArray(new PType[0])), tf.nulls());
 PTable<TupleN, Void> pt = collection.parallelDo(new DoFn<TupleN, Pair<TupleN, Void>>() {
  @Override
  public void process(TupleN input, Emitter<Pair<TupleN, Void>> emitter) {
   emitter.emit(Pair.of(input, (Void) null));
  }
 }, type);
 Configuration conf = collection.getPipeline().getConfiguration();
 GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders);
 PTable<TupleN, Void> sortedPt = pt.groupByKey(options).ungroup();
 return sortedPt.parallelDo(new DoFn<Pair<TupleN, Void>, TupleN>() {
  @Override
  public void process(Pair<TupleN, Void> input, Emitter<TupleN> emitter) {
   emitter.emit(input.first());
  }
 }, collection.getPType());
}
origin: org.apache.crunch/crunch

/**
 * Returns the number of elements in the provided PCollection.
 * 
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 PTable<Integer, Long> countTable = collect
   .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
    public Pair<Integer, Long> map(S input) {
     return Pair.of(1, 1L);
    }
   }, tf.tableOf(tf.ints(), tf.longs()))
   .groupByKey(GroupingOptions.builder().numReducers(1).build())
   .combineValues(Aggregators.SUM_LONGS());
 PCollection<Long> count = countTable.values();
 return new FirstElementPObject<Long>(count);
}
origin: org.apache.crunch/crunch-core

/**
 * Returns the number of elements in the provided PCollection.
 * 
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
 PTypeFamily tf = collect.getTypeFamily();
 PTable<Integer, Long> countTable = collect
   .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
    public Pair<Integer, Long> map(S input) {
     return Pair.of(1, 1L);
    }
    public void cleanup(Emitter<Pair<Integer, Long>> e) {
     e.emit(Pair.of(1, 0L));
    }
   }, tf.tableOf(tf.ints(), tf.longs()))
   .groupByKey(GroupingOptions.builder().numReducers(1).build())
   .combineValues(Aggregators.SUM_LONGS());
 PCollection<Long> count = countTable.values();
 return new FirstElementPObject<Long>(count, 0L);
}
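
The length helper above returns a PObject, which defers the count until the value is requested; a short usage sketch, assuming the org.apache.crunch.lib.Aggregate entry point.

import org.apache.crunch.PCollection;
import org.apache.crunch.PObject;
import org.apache.crunch.lib.Aggregate;

public final class LengthExample {
  // Count the elements of a collection; getValue() runs the pipeline if needed.
  public static long countElements(PCollection<String> lines) {
    PObject<Long> length = Aggregate.length(lines);
    return length.getValue();
  }
}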
origin: org.apache.crunch/crunch

/**
 * Sorts the {@link PCollection} using the natural ordering of its elements in
 * the order specified.
 * 
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
 PTypeFamily tf = collection.getTypeFamily();
 PTableType<T, Void> type = tf.tableOf(collection.getPType(), tf.nulls());
 Configuration conf = collection.getPipeline().getConfiguration();
 GroupingOptions options = buildGroupingOptions(conf, tf, collection.getPType(), order);
 PTable<T, Void> pt = collection.parallelDo("sort-pre", new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T input, Emitter<Pair<T, Void>> emitter) {
   emitter.emit(Pair.of(input, (Void) null));
  }
 }, type);
 PTable<T, Void> sortedPt = pt.groupByKey(options).ungroup();
 return sortedPt.parallelDo("sort-post", new DoFn<Pair<T, Void>, T>() {
  @Override
  public void process(Pair<T, Void> input, Emitter<T> emitter) {
   emitter.emit(input.first());
  }
 }, collection.getPType());
}

org.apache.crunch.PCollection.getTypeFamily

Javadoc

Returns the PTypeFamily of this PCollection.
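
Every snippet on this page follows the same pattern: fetch the collection's PTypeFamily and use it to build new PTypes and PTableTypes, so derived collections stay in the same serialization family (Writables or Avros) as their input. A minimal sketch of that pattern; the class and method names are illustrative.

import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.types.PTypeFamily;

public final class TypeFamilyExample {
  // Build a PTable whose key/value PTypes come from the input collection's
  // own type family, so a Writables-typed input yields a Writables-typed
  // output (and likewise for Avros).
  public static PTable<String, Integer> wordLengths(PCollection<String> lines) {
    PTypeFamily tf = lines.getTypeFamily();
    return lines.parallelDo("word-lengths", new MapFn<String, Pair<String, Integer>>() {
      @Override
      public Pair<String, Integer> map(String s) {
        return Pair.of(s, s.length());
      }
    }, tf.tableOf(tf.strings(), tf.ints()));
  }
}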

Popular methods of PCollection

  • parallelDo
    Applies the given doFn to the elements of this PCollection and returns a new PCollection that is the output of that processing.
  • getPType
    Returns the PType of this PCollection.
  • by
    Apply the given map function to each element of this instance in order to create a PTable.
  • write
    Write the contents of this PCollection to the given Target, using the given Target.WriteMode to handle existing targets.
  • materialize
    Returns a reference to the data set represented by this PCollection that may be used by the client to read the data locally.
  • getPipeline
    Returns the Pipeline associated with this PCollection.
  • count
    Returns a PTable instance that contains the counts of each unique element of this PCollection.
  • aggregate
    Returns a PCollection that contains the result of aggregating all values in this instance.
  • asReadable
  • cache
    Marks this data as cached using the given CachingOptions. Cached PCollections will only be processed once.
  • filter
    Apply the given filter function to this instance and return the resulting PCollection.
  • first
  • Other methods: getName, getSize, union
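
The sketch below exercises a few of the methods listed above (filter, count, materialize). The input collection is assumed to come from an existing pipeline, and the class name is illustrative.

import org.apache.crunch.FilterFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;

public final class PopularMethodsExample {
  // Keep only non-empty lines, count the distinct values, and read the result
  // back on the client.
  public static void printCounts(PCollection<String> lines) {
    PCollection<String> nonEmpty = lines.filter(new FilterFn<String>() {
      @Override
      public boolean accept(String s) {
        return !s.isEmpty();
      }
    });
    PTable<String, Long> counts = nonEmpty.count();  // distinct value -> occurrences
    for (Pair<String, Long> entry : counts.materialize()) {
      System.out.println(entry.first() + "\t" + entry.second());
    }
  }
}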
