Tabnine Logo
PCollection
Code Index · Add Tabnine to your IDE (free)

How to use
PCollection
in
org.apache.crunch

Best Java code snippets using org.apache.crunch.PCollection (Showing top 20 results out of 315)

origin: org.apache.crunch/crunch

/**
 * Orders the elements of a {@link PCollection} by their natural ordering, in
 * the direction given by {@code order}.
 * <p>
 * Every element becomes the key of a {@link PTable} whose values are all
 * {@code null}. That table is grouped by key with the options produced by
 * {@code buildGroupingOptions}, ungrouped again, and finally reduced back to
 * its keys.
 *
 * @param collection the collection to sort
 * @param order the ordering handed to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
 PTypeFamily family = collection.getTypeFamily();
 PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());
 GroupingOptions groupingOptions = buildGroupingOptions(
   collection.getPipeline().getConfiguration(), family, collection.getPType(), order);

 // Wrap each element as a key with a null value so it can be grouped.
 DoFn<T, Pair<T, Void>> wrapAsKey = new DoFn<T, Pair<T, Void>>() {
  @Override
  public void process(T input, Emitter<Pair<T, Void>> emitter) {
   emitter.emit(Pair.of(input, (Void) null));
  }
 };
 PTable<T, Void> keyed = collection.parallelDo("sort-pre", wrapAsKey, keyOnlyType);

 PTable<T, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

 // Drop the null value again and emit only the key.
 DoFn<Pair<T, Void>, T> unwrapKey = new DoFn<Pair<T, Void>, T>() {
  @Override
  public void process(Pair<T, Void> input, Emitter<T> emitter) {
   emitter.emit(input.first());
  }
 };
 return ordered.parallelDo("sort-post", unwrapKey, collection.getPType());
}
origin: spotify/crunch-lib

/**
 * Keys a PCollection of Avro records by the value found at a named field. This is less safe than writing a custom
 * MapFn, but it can cut down code volume considerably when many disparate collections have to be joined or
 * processed by key.
 * @param collection PCollection of Avro records to process
 * @param fieldPath The Avro schema field name of the field to key on. Use . separated names for nested records
 * @param fieldType PType of the field you wish to extract from the Avro record.
 * @param <T> record type
 * @param <F> type of the extracted key field
 * @return supplied collection keyed by the field named by fieldPath
 */
public static <T extends SpecificRecord, F> PTable<F, T> keyByAvroField(PCollection<T> collection, String fieldPath, PType<F> fieldType) {
 AvroExtractMapFn<T, F> keyExtractor =
   new AvroExtractMapFn<T, F>(collection.getPType().getTypeClass(), fieldPath);
 return collection.by(keyExtractor, fieldType);
}
origin: org.apache.crunch/crunch-core

/**
 * Converts the collection with a {@code StringifyFn} in a stage named "asText" and writes
 * the resulting strings to a text-file target at {@code pathName}.
 */
@Override
public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
 PCollection<String> asText = pcollection.parallelDo("asText", new StringifyFn<T>(), Writables.strings());
 asText.write(To.textFile(pathName));
}
origin: cloudera/crunch

 /**
  * Runs a {@code SamplerFn} built from {@code seed} and {@code probability} over {@code input}.
  * The stage is named {@code sample(p)}, where {@code p} is the probability formatted with two
  * decimal places; the output keeps the input's PType.
  */
 public static <S> PCollection<S> sample(PCollection<S> input, long seed, double probability) {
  String label = String.format("sample(%.2f)", probability);
  SamplerFn<S> samplerFn = new SamplerFn<S>(seed, probability);
  return input.parallelDo(label, samplerFn, input.getPType());
 }
}
origin: cloudera/crunch

/**
 * Turns a collection into a table keyed by its elements, pairing every element with
 * {@code Boolean.TRUE}.
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 DoFn<T, Pair<T, Boolean>> tagWithTrue = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
   emitter.emit(Pair.of(input, Boolean.TRUE));
  }
 };
 PTypeFamily family = coll.getTypeFamily();
 return coll.parallelDo(tagWithTrue, family.tableOf(coll.getPType(), family.booleans()));
}
origin: cloudera/crunch

/**
 * Builds a new {@code MemCollection} holding the materialized elements of every given
 * collection, in argument order, followed by the contents of {@code collect}.
 * <p>
 * The result uses the PType of {@code collections[0]}, which is read unconditionally, so
 * calling this with no arguments fails on that array access.
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
 Collection<S> merged = Lists.newArrayList();
 for (int i = 0; i < collections.length; i++) {
  for (S element : collections[i].materialize()) {
   merged.add(element);
  }
 }
 merged.addAll(collect);
 return new MemCollection<S>(merged, collections[0].getPType());
}
origin: org.apache.crunch/crunch-hbase

Configuration conf = cells.getPipeline().getConfiguration();
PTable<C, Void> t = cells.parallelDo(
  "Pre-partition",
  new MapFn<C, Pair<C, Void>>() {
    return Pair.of(input, (Void) null);
  }, tableOf(cells.getPType(), nulls()));
 splitPoints = getSplitPoints(table);
Path partitionFile = new Path(((DistributedPipeline) cells.getPipeline()).createTempPath(), "partition");
writePartitionInfo(conf, partitionFile, splitPoints);
GroupingOptions options = GroupingOptions.builder()
origin: apache/crunch

/**
 * Applies {@code filterFn} to the collection to get a table of String keys and
 * {@code BloomFilter} values, groups it by key with a partition count of 1, and combines
 * the values of each key with a {@code BloomFilterAggregator}.
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection, BloomFilterFn<T> filterFn) {
 PTypeFamily family = collection.getTypeFamily();
 return collection
   .parallelDo(filterFn, family.tableOf(family.strings(), Writables.writables(BloomFilter.class)))
   .groupByKey(1)
   .combineValues(new BloomFilterAggregator());
}
origin: org.apache.crunch/crunch-core

/**
 * Separates a {@link PCollection} of {@link Pair}s into a {@link Pair} of
 * {@link PCollection}s, so that the two outputs of a DoFn can be handled on
 * separate channels.
 * 
 * @param pCollection The {@code PCollection} to split
 * @param firstPType The {@code PType} for the first collection
 * @param secondPType The {@code PType} for the second collection
 * @return {@link Pair} of {@link PCollection}
 */
public static <T, U> Pair<PCollection<T>, PCollection<U>> split(PCollection<Pair<T, U>> pCollection,
  PType<T> firstPType, PType<U> secondPType) {
 // Arguments are evaluated left to right, so the "first" stage is still created before the "second".
 return Pair.of(
   pCollection.parallelDo("Extract first value", new FirstEmittingDoFn<T, U>(), firstPType),
   pCollection.parallelDo("Extract second value", new SecondEmittingDoFn<T, U>(), secondPType));
}
origin: org.apache.crunch/crunch-core

/**
 * Sorts a {@code PCollection} of {@link Tuple}s by the given column ordering,
 * using a caller-chosen number of reducers.
 * <p>
 * The collection is keyed with the function and key type supplied by
 * {@code SortFns.KeyExtraction}, grouped with the options from
 * {@code buildGroupingOptions}, ungrouped, and reduced back to its values.
 * 
 * @param collection the tuples to sort
 * @param numReducers the reducer count passed to {@code buildGroupingOptions}
 * @param columnOrders the column ordering to sort by
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
  ColumnOrder... columnOrders) {
 SortFns.KeyExtraction<T> extraction =
   new SortFns.KeyExtraction<T>(collection.getPType(), columnOrders);
 PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());
 GroupingOptions groupingOptions = buildGroupingOptions(
   keyed, collection.getPipeline().getConfiguration(), numReducers, columnOrders);
 return keyed.groupByKey(groupingOptions).ungroup().values();
}
origin: org.apache.crunch/crunch-core

/**
 * Delegates to {@code materialize()} on the given collection and returns its result.
 */
@Override
public <T> Iterable<T> materialize(PCollection<T> pcollection) {
 Iterable<T> contents = pcollection.materialize();
 return contents;
}
origin: apache/crunch

PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
 public void process(String line, Emitter<String> emitter) {
  for (String word : line.split("\\s+")) {
PTable<String, Long> counts = words.count();
origin: org.apache.crunch/crunch

PTypeFamily typeFamily = coll1.getTypeFamily();
PType<T> type = coll1.getPType();
return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(
  new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {
origin: cloudera/seismichadoop

result.write(To.sequenceFile(cmdLine.getOptionValue("output")));
Iterator<ByteBuffer> iter = result.materialize().iterator();
x.start();
while (iter.hasNext()) {
origin: apache/crunch

  table.getName().getNameAsString(),
  regionLocator.getAllRegionLocations());
Path regionLocationFilePath = new Path(((DistributedPipeline) cells.getPipeline()).createTempPath(),
  "regionLocations" + table.getName().getNameAsString());
 writeRegionLocationTable(cells.getPipeline().getConfiguration(), regionLocationFilePath, regionLocationTable);
 hfileTarget.outputConf(RegionLocationTable.REGION_LOCATION_TABLE_PATH, regionLocationFilePath.toString());
 partitioned
   .filter(new FilterByFamilyFn<C>(family))
   .write(hfileTarget);
origin: org.apache.crunch/crunch-core

/**
 * Returns a {@code PCollection<T>} with the same contents as {@code pc} that will be written
 * to a fixed number of output files. This helps map-only jobs that read many input files
 * while each task writes only a small amount of data.
 * <p>
 * Elements are keyed by an integer via {@code ShardFn}, grouped into {@code numPartitions}
 * partitions, ungrouped, and stripped of their keys again.
 * 
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
 ShardFn<T> shardKeyFn = new ShardFn<T>();
 return pc.by(shardKeyFn, pc.getTypeFamily().ints()).groupByKey(numPartitions).ungroup().values();
}

origin: apache/crunch

/**
 * Returns the {@link PType} describing how elements of this collection may be serialized,
 * as reported by the underlying collection.
 */
default PType<S> pType() {
 return underlying().getPType();
}
origin: cloudera/search

/**
 * Randomizes the order of the items in the collection via a MapReduce job: every item is
 * keyed with a long by {@code RandomizeFn}, the table is sorted ascending, and the keys are
 * dropped again.
 */
private static <T> PCollection<T> randomize(PCollection<T> items) {
 PTable<Long, T> keyed = items.by("randomize", new RandomizeFn<T>(), Writables.longs());
 PTable<Long, T> shuffled = Sort.sort(keyed, Sort.Order.ASCENDING);
 return shuffled.values();
}
origin: org.apache.crunch/crunch-hbase

 cells = cells.filter(new StartRowFilterFn<C>(scan.getStartRow()));
 cells = cells.filter(new StopRowFilterFn<C>(scan.getStopRow()));
 cells = cells.filter(new FamilyMapFilterFn<C>(scan.getFamilyMap()));
 cells = cells.filter(new TimeRangeFilterFn<C>(timeRange));
PTable<ByteBuffer, C> cellsByRow = cells.by(new ExtractRowFn<C>(), bytes());
final int versions = scan.getMaxVersions();
return cellsByRow.groupByKey().parallelDo("CombineKeyValueIntoRow",
origin: org.apache.crunch/crunch-hbase

/**
 * Writes HFiles for the given <code>cells</code> and <code>table</code>, one output directory per column family
 * under <code>outputPath</code>. When <code>limitToAffectedRegions</code> is set, the regions the <code>cells</code>
 * will be loaded into are identified before the HFiles are written, which reduces the number of reducers needed
 * for the write. That pays off when the data only touches a small enough subset of the table's regions. When it is
 * false, the number of reducers equals the number of regions in the table.
 * <p>
 * If the table has no column families, a warning is logged and nothing is written.
 *
 * @see <a href='https://issues.apache.org/jira/browse/CRUNCH-588'>CRUNCH-588</a>
 */
public static <C extends Cell> void writeToHFilesForIncrementalLoad(
  PCollection<C> cells,
  HTable table,
  Path outputPath,
  boolean limitToAffectedRegions) throws IOException {
 HColumnDescriptor[] columnFamilies = table.getTableDescriptor().getColumnFamilies();
 if (columnFamilies.length == 0) {
  LOG.warn("{} has no column families", table);
  return;
 }
 PCollection<C> sortedCells = sortAndPartition(cells, table, limitToAffectedRegions);
 for (HColumnDescriptor descriptor : columnFamilies) {
  byte[] familyName = descriptor.getName();
  // Keep only this family's cells, then write them to a directory named after the family.
  PCollection<C> familyCells = sortedCells.filter(new FilterByFamilyFn<C>(familyName));
  Path familyPath = new Path(outputPath, Bytes.toString(familyName));
  familyCells.write(new HFileTarget(familyPath, descriptor));
 }
}
org.apache.crunch.PCollection

Javadoc

A representation of an immutable, distributed collection of elements that is the fundamental target of computations in Crunch.

Most used methods

  • parallelDo
    Applies the given doFn to the elements of this PCollection and returns a new PCollection that is the output of this processing.
  • getPType
    Returns the PType of this PCollection.
  • by
    Apply the given map function to each element of this instance in order to create a PTable.
  • write
    Write the contents of this PCollection to the given Target, using the given Target.WriteMode to handle existing targets.
  • materialize
    Returns a reference to the data set represented by this PCollection that may be used by the client to read the data locally.
  • getPipeline
    Returns the Pipeline associated with this PCollection.
  • getTypeFamily
    Returns the PTypeFamily of this PCollection.
  • count
    Returns a PTable instance that contains the counts of each unique element of this PCollection.
  • aggregate
    Returns a PCollection that contains the result of aggregating all values in this instance.
  • asReadable
  • cache
    Marks this data as cached using the given CachingOptions. Cached PCollections will only be processed
  • filter
    Apply the given filter function to this instance and return the resulting PCollection.
  • cache,
  • filter,
  • first,
  • getName,
  • getSize,
  • union

Popular in Java

  • Reading from database using SQL prepared statement
  • startActivity (Activity)
  • getSupportFragmentManager (FragmentActivity)
  • setContentView (Activity)
  • FlowLayout (java.awt)
    A flow layout arranges components in a left-to-right flow, much like lines of text in a paragraph. F
  • GridBagLayout (java.awt)
    The GridBagLayout class is a flexible layout manager that aligns components vertically and horizonta
  • SocketException (java.net)
    This SocketException may be thrown during socket creation or setting options, and is the superclass
  • SocketTimeoutException (java.net)
    This exception is thrown when a timeout expired on a socket read or accept operation.
  • MessageFormat (java.text)
    Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
  • Loader (org.hibernate.loader)
    Abstract superclass of object loading (and querying) strategies. This class implements useful common
  • CodeWhisperer alternatives
Tabnine Logo
  • Products

    Search for Java code · Search for JavaScript code
  • IDE Plugins

    IntelliJ IDEAWebStormVisual StudioAndroid StudioEclipseVisual Studio CodePyCharmSublime TextPhpStormVimGoLandRubyMineEmacsJupyter NotebookJupyter LabRiderDataGripAppCode
  • Company

    About Us · Contact Us · Careers
  • Resources

    FAQBlogTabnine AcademyTerms of usePrivacy policyJava Code IndexJavascript Code Index
Get Tabnine for your IDE now