Tabnine Logo
PCollection.parallelDo
Code Index | Add Tabnine to your IDE (free)

How to use
parallelDo
method
in
org.apache.crunch.PCollection

Best Java code snippets using org.apache.crunch.PCollection.parallelDo (Showing top 20 results out of 315)

origin: cloudera/crunch

 /**
  * Runs a {@code SamplerFn} built from {@code seed} and {@code probability} over
  * {@code input}.
  *
  * @param input the collection the sampler function is applied to
  * @param seed first constructor argument of the {@code SamplerFn}
  * @param probability second constructor argument of the {@code SamplerFn}; it is also
  *        embedded, formatted with two decimals, in the stage name
  * @return the result of {@code parallelDo}, typed with the input's own {@code PType}
  */
 public static <S> PCollection<S> sample(PCollection<S> input, long seed, double probability) {
  // The stage label has the form "sample(<probability with two decimals>)".
  final String label = String.format("sample(%.2f)", probability);
  final SamplerFn<S> sampler = new SamplerFn<S>(seed, probability);
  return input.parallelDo(label, sampler, input.getPType());
 }
}
origin: org.apache.crunch/crunch-core

/**
 * Applies a {@code SampleFn} configured with the given probability and seed to
 * {@code input}. Passing an explicit seed is useful for unit testing.
 *
 * @param input The {@code PCollection} to sample from
 * @param seed The seed handed to the {@code SampleFn}
 * @param probability The probability (0.0 &lt; p &lt; 1.0); also shown, with two decimals,
 *        in the stage name
 * @return The {@code PCollection} produced by {@code parallelDo}, typed with the input's
 *         {@code PType}
 */
public static <S> PCollection<S> sample(PCollection<S> input, Long seed, double probability) {
 // The stage label has the form "sample(<probability with two decimals>)".
 final String label = String.format("sample(%.2f)", probability);
 final SampleFn<S> sampler = new SampleFn<S>(probability, seed);
 return input.parallelDo(label, sampler, input.getPType());
}

origin: org.apache.crunch/crunch

/**
 * Combines this collection with the supplied ones into a {@code UnionCollection}.
 * This instance is added as-is; every argument is first routed through an identity
 * {@code parallelDo} and then cast to {@code PCollectionImpl}.
 *
 * @param collections the collections to union with this one
 * @return a new {@code UnionCollection} over this instance and the processed arguments
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
 List<PCollectionImpl<S>> members = Lists.newArrayList();
 members.add(this);
 for (PCollection<S> other : collections) {
  PCollection<S> passthrough = other.parallelDo(IdentityFn.<S>getInstance(), other.getPType());
  members.add((PCollectionImpl<S>) passthrough);
 }
 return new UnionCollection<S>(members);
}
origin: org.apache.crunch/crunch-core

/**
 * Combines this collection with the supplied ones by delegating to the pipeline
 * factory's {@code createUnionCollection}. This instance is added as-is; every argument
 * is first routed through an identity {@code parallelDo} and then cast to
 * {@code PCollectionImpl}.
 *
 * @param collections the collections to union with this one
 * @return the union collection created by the pipeline's factory
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
 List<PCollectionImpl<S>> members = Lists.newArrayList();
 members.add(this);
 for (PCollection<S> other : collections) {
  PCollection<S> passthrough = other.parallelDo(IdentityFn.<S>getInstance(), other.getPType());
  members.add((PCollectionImpl<S>) passthrough);
 }
 return pipeline.getFactory().createUnionCollection(members);
}
origin: apache/crunch

/**
 * Transform this LCollection to an {@link LTable} using a standard Crunch {@link DoFn}.
 * The function is applied to the underlying collection and the resulting table is
 * wrapped by this collection's factory.
 *
 * @param fn the function to run; it emits {@code Pair<K, V>} elements
 * @param pType the table type passed along to the underlying {@code parallelDo}
 * @return the wrapped result as an {@link LTable}
 */
default <K, V> LTable<K, V> parallelDo(DoFn<S, Pair<K, V>> fn, PTableType<K, V> pType) {
  return factory().wrap(underlying().parallelDo(fn, pType));
}
origin: apache/crunch

/**
 * Transform this LCollection using a standard Crunch {@link DoFn}.
 * The function is applied to the underlying collection and the result is wrapped by
 * this collection's factory.
 *
 * @param fn the function to run; it emits elements of type {@code T}
 * @param pType the element type passed along to the underlying {@code parallelDo}
 * @return the wrapped result as an {@code LCollection<T>}
 */
default <T> LCollection<T> parallelDo(DoFn<S, T> fn, PType<T> pType) {
  return factory().wrap(underlying().parallelDo(fn, pType));
}
origin: org.apache.crunch/crunch-core

/**
 * Re-types a {@code PCollection<Pair<K, V>>} as a {@code PTable<K, V>} by running an
 * identity function over it with a table type derived from the pair type.
 *
 * @param pcollect The {@code PCollection} of pairs to convert
 * @return A {@code PTable} produced by an identity {@code parallelDo} over {@code pcollect}
 */
public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
 PType<Pair<K, V>> pairType = pcollect.getPType();
 // Sub-types 0 and 1 of the pair type are handed to tableOf, in that order.
 PTableType<K, V> tableType = pairType.getFamily().tableOf(
   pairType.getSubTypes().get(0), pairType.getSubTypes().get(1));
 DoFn<Pair<K, V>, Pair<K, V>> identity = IdentityFn.getInstance();
 return pcollect.parallelDo("asPTable", identity, tableType);
}
origin: org.apache.crunch/crunch

/**
 * Re-types a {@code PCollection<Pair<K, V>>} as a {@code PTable<K, V>} by running an
 * identity function over it with a table type derived from the pair type.
 *
 * @param pcollect The {@code PCollection} of pairs to convert
 * @return A {@code PTable} produced by an identity {@code parallelDo} over {@code pcollect}
 */
public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
 PType<Pair<K, V>> pairType = pcollect.getPType();
 // Sub-types 0 and 1 of the pair type are handed to tableOf, in that order.
 PTableType<K, V> tableType = pairType.getFamily().tableOf(
   pairType.getSubTypes().get(0), pairType.getSubTypes().get(1));
 DoFn<Pair<K, V>, Pair<K, V>> identity = IdentityFn.getInstance();
 return pcollect.parallelDo("asPTable", identity, tableType);
}

origin: cloudera/crunch

/**
 * Registers {@code pcollection} as an output for {@code target}, first normalising two
 * special cases: a {@code PGroupedTableImpl} is ungrouped, and a {@code UnionCollection}
 * or {@code UnionTable} is passed through an identity {@code parallelDo} stage named
 * "UnionCollectionWrapper". Any other collection is used unchanged.
 *
 * @param pcollection the collection to write
 * @param target the target handed to {@code addOutput}
 */
@SuppressWarnings("unchecked")
public void write(PCollection<?> pcollection, Target target) {
 PCollection<?> writable = pcollection;
 if (pcollection instanceof PGroupedTableImpl) {
  writable = ((PGroupedTableImpl<?, ?>) pcollection).ungroup();
 } else if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) {
  writable = pcollection.parallelDo("UnionCollectionWrapper",
    (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType());
 }
 addOutput((PCollectionImpl<?>) writable, target);
}
origin: cloudera/crunch

/**
 * Turns a collection into a table whose keys are the original elements and whose
 * values are all {@code Boolean.TRUE}.
 *
 * @param coll the collection to convert
 * @return a table typed from the collection's own {@code PType} and its family's booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 PTypeFamily family = coll.getTypeFamily();
 DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Boolean>> out) {
   out.emit(Pair.of(element, Boolean.TRUE));
  }
 };
 return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
origin: org.apache.crunch/crunch-core

/**
 * Maps {@code pcollection} through a {@code StringifyFn} in a stage named "asText" and
 * writes the result to a text-file target at {@code pathName}.
 *
 * @param pcollection the collection to write
 * @param pathName the path given to {@code To.textFile}
 */
@Override
public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
 PCollection<String> asStrings =
   pcollection.parallelDo("asText", new StringifyFn<T>(), Writables.strings());
 asStrings.write(To.textFile(pathName));
}
origin: org.apache.crunch/crunch-core

/**
 * Turns a collection into a table whose keys are the original elements and whose
 * values are all {@code Boolean.TRUE}.
 *
 * @param coll the collection to convert
 * @return a table typed from the collection's own {@code PType} and its family's booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 PTypeFamily family = coll.getTypeFamily();
 DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Boolean>> out) {
   out.emit(Pair.of(element, Boolean.TRUE));
  }
 };
 return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
origin: org.apache.crunch/crunch

/**
 * Maps {@code pcollection} through a {@code StringifyFn} in a stage named "asText" and
 * writes the result to a text-file target at {@code pathName}.
 *
 * @param pcollection the collection to write
 * @param pathName the path given to {@code To.textFile}
 */
@Override
public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
 PCollection<String> asStrings =
   pcollection.parallelDo("asText", new StringifyFn<T>(), Writables.strings());
 asStrings.write(To.textFile(pathName));
}
origin: org.apache.crunch/crunch

/**
 * Turns a collection into a table whose keys are the original elements and whose
 * values are all {@code Boolean.TRUE}.
 *
 * @param coll the collection to convert
 * @return a table typed from the collection's own {@code PType} and its family's booleans
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
 PTypeFamily family = coll.getTypeFamily();
 DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
  @Override
  public void process(T element, Emitter<Pair<T, Boolean>> out) {
   out.emit(Pair.of(element, Boolean.TRUE));
  }
 };
 return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
}
origin: cloudera/crunch

/**
 * Writes {@code pcollection} to a text file at {@code pathName}. The collection is first
 * passed through an identity stage named "asText" whose output type is the
 * {@code WritableTypeFamily} counterpart of the collection's current {@code PType}.
 *
 * @param pcollection the collection to write
 * @param pathName the path given to {@code At.textFile}
 */
@Override
public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
 // Ensure that what gets written is a writable pcollection instance.
 PCollection<T> writable = pcollection.parallelDo(
   "asText",
   IdentityFn.<T> getInstance(),
   WritableTypeFamily.getInstance().as(pcollection.getPType()));
 write(writable, At.textFile(pathName));
}
origin: org.apache.crunch/crunch-core

/**
 * Creates a {@code PTable} from in-memory pairs. Empty input short-circuits to
 * {@code emptyPTable}; otherwise the table type builds a source for the contents at a
 * temporary path, which is then read back and passed through an identity
 * {@code parallelDo} typed with {@code ptype}.
 *
 * @param contents the pairs that make up the table
 * @param ptype the table type used both to create the source and to type the result
 * @param options supplies the parallelism passed to {@code createSourceTarget}
 * @return the resulting table
 * @throws CrunchRuntimeException if creating the source fails with an {@code IOException}
 */
@Override
public <K, V> PTable<K, V> create(Iterable<Pair<K, V>> contents, PTableType<K, V> ptype, CreateOptions options) {
 if (Iterables.isEmpty(contents)) {
  return emptyPTable(ptype);
 }
 final ReadableSource<Pair<K, V>> source;
 try {
  source = ptype.createSourceTarget(getConfiguration(), createTempPath(), contents, options.getParallelism());
 } catch (IOException ioe) {
  throw new CrunchRuntimeException("Error creating PTable: " + contents, ioe);
 }
 return read(source).parallelDo(IdentityFn.<Pair<K, V>>getInstance(), ptype);
}
origin: org.apache.crunch/crunch-core

 /**
  * Aggregates all values of {@code collect} with the given aggregator. Each element is
  * paired with the constant key {@code false}, the pairs are grouped via
  * {@code groupByKey(1)}, the values are combined with {@code aggregator}, and only
  * the values of the result are returned.
  *
  * @param collect the collection to aggregate
  * @param aggregator the aggregator passed to {@code combineValues}
  * @return the values left after combining
  */
 public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily family = collect.getTypeFamily();
  MapFn<S, Pair<Boolean, S>> keyByConstant = new MapFn<S, Pair<Boolean, S>>() {
   public Pair<Boolean, S> map(S value) {
    return Pair.of(false, value);
   }
  };
  return collect
    .parallelDo("Aggregate.aggregator", keyByConstant,
      family.tableOf(family.booleans(), collect.getPType()))
    .groupByKey(1)
    .combineValues(aggregator)
    .values();
 }
}
origin: apache/crunch

/**
 * Runs {@code filterFn} over {@code collection} to produce a table of strings to
 * {@code BloomFilter}s, then groups it via {@code groupByKey(1)} and combines the
 * values of each key with a {@code BloomFilterAggregator}.
 *
 * @param collection the input elements
 * @param filterFn the function that emits the (string, filter) pairs
 * @return the combined table
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection, BloomFilterFn<T> filterFn) {
 PTypeFamily family = collection.getTypeFamily();
 PTable<String, BloomFilter> perKeyFilters = collection.parallelDo(
   filterFn,
   family.tableOf(family.strings(), Writables.writables(BloomFilter.class)));
 return perKeyFilters
   .groupByKey(1)
   .combineValues(new BloomFilterAggregator());
}
origin: org.apache.crunch/crunch

/**
 * Returns a {@code PTable} that maps each unique element of the collection to the number
 * of times it occurs. Every element is emitted with the value {@code 1L}, the pairs
 * are grouped by key, and the values are combined with {@code Aggregators.SUM_LONGS()}.
 *
 * @param collect the collection whose elements are counted
 * @return the table of elements to summed counts
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
 PTypeFamily family = collect.getTypeFamily();
 MapFn<S, Pair<S, Long>> pairWithOne = new MapFn<S, Pair<S, Long>>() {
  public Pair<S, Long> map(S element) {
   return Pair.of(element, 1L);
  }
 };
 return collect
   .parallelDo("Aggregate.count", pairWithOne, family.tableOf(collect.getPType(), family.longs()))
   .groupByKey()
   .combineValues(Aggregators.SUM_LONGS());
}
origin: kite-sdk/kite

/**
 * Routes a collection through a group-by-key on its own elements and returns the keys
 * of the ungrouped result. Elements become keys of an Avro table with null values via
 * {@code AsKeyTable}.
 *
 * @param collection the collection to regroup
 * @param numReducers when positive, passed to {@code groupByKey(int)}; otherwise the
 *        no-argument {@code groupByKey()} is used
 * @return the keys of the ungrouped table
 */
private static <E> PCollection<E> partition(PCollection<E> collection,
                      int numReducers) {
 PType<E> elementType = collection.getPType();
 PTableType<E, Void> keyOnlyType = Avros.tableOf(elementType, Avros.nulls());
 PTable<E, Void> keyed = collection.parallelDo(new AsKeyTable<E>(), keyOnlyType);
 PGroupedTable<E, Void> grouped;
 if (numReducers > 0) {
  grouped = keyed.groupByKey(numReducers);
 } else {
  grouped = keyed.groupByKey();
 }
 return grouped.ungroup().keys();
}
org.apache.crunch.PCollection.parallelDo

Javadoc

Similar to the other parallelDo instance, but returns a PTable instance instead of a PCollection.

Popular methods of PCollection

  • getPType
    Returns the PType of this PCollection.
  • by
    Apply the given map function to each element of this instance in order to create a PTable.
  • write
    Write the contents of this PCollection to the given Target, using the given Target.WriteMode to hand
  • materialize
    Returns a reference to the data set represented by this PCollection that may be used by the client t
  • getPipeline
    Returns the Pipeline associated with this PCollection.
  • getTypeFamily
    Returns the PTypeFamily of this PCollection.
  • count
    Returns a PTable instance that contains the counts of each unique element of this PCollection.
  • aggregate
    Returns a PCollection that contains the result of aggregating all values in this instance.
  • asReadable
  • cache
    Marks this data as cached using the given CachingOptions. Cached PCollections will only be processed
  • filter
    Apply the given filter function to this instance and return the resulting PCollection.
  • first
  • filter,
  • first,
  • getName,
  • getSize,
  • union

Popular in Java

  • Running tasks concurrently on multiple threads
  • onRequestPermissionsResult (Fragment)
  • setRequestProperty (URLConnection)
  • compareTo (BigDecimal)
  • FileNotFoundException (java.io)
    Thrown when a file specified by a program cannot be found.
  • DateFormat (java.text)
    Formats or parses dates and times. This class provides factories for obtaining instances configured f
  • NoSuchElementException (java.util)
    Thrown when trying to retrieve an element past the end of an Enumeration or Iterator.
  • Set (java.util)
    A Set is a data structure which does not allow duplicate elements.
  • TreeSet (java.util)
    TreeSet is an implementation of SortedSet. All optional operations (adding and removing) are support
  • Callable (java.util.concurrent)
    A task that returns a result and may throw an exception. Implementors define a single method with no
  • Top PhpStorm plugins
Tabnine Logo
  • Products

    Search for Java code | Search for JavaScript code
  • IDE Plugins

    IntelliJ IDEA, WebStorm, Visual Studio, Android Studio, Eclipse, Visual Studio Code, PyCharm, Sublime Text, PhpStorm, Vim, GoLand, RubyMine, Emacs, Jupyter Notebook, Jupyter Lab, Rider, DataGrip, AppCode
  • Company

    About Us | Contact Us | Careers
  • Resources

    FAQ | Blog | Tabnine Academy | Terms of use | Privacy policy | Java Code Index | Javascript Code Index
Get Tabnine for your IDE now