/**
 * Creates a new in-memory collection holding this collection's elements together
 * with those of every argument collection.
 *
 * @param collections the collections to merge with this one
 * @return a new {@code MemCollection} containing all elements
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  Collection<S> combined = Lists.newArrayList();
  for (PCollection<S> source : collections) {
    for (S element : source.materialize()) {
      combined.add(element);
    }
  }
  // This collection's own elements are appended last, matching the original ordering.
  combined.addAll(collect);
  return new MemCollection<S>(combined, collections[0].getPType());
}
/**
 * Concatenates the elements of every collection in the given list into a single
 * new in-memory collection, typed like the first one.
 *
 * @param collections the collections to merge; must be non-empty
 * @return a {@code MemCollection} with the concatenated elements
 */
@Override
public <S> PCollection<S> union(List<PCollection<S>> collections) {
  List<S> elements = Lists.newArrayList();
  for (PCollection<S> source : collections) {
    for (S item : source.materialize()) {
      elements.add(item);
    }
  }
  return new MemCollection<S>(elements, collections.get(0).getPType());
}
/**
 * Unions the supplied collections with this one, producing a fresh in-memory
 * collection that carries the first argument's {@code PType}.
 *
 * @param collections the collections whose elements are merged in
 * @return a new {@code MemCollection} over the merged elements
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  Collection<S> merged = Lists.newArrayList();
  for (PCollection<S> other : collections) {
    for (S value : other.materialize()) {
      merged.add(value);
    }
  }
  // Append this collection's backing elements after the arguments' elements.
  merged.addAll(collect);
  return new MemCollection<S>(merged, collections[0].getPType());
}
/**
 * Samples the input collection with the given inclusion probability, seeding the
 * underlying random number generator for reproducible runs.
 *
 * @param input the collection to sample from
 * @param seed the random number generator seed
 * @param probability the per-element inclusion probability
 * @return the sampled {@code PCollection}
 */
public static <S> PCollection<S> sample(PCollection<S> input, long seed, double probability) {
  String name = String.format("sample(%.2f)", probability);
  return input.parallelDo(name, new SamplerFn<S>(seed, probability), input.getPType());
}
}
/**
 * Samples records from the given {@code PCollection} using an explicit seed,
 * which makes the output deterministic and therefore suitable for unit tests.
 *
 * @param input the {@code PCollection} to sample from
 * @param seed the seed for the random number generator
 * @param probability the per-element inclusion probability (0.0 &lt; p &lt; 1.0)
 * @return the sampled {@code PCollection}
 */
public static <S> PCollection<S> sample(PCollection<S> input, Long seed, double probability) {
  return input.parallelDo(
      String.format("sample(%.2f)", probability),
      new SampleFn<S>(probability, seed),
      input.getPType());
}
/**
 * Builds a {@code UnionCollection} over this collection plus the given ones.
 * Each argument is passed through an identity {@code parallelDo} so that it is
 * backed by a {@code PCollectionImpl} node in the plan.
 *
 * @param collections the collections to union with this one
 * @return a {@code UnionCollection} spanning all inputs
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  List<PCollectionImpl<S>> parts = Lists.newArrayList();
  parts.add(this);
  for (PCollection<S> source : collections) {
    PCollectionImpl<S> wrapped =
        (PCollectionImpl<S>) source.parallelDo(IdentityFn.<S>getInstance(), source.getPType());
    parts.add(wrapped);
  }
  return new UnionCollection<S>(parts);
}
/**
 * Splits a {@link PCollection} of {@link Pair}s into a {@code Pair} of
 * {@code PCollection}s, so the two halves of a {@code DoFn}'s output can be
 * handled on separate channels.
 *
 * @param pCollection the {@code PCollection} of pairs to split
 * @return a pair of collections, one per pair component
 */
public static <T, U> Pair<PCollection<T>, PCollection<U>> split(PCollection<Pair<T, U>> pCollection) {
  PType<Pair<T, U>> pairType = pCollection.getPType();
  return split(pCollection, pairType.getSubTypes().get(0), pairType.getSubTypes().get(1));
}
/**
 * Returns the {@link PTypeFamily} that governs how elements of the underlying
 * collection are serialized.
 *
 * @return the serialization type family of the wrapped collection
 */
default PTypeFamily ptf() {
  return underlying().getPType().getFamily();
}
/**
 * Unions this collection with the given ones by delegating to the pipeline's
 * factory. Each argument is routed through an identity {@code parallelDo} so it
 * is represented by a {@code PCollectionImpl} node.
 *
 * @param collections the collections to union with this one
 * @return the factory-created union collection
 */
@Override
public PCollection<S> union(PCollection<S>... collections) {
  List<PCollectionImpl<S>> members = Lists.newArrayList();
  members.add(this);
  for (PCollection<S> source : collections) {
    members.add(
        (PCollectionImpl<S>) source.parallelDo(IdentityFn.<S>getInstance(), source.getPType()));
  }
  return pipeline.getFactory().createUnionCollection(members);
}
/**
 * Converts a {@code PCollection<Pair<K, V>>} into a {@code PTable<K, V>} by
 * running its elements through an identity function under a table type.
 *
 * @param pcollect the {@code PCollection} of pairs to convert
 * @return a {@code PTable} with the same data as the input
 */
public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
  PType<Pair<K, V>> pairType = pcollect.getPType();
  PTypeFamily family = pairType.getFamily();
  PTableType<K, V> tableType =
      family.tableOf(pairType.getSubTypes().get(0), pairType.getSubTypes().get(1));
  DoFn<Pair<K, V>, Pair<K, V>> identity = IdentityFn.getInstance();
  return pcollect.parallelDo("asPTable", identity, tableType);
}
/**
 * Re-types a {@code PCollection} of key/value {@code Pair}s as a
 * {@code PTable<K, V>}; the data itself is unchanged.
 *
 * @param pcollect the pair collection to convert
 * @return an equivalent {@code PTable} view of the input data
 */
public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
  PType<Pair<K, V>> elementType = pcollect.getPType();
  PTypeFamily typeFamily = elementType.getFamily();
  // Derive the table type from the pair's key and value sub-types.
  PTableType<K, V> tableType = typeFamily.tableOf(
      elementType.getSubTypes().get(0), elementType.getSubTypes().get(1));
  DoFn<Pair<K, V>, Pair<K, V>> passThrough = IdentityFn.getInstance();
  return pcollect.parallelDo("asPTable", passThrough, tableType);
}
/**
 * Lifts a {@code PCollection} into a {@code PTable} by pairing every element
 * with {@code Boolean.TRUE} as a placeholder value.
 *
 * @param coll the collection to convert
 * @return a table mapping each element to {@code TRUE}
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  PTypeFamily family = coll.getTypeFamily();
  DoFn<T, Pair<T, Boolean>> tagTrue = new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T element, Emitter<Pair<T, Boolean>> emitter) {
      emitter.emit(Pair.of(element, Boolean.TRUE));
    }
  };
  return coll.parallelDo(tagTrue, family.tableOf(coll.getPType(), family.booleans()));
}
/**
 * Registers {@code pcollection} to be written to {@code target}, first normalizing it
 * into a plain {@code PCollectionImpl}: a grouped table is ungrouped, and a union
 * collection/table is wrapped in an identity {@code parallelDo} so the plan has a
 * concrete node to attach the output to. The raw {@code (MapFn)} cast is why this
 * method carries {@code @SuppressWarnings("unchecked")}.
 */
@SuppressWarnings("unchecked") public void write(PCollection<?> pcollection, Target target) { if (pcollection instanceof PGroupedTableImpl) { pcollection = ((PGroupedTableImpl<?, ?>) pcollection).ungroup(); } else if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) { pcollection = pcollection.parallelDo("UnionCollectionWrapper", (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType()); } addOutput((PCollectionImpl<?>) pcollection, target); }
/**
 * Converts the given collection to a {@code PTable} whose values are all
 * {@code Boolean.TRUE}; the table's keys are the collection's elements.
 *
 * @param coll the source collection
 * @return the element-to-{@code TRUE} table
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  PTypeFamily tf = coll.getTypeFamily();
  PTableType<T, Boolean> tableType = tf.tableOf(coll.getPType(), tf.booleans());
  return coll.parallelDo(new DoFn<T, Pair<T, Boolean>>() {
    @Override
    public void process(T value, Emitter<Pair<T, Boolean>> out) {
      out.emit(Pair.of(value, Boolean.TRUE));
    }
  }, tableType);
}
/**
 * Maps each element of {@code coll} to the pair {@code (element, TRUE)},
 * yielding a {@code PTable<T, Boolean>} keyed by the elements themselves.
 *
 * @param coll the collection to tabulate
 * @return a table pairing every element with {@code Boolean.TRUE}
 */
private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
  final PTypeFamily types = coll.getTypeFamily();
  return coll.parallelDo(
      new DoFn<T, Pair<T, Boolean>>() {
        @Override
        public void process(T item, Emitter<Pair<T, Boolean>> sink) {
          sink.emit(Pair.of(item, Boolean.TRUE));
        }
      },
      types.tableOf(coll.getPType(), types.booleans()));
}
/**
 * Writes the materialized contents of {@code collection} to a new SequenceFile at
 * {@code path}, keyed by {@link NullWritable} with each element mapped through the
 * collection's output map function.
 *
 * Fix: the original leaked the {@code SequenceFile.Writer} if {@code materialize()},
 * {@code map()}, or {@code append()} threw — the writer is now closed in a finally block.
 *
 * @param fs the filesystem to write to
 * @param path the destination path for the sequence file
 * @param collection the collection whose elements are written
 * @throws IOException if the file cannot be created or written
 */
private void writeSequenceFileFromPCollection(final FileSystem fs, final Path path,
    final PCollection collection) throws IOException {
  final PType pType = collection.getPType();
  final Converter converter = pType.getConverter();
  final Class valueClass = converter.getValueClass();
  SequenceFile.Writer writer = null;
  try {
    writer = new SequenceFile.Writer(fs, fs.getConf(), path, NullWritable.class, valueClass);
    for (final Object o : collection.materialize()) {
      final Object value = pType.getOutputMapFn().map(o);
      writer.append(NullWritable.get(), value);
    }
  } finally {
    // Ensure the writer (and its underlying stream) is released even on failure.
    if (writer != null) {
      writer.close();
    }
  }
}
@Override public <T> void writeTextFile(PCollection<T> pcollection, String pathName) { // Ensure that this is a writable pcollection instance. pcollection = pcollection.parallelDo("asText", IdentityFn.<T> getInstance(), WritableTypeFamily .getInstance().as(pcollection.getPType())); write(pcollection, At.textFile(pathName)); }
/**
 * Aggregates every element of {@code collect} with the given {@code Aggregator}.
 * All elements are keyed under a single {@code false} key and grouped with one
 * reducer so the aggregator sees the entire collection.
 *
 * @param collect the collection to aggregate
 * @param aggregator the aggregation to apply
 * @return a collection holding the aggregated result values
 */
public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  return collect
      .parallelDo("Aggregate.aggregator", new MapFn<S, Pair<Boolean, S>>() {
        public Pair<Boolean, S> map(S input) {
          // Constant key funnels everything into one group.
          return Pair.of(false, input);
        }
      }, typeFamily.tableOf(typeFamily.booleans(), collect.getPType()))
      .groupByKey(1)
      .combineValues(aggregator)
      .values();
}
}
/**
 * Returns a {@code PTable} mapping each distinct element of the input
 * collection to the number of times it occurs.
 *
 * @param collect the collection whose elements are counted
 * @return a table of element to occurrence count
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S input) {
          // Emit a unit count per element; the combiner sums them.
          return Pair.of(input, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey()
      .combineValues(Aggregators.SUM_LONGS());
}
/**
 * Redistributes the collection across reducers by keying each element to a null
 * value, grouping, and then projecting the keys back out. A positive
 * {@code numReducers} fixes the reducer count; otherwise the default is used.
 *
 * @param collection the collection to repartition
 * @param numReducers the number of reducers, or a non-positive value for the default
 * @return the repartitioned collection
 */
private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
  PTableType<E, Void> tableType = Avros.tableOf(collection.getPType(), Avros.nulls());
  PTable<E, Void> keyed = collection.parallelDo(new AsKeyTable<E>(), tableType);
  PGroupedTable<E, Void> grouped =
      (numReducers > 0) ? keyed.groupByKey(numReducers) : keyed.groupByKey();
  return grouped.ungroup().keys();
}