/**
 * Returns the subset of {@code data} whose country — as extracted by {@link #getCountry} —
 * equals the given {@code country} string (exact, case-sensitive match).
 */
public static PCollection<String> filterByCountry(PCollection<String> data, final String country) {
  return data.apply(
      "FilterByCountry",
      Filter.by((SerializableFunction<String, Boolean>) row -> getCountry(row).equals(country)));
}
/**
 * Verifies that each comparison factory registers a human-readable predicate description
 * (e.g. {@code "x < 123"}) as {@link DisplayData}.
 */
@Test public void testDisplayData() { assertThat(DisplayData.from(Filter.lessThan(123)), hasDisplayItem("predicate", "x < 123")); assertThat(DisplayData.from(Filter.lessThanEq(234)), hasDisplayItem("predicate", "x ≤ 234")); assertThat(DisplayData.from(Filter.greaterThan(345)), hasDisplayItem("predicate", "x > 345")); assertThat(DisplayData.from(Filter.greaterThanEq(456)), hasDisplayItem("predicate", "x ≥ 456")); assertThat(DisplayData.from(Filter.equal(567)), hasDisplayItem("predicate", "x == 567")); }
/** * Returns a new {@link Filter} {@link PTransform} that's like this {@link PTransform} but with * the specified description for {@link DisplayData}. Does not modify this {@link PTransform}. * * @param description human-readable predicate description to show in {@link DisplayData} * @return a copy of this transform carrying the same predicate and the new description */ Filter<T> described(String description) { return new Filter<>(predicate, description); }
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that are equal to a given value. Elements must be {@code * Comparable}. * * <p>Example of use: * * <pre>{@code * PCollection<Integer> listOfNumbers = ...; * PCollection<Integer> equalNumbers = listOfNumbers.apply(Filter.equal(1000)); * }</pre> * * <p>See also {@link #greaterThan}, {@link #lessThan}, {@link #lessThanEq} and {@link * #greaterThanEq}, which return elements satisfying various inequalities with the specified value * based on the elements' natural ordering. * * <p>See also {@link #by}, which returns elements that satisfy the given predicate. * * @param value the value each retained element must compare equal to * @return a filtering transform described as {@code "x == value"} */ public static <T extends Comparable<T>> Filter<T> equal(final T value) { return by((SerializableFunction<T, Boolean>) input -> input.compareTo(value) == 0) .described(String.format("x == %s", value)); }
/** Runs a pipeline to verify that {@code Filter.greaterThan(4)} keeps only 5, 6 and 7. */
@Test @Category(NeedsRunner.class) public void testFilterGreaterThan() { PCollection<Integer> output = p.apply(Create.of(1, 2, 3, 4, 5, 6, 7)).apply(Filter.greaterThan(4)); PAssert.that(output).containsInAnyOrder(5, 6, 7); p.run(); }
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that are less than or equal to a given value, based on the * elements' natural ordering. Elements must be {@code Comparable}. * * <p>Example of use: * * <pre>{@code * PCollection<Integer> listOfNumbers = ...; * PCollection<Integer> smallOrEqualNumbers = * listOfNumbers.apply(Filter.lessThanEq(10)); * }</pre> * * <p>See also {@link #lessThan}, {@link #greaterThanEq}, {@link #equal} and {@link #greaterThan}, * which return elements satisfying various inequalities with the specified value based on the * elements' natural ordering. * * <p>See also {@link #by}, which returns elements that satisfy the given predicate. * * @param value the inclusive upper bound for retained elements * @return a filtering transform described as {@code "x ≤ value"} */ public static <T extends Comparable<T>> Filter<T> lessThanEq(final T value) { return by((SerializableFunction<T, Boolean>) input -> input.compareTo(value) <= 0) .described(String.format("x ≤ %s", value)); }
/**
 * Builds the shared test fixture: a small int source, a {@code Filter.greaterThan(1)} stage, a
 * doubling {@code ParDo}, a keyed view, and a flatten of two sources; then applies the direct
 * runner's graph overrides and creates the mock clock, watermark manager, and bundle factory
 * the tests exercise.
 */
@Before public void setup() { createdInts = p.apply("createdInts", Create.of(1, 2, 3)); filtered = createdInts.apply("filtered", Filter.greaterThan(1)); filteredTimesTwo = filtered.apply( "timesTwo", ParDo.of( new DoFn<Integer, Integer>() { @ProcessElement public void processElement(ProcessContext c) throws Exception { c.output(c.element() * 2); } })); keyed = createdInts.apply("keyed", WithKeys.of("MyKey")); intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535)); PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten); flattened = preFlatten.apply("flattened", Flatten.pCollections()); clock = MockClock.fromInstant(new Instant(1000)); DirectGraphs.performDirectOverrides(p); graph = DirectGraphs.getGraph(p); manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName); bundleFactory = ImmutableListBundleFactory.create(); }
/** * Compute a PCollection of reference allele frequencies for SNPs of interest. * The SNPs all have only a single alternate allele, and neither the * reference nor the alternate allele have a population frequency {@code < minFreq}. * The results are returned in a PCollection indexed by Position. * * @param variants a set of variant calls for a reference population * @param minFreq the minimum allele frequency for the set * @return a PCollection mapping Position to AlleleFreq */ static PCollection<KV<Position, AlleleFreq>> getFreq( PCollection<Variant> variants, double minFreq) { return variants.apply("PassingFilter", Filter.by(VariantFunctions.IS_PASSING)) .apply("OnChromosomeFilter", Filter.by(VariantFunctions.IS_ON_CHROMOSOME)) .apply("NotLowQualityFilter", Filter.by(VariantFunctions.IS_NOT_LOW_QUALITY)) .apply("SNPFilter", Filter.by(VariantFunctions.IS_SINGLE_ALTERNATE_SNP)) .apply(ParDo.of(new GetAlleleFreq())) .apply(Filter.by(new FilterFreq(minFreq))); }
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that are less than a given value, based on the elements' natural * ordering. Elements must be {@code Comparable}. * * <p>Example of use: * * <pre>{@code * PCollection<Integer> listOfNumbers = ...; * PCollection<Integer> smallNumbers = * listOfNumbers.apply(Filter.lessThan(10)); * }</pre> * * <p>See also {@link #lessThanEq}, {@link #greaterThanEq}, {@link #equal} and {@link * #greaterThan}, which return elements satisfying various inequalities with the specified value * based on the elements' natural ordering. * * <p>See also {@link #by}, which returns elements that satisfy the given predicate. * * @param value the exclusive upper bound for retained elements * @return a filtering transform described as {@code "x < value"} */ public static <T extends Comparable<T>> Filter<T> lessThan(final T value) { return by((SerializableFunction<T, Boolean>) input -> input.compareTo(value) < 0) .described(String.format("x < %s", value)); }
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that satisfy the given predicate. The predicate must be a {@code * SerializableFunction<T, Boolean>}. * * <p>Example of use: * * <pre>{@code * PCollection<String> wordList = ...; * PCollection<String> longWords = * wordList.apply(Filter.by(new MatchIfWordLengthGT(6))); * }</pre> * * <p>See also {@link #lessThan}, {@link #lessThanEq}, {@link #greaterThan}, {@link * #greaterThanEq}, which return elements satisfying various inequalities with the specified value * based on the elements' natural ordering. * * @param predicate serializable predicate; elements for which it returns {@code true} are kept * @return a filtering transform wrapping the predicate */ public static <T, PredicateT extends SerializableFunction<T, Boolean>> Filter<T> by( PredicateT predicate) { return new Filter<>(predicate); }
/** * Compute a PCollection of reference allele frequencies for SNPs of interest. * The SNPs all have only a single alternate allele, and neither the * reference nor the alternate allele have a population frequency {@code < minFreq}. * The results are returned in a PCollection indexed by Position. * * @param variants a set of variant calls for a reference population * @param minFreq the minimum allele frequency for the set * @return a PCollection mapping Position to AlleleFreq */ static PCollection<KV<Position, AlleleFreq>> getFreq( PCollection<Variant> variants, double minFreq) { return variants.apply("PassingFilter", Filter.by(VariantFunctions.IS_PASSING)) .apply("OnChromosomeFilter", Filter.by(VariantFunctions.IS_ON_CHROMOSOME)) .apply("NotLowQualityFilter", Filter.by(VariantFunctions.IS_NOT_LOW_QUALITY)) .apply("SNPFilter", Filter.by(VariantFunctions.IS_SINGLE_ALTERNATE_SNP)) .apply(ParDo.of(new GetAlleleFreq())) .apply(Filter.by(new FilterFreq(minFreq))); }
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that are greater than a given value, based on the elements' * natural ordering. Elements must be {@code Comparable}. * * <p>Example of use: * * <pre>{@code * PCollection<Integer> listOfNumbers = ...; * PCollection<Integer> largeNumbers = * listOfNumbers.apply(Filter.greaterThan(1000)); * }</pre> * * <p>See also {@link #greaterThanEq}, {@link #lessThan}, {@link #equal} and {@link #lessThanEq}, * which return elements satisfying various inequalities with the specified value based on the * elements' natural ordering. * * <p>See also {@link #by}, which returns elements that satisfy the given predicate. * * @param value the exclusive lower bound for retained elements * @return a filtering transform described as {@code "x > value"} */ public static <T extends Comparable<T>> Filter<T> greaterThan(final T value) { return by((SerializableFunction<T, Boolean>) input -> input.compareTo(value) > 0) .described(String.format("x > %s", value)); }
// Keep only reads that are on the target chromosome, passed QC, are not duplicates, and are
// properly placed; then split them via SplitReads (presumably into smaller per-read units —
// confirm against SplitReads) and down-sample with SampleReads(samplingFraction, samplingPrefix).
reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME)) .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE)) .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE)) .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT)) .apply(ParDo.of(new SplitReads())) .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));
/** * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code * PCollection<T>} with elements that are greater than or equal to a given value, based on the * elements' natural ordering. Elements must be {@code Comparable}. * * <p>Example of use: * * <pre>{@code * PCollection<Integer> listOfNumbers = ...; * PCollection<Integer> largeOrEqualNumbers = * listOfNumbers.apply(Filter.greaterThanEq(1000)); * }</pre> * * <p>See also {@link #greaterThan}, {@link #lessThan}, {@link #equal} and {@link #lessThanEq}, * which return elements satisfying various inequalities with the specified value based on the * elements' natural ordering. * * <p>See also {@link #by}, which returns elements that satisfy the given predicate. * * @param value the inclusive lower bound for retained elements * @return a filtering transform described as {@code "x ≥ value"} */ public static <T extends Comparable<T>> Filter<T> greaterThanEq(final T value) { return by((SerializableFunction<T, Boolean>) input -> input.compareTo(value) >= 0) .described(String.format("x ≥ %s", value)); }
// Keep only reads that are on the target chromosome, passed QC, are not duplicates, and are
// properly placed; then split them via SplitReads (presumably into smaller per-read units —
// confirm against SplitReads) and down-sample with SampleReads(samplingFraction, samplingPrefix).
reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME)) .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE)) .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE)) .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT)) .apply(ParDo.of(new SplitReads())) .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));
/**
 * Extracts a composite key from each input record, drops invalid keys ({@code "NA"}), counts
 * occurrences of each key, and formats every count as {@code "<country> <subject> <count>"},
 * where the key is expected to be {@code country_subject}.
 */
@Override
public PCollection<String> expand(PCollection<String> inputCollection) {
  // Derive a composite key per record, then discard records whose key is the sentinel "NA".
  PCollection<String> validKeys =
      inputCollection
          .apply(
              "ExtractCompositeKey",
              ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      c.output(getCompositeKey(c.element()));
                    }
                  }))
          .apply(
              "FilterValidCompositeKeys",
              Filter.by((SerializableFunction<String, Boolean>) key -> !key.equals("NA")));

  // Count how many events carry each composite key.
  PCollection<KV<String, Long>> keyCounts =
      validKeys.apply("GetEventsByCompositeKey", Count.<String>perElement());

  // Render each (key, count) pair as a space-separated "country subject count" line.
  return keyCounts.apply(
      "FormatOutput",
      MapElements.via(
          new SimpleFunction<KV<String, Long>, String>() {
            @Override
            public String apply(KV<String, Long> kv) {
              String[] parts = kv.getKey().split("_");
              return parts[0] + " " + parts[1] + " " + kv.getValue();
            }
          }));
}
@Override public PCollection<KV<String, String>> expand(PCollection<KV<String, String>> input) { // reparallelize mimics the same behavior as in JdbcIO // breaking fusion PCollectionView<Iterable<KV<String, String>>> empty = input .apply("Consume", Filter.by(SerializableFunctions.constant(false))) .apply(View.asIterable()); PCollection<KV<String, String>> materialized = input.apply( "Identity", ParDo.of( new DoFn<KV<String, String>, KV<String, String>>() { @ProcessElement public void processElement(ProcessContext context) { context.output(context.element()); } }) .withSideInputs(empty)); return materialized.apply(Reshuffle.viaRandomKey()); } }
/**
 * Breaks fusion for {@code input}: materializes it via an empty side input, then reshuffles the
 * identity copy with random keys. See the inline comments for why both steps are needed.
 */
@Override public PCollection<T> expand(PCollection<T> input) { // See https://issues.apache.org/jira/browse/BEAM-2803 // We use a combined approach to "break fusion" here: // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion) // 1) force the data to be materialized by passing it as a side input to an identity fn, // then 2) reshuffle it with a random key. Initial materialization provides some parallelism // and ensures that data to be shuffled can be generated in parallel, while reshuffling // provides perfect parallelism. // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient. // The current approach is necessary only to support the particular case of JdbcIO where // a single query may produce many gigabytes of query results. PCollectionView<Iterable<T>> empty = input .apply("Consume", Filter.by(SerializableFunctions.constant(false))) .apply(View.asIterable()); PCollection<T> materialized = input.apply( "Identity", ParDo.of( new DoFn<T, T>() { @ProcessElement public void process(ProcessContext c) { c.output(c.element()); } }) .withSideInputs(empty)); return materialized.apply(Reshuffle.viaRandomKey()); } }
/**
 * Breaks fusion for {@code input}: materializes it via an empty side input, then reshuffles the
 * identity copy with random keys. See the inline comments for why both steps are needed.
 */
@Override public PCollection<T> expand(PCollection<T> input) { // See https://issues.apache.org/jira/browse/BEAM-2803 // We use a combined approach to "break fusion" here: // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion) // 1) force the data to be materialized by passing it as a side input to an identity fn, // then 2) reshuffle it with a random key. Initial materialization provides some parallelism // and ensures that data to be shuffled can be generated in parallel, while reshuffling // provides perfect parallelism. // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient. // The current approach is necessary only to support the particular case of JdbcIO where // a single query may produce many gigabytes of query results. PCollectionView<Iterable<T>> empty = input .apply("Consume", Filter.by(SerializableFunctions.constant(false))) .apply(View.asIterable()); PCollection<T> materialized = input.apply( "Identity", ParDo.of( new DoFn<T, T>() { @ProcessElement public void process(ProcessContext c) { c.output(c.element()); } }) .withSideInputs(empty)); return materialized.apply(Reshuffle.viaRandomKey()); } }
// NOTE(review): fragment cut from a larger method. Applies the configured row predicate and
// registers the filtered output under the flow link (or the reject link when no flow output
// exists — presumably; confirm the hasFlow semantics against the enclosing method).
PCollection<IndexedRecord> output = mainPCollection.apply(ctx.getPTransformName(), Filter.by(predicate)); ctx.putPCollectionByLinkName(hasFlow ? flowLink : rejectLink, output); } else {