private void runApproximateUniqueWithSkewedDistributions( final int elementCount, final int uniqueCount, final int sampleSize) { final List<Integer> elements = Lists.newArrayList(); // Zipf distribution with approximately elementCount items. final double s = 1 - 1.0 * uniqueCount / elementCount; final double maxCount = Math.pow(uniqueCount, s); for (int k = 0; k < uniqueCount; k++) { final int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s))); // Element k occurs count times. for (int c = 0; c < count; c++) { elements.add(k); } } final PCollection<Integer> input = p.apply(Create.of(elements)); final PCollection<Long> estimate = input.apply(ApproximateUnique.globally(sampleSize)); PAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
.satisfies( input -> { assertEquals(input.keySet(), expectedTempTables.keySet());
/**
 * Builds a shuffled list of {@code elementCount} doubles that cycles through the
 * {@code uniqueCount} values {@code 1/1, 1/2, ..., 1/uniqueCount}, runs
 * {@code ApproximateUnique.globally} over it and checks the single resulting estimate with a
 * {@code VerifyEstimateFn} constructed from {@code uniqueCount} and {@code sampleSize}.
 *
 * @param elementCount total number of elements to generate; asserted to be at least
 *     {@code uniqueCount}
 * @param uniqueCount number of distinct values to cycle through
 * @param sampleSize sample size handed to {@code ApproximateUnique.globally}
 */
private void runApproximateUniqueWithDuplicates(
    final int elementCount, final int uniqueCount, final int sampleSize) {
  assert elementCount >= uniqueCount;

  final List<Double> values = Lists.newArrayList();
  int index = 0;
  while (index < elementCount) {
    // The slot repeats every uniqueCount positions, which is what produces the duplicates.
    final int slot = index % uniqueCount;
    values.add(1.0 / (slot + 1));
    index++;
  }
  // Randomize the order so equal values are not laid out in a regular pattern.
  Collections.shuffle(values);

  final PCollection<Long> estimate =
      p.apply(Create.of(values)).apply(ApproximateUnique.globally(sampleSize));

  PAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

  p.run();
}
Arrays.asList(decompressedAuto, decompressedDefault, decompressedUncompressed)) { PAssert.thatSingleton(c) .satisfies( input -> { assertEquals(path, input.getMetadata().resourceId().toString()); Arrays.asList(compressionAuto, compressionDefault, compressionGzip)) { PAssert.thatSingleton(c) .satisfies( input -> { assertEquals(pathGZ, input.getMetadata().resourceId().toString());
.satisfies( results -> { CoGbkResult result1 = results.get(1);
/**
 * Checks {@code CoGbkResult.getOnly} on the output of {@code buildGetOnlyGbk}: key 1 must yield
 * "collection1-1" for the first tag, key 2 must yield "collection1-2" and "collection2-2" for
 * the first and second tag respectively, and key 3 must yield "collection2-3" for the second
 * tag.
 */
@Test
@Category(ValidatesRunner.class)
public void testCoGroupByKeyGetOnly() {
  final TupleTag<String> tag1 = new TupleTag<>();
  final TupleTag<String> tag2 = new TupleTag<>();

  PCollection<KV<Integer, CoGbkResult>> grouped = buildGetOnlyGbk(p, tag1, tag2);

  PAssert.thatMap(grouped)
      .satisfies(
          byKey -> {
            // Assertions run in key order: 1, then 2 (both tags), then 3.
            CoGbkResult forKey1 = byKey.get(1);
            assertEquals("collection1-1", forKey1.getOnly(tag1));

            CoGbkResult forKey2 = byKey.get(2);
            assertEquals("collection1-2", forKey2.getOnly(tag1));
            assertEquals("collection2-2", forKey2.getOnly(tag2));

            CoGbkResult forKey3 = byKey.get(3);
            assertEquals("collection2-3", forKey3.getOnly(tag2));
            return null;
          });

  p.run();
}
/**
 * Parses the classpath resource {@code /damaged.pdf} with {@code TikaIO.parse()} and expects a
 * single {@code ParseResult} that reports the same file location, is not successful, and
 * carries a {@code TikaException} as its error.
 */
@Test
public void testParseDamagedPdfFile() throws IOException {
  String damagedPdf = getClass().getResource("/damaged.pdf").getPath();

  PCollection<ParseResult> parsed =
      p.apply("ParseInvalidPdfFile", TikaIO.parse().filepattern(damagedPdf));

  PAssert.thatSingleton(parsed)
      .satisfies(
          result -> {
            assertEquals(damagedPdf, result.getFileLocation());
            assertFalse(result.isSuccess());
            // The failure is expected to be reported by Tika itself.
            boolean failedInTika = result.getError() instanceof TikaException;
            assertTrue(failedInTika);
            return null;
          });

  p.run();
}