// Verifies that the replacement transform produced by the factory reports the
// same input PCollection as the original GroupByKey application.
@Test public void getInputSucceeds() {
  TestPipeline p = TestPipeline.create();
  PCollection<KV<String, Integer>> input =
      p.apply(
          Create.of(KV.of("foo", 1))
              .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())));
  PCollection<KV<String, Iterable<Integer>>> grouped = input.apply(GroupByKey.create());
  // Look up, in the direct graph, the AppliedPTransform that produced the grouped output.
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(grouped);
  PTransformReplacement<
          PCollection<KV<String, Integer>>, PCollection<KV<String, Iterable<Integer>>>>
      replacement =
          // Raw-typed cast: the wildcard-captured producer cannot be expressed
          // with the exact generic arguments the factory expects.
          factory.getReplacementTransform((AppliedPTransform) producer);
  assertThat(replacement.getInput(), Matchers.<PCollection<?>>equalTo(input));
}
}
/**
 * Builds a bounded two-row input collection using the shared {@code INPUT_SCHEMA}: each row
 * carries one integer value plus an array of strings.
 */
private PCollection<Row> pCollectionOf2Elements() {
  Row first =
      Row.withSchema(INPUT_SCHEMA).addValues(1).addArray(Arrays.asList("111", "222")).build();
  Row second =
      Row.withSchema(INPUT_SCHEMA).addValues(2).addArray(Arrays.asList("33", "44", "55")).build();
  return pipeline.apply(
      "boundedInput1",
      Create.of(first, second)
          .withSchema(
              INPUT_SCHEMA, SerializableFunctions.identity(), SerializableFunctions.identity()));
}
}
// Expands into an Impulse followed by a ParDo that decodes and re-emits the
// transform's elements, so the result is equivalent to the original Create.
@Override
public final PCollection<T> expand(PBegin input) {
  try {
    PCollection<T> pc =
        Pipeline.applyTransform(input, Impulse.create())
            .apply(
                ParDo.of(
                    DecodeAndEmitDoFn.fromIterable(
                        transform.getElements(), originalOutput.getCoder())));
    // Re-attach the original output's coder so downstream consumers see the
    // same encoding as before the replacement.
    pc.setCoder(originalOutput.getCoder());
    return pc;
  } catch (IOException e) {
    // Raised while encoding the elements for later decode-and-emit.
    throw new IllegalStateException("Unable to encode elements.", e);
  }
}
Row.withSchema(elementSchema).addValues("DD", 44).build())) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
1, Row.withSchema(nestedSchema).addValues(312, "CC", 313).build()) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
1, Row.withSchema(nestedSchema).addValues(312, "CC", 313).build()) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
"boundedInput1", Create.of(row1, row2) .withSchema( INPUT_SCHEMA, SerializableFunctions.identity(),
2, Row.withSchema(nestedSchema).addValues(412, "DD", 413).build()) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
.apply("ExposeSchemaCountersSeed", Create.of(Collections.singletonList(0)) .withType(TypeDescriptors.integers())) .apply("ExposeSchemaCounters", MapElements.into(TypeDescriptors.integers()).via(v -> {
pipeline.apply( Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build()) .withSchema( INPUT_ROW_SCHEMA, SerializableFunctions.identity(),
// Verifies that a DoFn can receive the schema'd element as a Row through a
// named @FieldAccess descriptor (here requesting all fields) rather than @Element.
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testFieldAccessSchemaPipeline() {
  List<MyPojo> pojoList =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();
  PCollection<String> output =
      pipeline
          .apply(
              Create.of(pojoList)
                  // to/from Row conversion functions register the schema on the PCollection.
                  .withSchema(
                      schema,
                      o -> Row.withSchema(schema).addValues(o.stringField, o.integerField).build(),
                      r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    // Descriptor named "foo" selecting every field of the schema.
                    @FieldAccess("foo")
                    final FieldAccessDescriptor fieldAccess = FieldAccessDescriptor.withAllFields();

                    @ProcessElement
                    public void process(@FieldAccess("foo") Row row, OutputReceiver<String> r) {
                      r.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));
  PAssert.that(output).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
/**
 * CAST to DATE after reassembling the raw {@code yyyyMMdd} string column into ISO
 * {@code yyyy-MM-dd} form via SUBSTRING and concatenation.
 */
@Test
public void testCastToDate() {
  PCollection<Row> rows =
      pipeline.apply(
          Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build())
              .withSchema(
                  INPUT_ROW_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));
  Schema expectedSchema =
      Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build();
  String query =
      "SELECT f_int, \n"
          + " CAST( \n"
          + " SUBSTRING(TRIM(f_string) FROM 1 FOR 4) \n"
          + " ||'-' \n"
          + " ||SUBSTRING(TRIM(f_string) FROM 5 FOR 2) \n"
          + " ||'-' \n"
          + " ||SUBSTRING(TRIM(f_string) FROM 7 FOR 2) as DATE) \n"
          + "FROM PCOLLECTION";
  PCollection<Row> casted = rows.apply(SqlTransform.query(query));
  PAssert.that(casted)
      .containsInAnyOrder(
          Row.withSchema(expectedSchema).addValues(1, new DateTime(2018, 10, 18, 0, 0)).build());
  pipeline.run();
}
/**
 * End-to-end check that {@code Create.withSchema} lets a downstream DoFn observe POJO elements
 * as schema Rows via {@code @Element Row}.
 */
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testSimpleSchemaPipeline() {
  Schema pojoSchema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();
  List<MyPojo> pojos =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  PCollection<String> formatted =
      pipeline
          .apply(
              Create.of(pojos)
                  .withSchema(
                      pojoSchema,
                      o ->
                          Row.withSchema(pojoSchema)
                              .addValues(o.stringField, o.integerField)
                              .build(),
                      r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    @ProcessElement
                    public void process(@Element Row row, OutputReceiver<String> out) {
                      out.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));
  PAssert.that(formatted).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
// UNNEST of an array literal with a column alias: AS t(f_string) names the
// unnested column, which must match the expected result schema.
@Test
public void testUnnestNamedLiteral() {
  // The input is empty; it only serves to register a table for the FROM clause.
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class))
              .withSchema(
                  INPUT_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));
  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);
  Schema resultType = Schema.builder().addStringField("f_string").build();
  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)"));
  // One output row per array element.
  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());
  pipeline.run();
}
@Test public void testUnnestLiteral() { PCollection<Row> input = pipeline.apply( "boundedInput1", Create.empty(TypeDescriptor.of(Row.class)) .withSchema( INPUT_SCHEMA, SerializableFunctions.identity(), SerializableFunctions.identity())); // Because we have a multi-part FROM the DSL considers it multi-input TupleTag<Row> mainTag = new TupleTag<Row>("main") {}; PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input); Schema resultType = Schema.builder().addStringField("f_string").build(); PCollection<Row> result = inputTuple.apply( "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])")); PAssert.that(result) .containsInAnyOrder( Row.withSchema(resultType).addValues("a").build(), Row.withSchema(resultType).addValues("b").build(), Row.withSchema(resultType).addValues("c").build()); pipeline.run(); }
// Expects IllegalArgumentException at pipeline construction time when a
// @FieldAccess descriptor names a field ("baad") absent from the schema.
// Note: `thrown` is the test class's ExpectedException rule; no run() needed
// because validation fails during graph construction.
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testUnmatchedSchema() {
  List<MyPojo> pojoList =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();
  thrown.expect(IllegalArgumentException.class);
  pipeline
      .apply(
          Create.of(pojoList)
              .withSchema(
                  schema,
                  o -> Row.withSchema(schema).addValues(o.stringField, o.integerField).build(),
                  r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
      .apply(
          ParDo.of(
              new DoFn<MyPojo, Void>() {
                // References a field name that does not exist in the schema above.
                @FieldAccess("a")
                FieldAccessDescriptor fieldAccess = FieldAccessDescriptor.withFieldNames("baad");

                @ProcessElement
                public void process(@FieldAccess("a") Row row) {}
              }));
}
// Casts the raw "20181018" string column directly to DATE with no explicit
// reformatting. NOTE(review): assumes the SQL dialect accepts yyyyMMdd input
// to CAST(... AS DATE) — compare testCastToDate, which rewrites to yyyy-MM-dd
// before casting.
@Test
public void testCastToDate2() {
  PCollection<Row> input =
      pipeline.apply(
          Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build())
              .withSchema(
                  INPUT_ROW_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));
  Schema resultType =
      Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build();
  PCollection<Row> result =
      input.apply(
          SqlTransform.query(
              "SELECT f_int, \n"
                  + " CAST( \n"
                  + " f_string AS DATE) \n"
                  + "FROM PCOLLECTION"));
  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues(1, new DateTime(2018, 10, 18, 0, 0)).build());
  pipeline.run();
}
/** Builds the bounded fixture table: three double columns and three int columns, five rows. */
@Before
public void setUp() {
  Schema tableSchema =
      Schema.builder()
          .addDoubleField("f_double1")
          .addDoubleField("f_double2")
          .addDoubleField("f_double3")
          .addInt32Field("f_int1")
          .addInt32Field("f_int2")
          .addInt32Field("f_int3")
          .build();
  // One fixture row per line: six values matching the six schema fields.
  List<Row> fixtureRows =
      TestUtils.RowsBuilder.of(tableSchema)
          .addRows(
              3.0, 1.0, 1.0, 3, 1, 0,
              4.0, 2.0, 2.0, 4, 2, 0,
              5.0, 3.0, 1.0, 5, 3, 0,
              6.0, 4.0, 2.0, 6, 4, 0,
              8.0, 4.0, 1.0, 8, 4, 0)
          .getRows();
  boundedInput =
      pipeline.apply(
          Create.of(fixtureRows)
              .withSchema(
                  tableSchema,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));
}
// Evaluates "SELECT <expr>" against a single dummy row and asserts that the
// resulting boolean is true, reporting the expression text on failure.
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> result =
      begin
          .apply(
              Create.of(DUMMY_ROW)
                  .withSchema(
                      DUMMY_SCHEMA,
                      SerializableFunctions.identity(),
                      SerializableFunctions.identity()))
          .apply(SqlTransform.query("SELECT " + expr))
          // The query yields one single-column row; extract its boolean value.
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));
  PAssert.that(result)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });
  return PDone.in(begin.getPipeline());
}
}
// Builds a bounded input whose first two int columns are nullable, seeding
// rows with assorted null combinations for null-handling tests.
@Before
public void setUp() {
  Schema schema =
      Schema.builder()
          .addNullableField("f_int1", Schema.FieldType.INT32)
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .addInt32Field("f_int3")
          .build();
  List<Row> rows =
      TestUtils.RowsBuilder.of(schema)
          .addRows(1, 5, 1)
          .addRows(null, 1, 1)
          .addRows(2, 1, 1)
          .addRows(null, 1, 1)
          .addRows(null, null, 1)
          .addRows(null, null, 1)
          .addRows(3, 2, 1)
          .getRows();
  boundedInput =
      PBegin.in(pipeline).apply(Create.of(rows).withSchema(schema, identity(), identity()));
}