/**
 * Converts AVRO schema to Beam row schema.
 *
 * <p>An Avro field whose schema is a nullable union is unwrapped to its non-null branch and
 * mapped to a nullable Beam field; all other fields map to non-nullable Beam fields.
 *
 * @param schema schema of type RECORD
 */
public static Schema toSchema(@Nonnull org.apache.avro.Schema schema) {
  Schema.Builder beamSchema = Schema.builder();
  for (org.apache.avro.Schema.Field avroField : schema.getFields()) {
    org.apache.avro.Schema fieldSchema = avroField.schema();
    org.apache.avro.Schema unwrapped = unwrapNullableSchema(fieldSchema);
    // If unwrapping changed the schema, the original was a nullable union.
    boolean nullable = !unwrapped.equals(fieldSchema);
    if (nullable) {
      beamSchema.addNullableField(avroField.name(), toFieldType(unwrapped));
    } else {
      beamSchema.addField(avroField.name(), toFieldType(unwrapped));
    }
  }
  return beamSchema.build();
}
/**
 * Computes the schema of the projection described by {@code fieldAccessDescriptor} applied to
 * rows of {@code inputSchema}.
 *
 * <p>Top-level accessed fields keep their original type. Nested accesses recurse into the row
 * field's schema, keeping only the accessed subfields; nullability of the nested row field is
 * preserved.
 */
static Schema getOutputSchema(Schema inputSchema, FieldAccessDescriptor fieldAccessDescriptor) {
  if (fieldAccessDescriptor.allFields()) {
    // Full access is the identity projection; reuse the input schema as-is.
    return inputSchema;
  }
  // Consistency fix: use the Schema.builder() factory like the rest of the codebase
  // instead of invoking the Builder constructor directly.
  Schema.Builder builder = Schema.builder();
  for (int fieldId : fieldAccessDescriptor.fieldIdsAccessed()) {
    builder.addField(inputSchema.getField(fieldId));
  }
  for (Map.Entry<Integer, FieldAccessDescriptor> nested :
      fieldAccessDescriptor.nestedFields().entrySet()) {
    Field field = inputSchema.getField(nested.getKey());
    FieldAccessDescriptor nestedDescriptor = nested.getValue();
    FieldType nestedType =
        FieldType.row(getOutputSchema(field.getType().getRowSchema(), nestedDescriptor));
    if (field.getNullable()) {
      builder.addNullableField(field.getName(), nestedType);
    } else {
      builder.addField(field.getName(), nestedType);
    }
  }
  return builder.build();
}
/**
 * Recursively flattens a (possibly nested) row schema into a flat schema.
 *
 * <p>{@code nameComponents} is the path of field names from the root to the current schema; it
 * is mutated in place (pushed before descending, popped after) and {@code fn} maps each full
 * path to the flattened field's name.
 */
private static Schema getUnnestedSchema(
    Schema schema, List<String> nameComponents, SerializableFunction<List<String>, String> fn) {
  Schema.Builder flattened = Schema.builder();
  for (Field field : schema.getFields()) {
    nameComponents.add(field.getName());
    if (!field.getType().getTypeName().isCompositeType()) {
      // Leaf field: rename it using the accumulated path and emit directly.
      flattened.addField(field.toBuilder().setName(fn.apply(nameComponents)).build());
    } else {
      // Composite field: flatten its row schema and splice the results in.
      Schema inner = getUnnestedSchema(field.getType().getRowSchema(), nameComponents, fn);
      for (Field innerField : inner.getFields()) {
        flattened.addField(innerField);
      }
    }
    nameComponents.remove(nameComponents.size() - 1);
  }
  return flattened.build();
}

/** Unnest a row. */
/** LEFT OUTER JOIN: unmatched left rows are padded with NULLs on the right side. */
@Test
public void testLeftOuterJoin() throws Exception {
  String sql =
      "SELECT * "
          + "FROM ORDER_DETAILS1 o1"
          + " LEFT OUTER JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  pipeline.enableAbandonedNodeEnforcement(false);

  // Right-side columns are nullable because left rows may have no match.
  Schema resultType =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("site_id", Schema.FieldType.INT32)
          .addField("price", Schema.FieldType.INT32)
          .addNullableField("order_id0", Schema.FieldType.INT32)
          .addNullableField("site_id0", Schema.FieldType.INT32)
          .addNullableField("price0", Schema.FieldType.INT32)
          .build();

  PAssert.that(rows)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType)
              .addRows(1, 2, 3, null, null, null, 2, 3, 3, 1, 2, 3, 3, 4, 5, null, null, null)
              .getRows());
  pipeline.run();
}
/** LEFT OUTER JOIN of a windowed aggregation against an unbounded table. */
@Test
public void testLeftOuterJoin() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " LEFT OUTER JOIN "
          + " ORDER_DETAILS1 o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  // NOTE: removed a leftover debug ParDo (BeamSqlOutputToConsoleFn) whose output was
  // discarded; it only produced console noise during test runs.

  // "buyer" is nullable: the aggregated row with order_id 3 has no match on the right.
  Schema resultType =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("sum_site_id", Schema.FieldType.INT32)
          .addNullableField("buyer", Schema.FieldType.STRING)
          .build();

  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType)
              .addRows(1, 3, "james", 2, 5, "bond", 3, 3, null)
              .getStringRows());
  pipeline.run();
}
/** RIGHT OUTER JOIN: unmatched right rows are padded with NULLs on the left side. */
@Test
public void testRightOuterJoin() throws Exception {
  String sql =
      "SELECT * "
          + "FROM ORDER_DETAILS1 o1"
          + " RIGHT OUTER JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);

  // Left-side columns are nullable because right rows may have no match.
  Schema resultType =
      Schema.builder()
          .addNullableField("order_id", Schema.FieldType.INT32)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addNullableField("price", Schema.FieldType.INT32)
          .addField("order_id0", Schema.FieldType.INT32)
          .addField("site_id0", Schema.FieldType.INT32)
          .addField("price0", Schema.FieldType.INT32)
          .build();

  PAssert.that(rows)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType)
              .addRows(2, 3, 3, 1, 2, 3, null, null, null, 2, 3, 3, null, null, null, 3, 4, 5)
              .getRows());
  pipeline.run();
}
/** INNER JOIN: only rows satisfying the compound join condition are emitted. */
@Test
public void testInnerJoin() throws Exception {
  String sql =
      "SELECT * "
          + "FROM ORDER_DETAILS1 o1"
          + " JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);

  // Inner join never pads with NULL, so every column is non-nullable.
  Schema resultType =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("site_id", Schema.FieldType.INT32)
          .addField("price", Schema.FieldType.INT32)
          .addField("order_id0", Schema.FieldType.INT32)
          .addField("site_id0", Schema.FieldType.INT32)
          .addField("price0", Schema.FieldType.INT32)
          .build();

  PAssert.that(rows)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType).addRows(2, 3, 3, 1, 2, 3).getRows());
  pipeline.run();
}
/** RIGHT OUTER JOIN where the right side is a windowed aggregation. */
@Test
public void testRightOuterJoin() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + " ORDER_DETAILS1 o2 "
          + " RIGHT OUTER JOIN "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);

  // "buyer" is nullable: aggregated rows without a matching ORDER_DETAILS1 row get NULL.
  Schema resultType =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("sum_site_id", Schema.FieldType.INT32)
          .addNullableField("buyer", Schema.FieldType.STRING)
          .build();

  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType)
              .addRows(1, 3, "james", 2, 5, "bond", 3, 3, null)
              .getStringRows());
  pipeline.run();
}
@Test public void testLeftOuterJoin() throws Exception { String sql = "SELECT * FROM " + "(select site_id as order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS " + " GROUP BY site_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 " + " LEFT OUTER JOIN " + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS " + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o2 " + " on " + " o1.order_id=o2.order_id"; // 1, 1 | 1, 3 // 2, 2 | NULL, NULL // ---- | ----- // 2, 2 | 2, 5 // 3, 3 | NULL, NULL PCollection<Row> rows = compilePipeline(sql, pipeline); PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn()))) .containsInAnyOrder( TestUtils.RowsBuilder.of( Schema.builder() .addField("order_id1", Schema.FieldType.INT32) .addField("sum_site_id", Schema.FieldType.INT32) .addNullableField("order_id", Schema.FieldType.INT32) .addNullableField("sum_site_id0", Schema.FieldType.INT32) .build()) .addRows(1, 1, 1, 3, 2, 2, null, null, 2, 2, 2, 5, 3, 3, null, null) .getStringRows()); pipeline.run(); }
/** ORDER BY with NULLS LAST: NULL site_ids sort after non-null values within each order_id. */
@Test
public void testOrderBy_nullsLast() throws Exception {
  Schema tableSchema =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT64)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addField("price", Schema.FieldType.DOUBLE)
          .build();

  registerTable(
      "ORDER_DETAILS",
      TestBoundedTable.of(tableSchema)
          .addRows(1L, 2, 1.0, 1L, null, 2.0, 2L, 1, 3.0, 2L, null, 4.0, 5L, 5, 5.0));
  registerTable("SUB_ORDER_RAM", TestBoundedTable.of(tableSchema));

  String sql =
      "INSERT INTO SUB_ORDER_RAM(order_id, site_id, price) SELECT "
          + " order_id, site_id, price "
          + "FROM ORDER_DETAILS "
          + "ORDER BY order_id asc, site_id desc NULLS LAST limit 4";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  PAssert.that(rows)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(tableSchema)
              .addRows(1L, 2, 1.0, 1L, null, 2.0, 2L, 1, 3.0, 2L, null, 4.0)
              .getRows());
  pipeline.run().waitUntilFinish();
}
/** ORDER BY with NULLS FIRST: NULL site_ids sort before non-null values within each order_id. */
@Test
public void testOrderBy_nullsFirst() throws Exception {
  Schema tableSchema =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT64)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addField("price", Schema.FieldType.DOUBLE)
          .build();

  registerTable(
      "ORDER_DETAILS",
      TestBoundedTable.of(tableSchema)
          .addRows(1L, 2, 1.0, 1L, null, 2.0, 2L, 1, 3.0, 2L, null, 4.0, 5L, 5, 5.0));
  registerTable("SUB_ORDER_RAM", TestBoundedTable.of(tableSchema));

  String sql =
      "INSERT INTO SUB_ORDER_RAM(order_id, site_id, price) SELECT "
          + " order_id, site_id, price "
          + "FROM ORDER_DETAILS "
          + "ORDER BY order_id asc, site_id desc NULLS FIRST limit 4";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  PAssert.that(rows)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(tableSchema)
              .addRows(1L, null, 2.0, 1L, 2, 1.0, 2L, null, 4.0, 2L, 1, 3.0)
              .getRows());
  pipeline.run().waitUntilFinish();
}
/** INNER JOIN of two windowed aggregations over the same table. */
@Test
public void testInnerJoin() throws Exception {
  String sql =
      "SELECT * FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " JOIN "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);

  // Inner join of identical aggregations: every column is non-nullable.
  Schema resultType =
      Schema.builder()
          .addField("order_id1", Schema.FieldType.INT32)
          .addField("sum_site_id", Schema.FieldType.INT32)
          .addField("order_id", Schema.FieldType.INT32)
          .addField("sum_site_id0", Schema.FieldType.INT32)
          .build();

  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(resultType)
              .addRows(1, 3, 1, 3, 2, 5, 2, 5)
              .getStringRows());
  pipeline.run();
}
/**
 * Infer a schema from a Java class.
 *
 * <p>Takes in a function to extract a list of field types from a class. Different callers may
 * have different strategies for extracting this list: e.g. introspecting public member variables,
 * public getter methods, or special annotations on the class.
 */
public static Schema schemaFromClass(
    Class<?> clazz, Function<Class, List<TypeInformation>> getTypesForClass) {
  Schema.Builder schemaBuilder = Schema.builder();
  for (TypeInformation typeInfo : getTypesForClass.apply(clazz)) {
    // The extractor is threaded through so nested types use the same strategy.
    Schema.FieldType fieldType = fieldFromType(typeInfo.getType(), getTypesForClass);
    if (typeInfo.isNullable()) {
      schemaBuilder.addNullableField(typeInfo.getName(), fieldType);
    } else {
      schemaBuilder.addField(typeInfo.getName(), fieldType);
    }
  }
  return schemaBuilder.build();
}
/** Beam schema -> Calcite row type -> Beam schema must be lossless for all primitive types. */
@Test
public void testRoundTripBeamSchema() {
  final Schema original =
      Schema.builder()
          .addField("f1", Schema.FieldType.BYTE)
          .addField("f2", Schema.FieldType.INT16)
          .addField("f3", Schema.FieldType.INT32)
          .addField("f4", Schema.FieldType.INT64)
          .addField("f5", Schema.FieldType.FLOAT)
          .addField("f6", Schema.FieldType.DOUBLE)
          .addField("f7", Schema.FieldType.DECIMAL)
          .addField("f8", Schema.FieldType.BOOLEAN)
          .addField("f9", Schema.FieldType.BYTES)
          .addField("f10", Schema.FieldType.STRING)
          .build();

  final Schema roundTripped =
      CalciteUtils.toSchema(CalciteUtils.toCalciteRowType(original, dataTypeFactory));

  assertEquals(original, roundTripped);
}
/** A nested row containing a DOUBLE field must make the coder non-deterministic. */
@Test(expected = NonDeterministicException.class)
public void testVerifyDeterministicNestedRow() throws NonDeterministicException {
  Schema nested =
      Schema.builder().addField("a1", FieldType.DOUBLE).addField("a2", FieldType.INT64).build();
  Schema schema = Schema.builder().addField("f1", FieldType.row(nested)).build();

  RowCoder.of(schema).verifyDeterministic();
}
/** Builds a schema containing exactly one field with the given name and type. */
private Schema schemaWithField(String fieldName, FieldType fieldType) {
  Schema.Builder single = Schema.builder();
  single.addField(fieldName, fieldType);
  return single.build();
}
/** Round-trips a row whose single field is an array of integer arrays through RowCoder. */
@Test
public void testArrayOfArray() throws Exception {
  FieldType arrayType = FieldType.array(FieldType.array(FieldType.INT32));
  Schema schema = Schema.builder().addField("f_array", arrayType).build();

  List<Integer> first = Lists.newArrayList(1, 2, 3, 4);
  List<Integer> second = Lists.newArrayList(5, 6, 7, 8);
  List<Integer> third = Lists.newArrayList(9, 10, 11, 12);
  Row row = Row.withSchema(schema).addArray(first, second, third).build();

  CoderProperties.coderDecodeEncodeEqual(RowCoder.of(schema), row);
}
/**
 * Convenience constructor that delegates to the full constructor, deriving the aggregation's
 * output schema as a single-field schema wrapping {@code outputField}, with no value for the
 * final argument (null).
 */
FieldAggregation(
    FieldAccessDescriptor fieldsToAggregate,
    Field outputField,
    CombineFn<FieldT, AccumT, OutputT> fn,
    TupleTag<Object> combineTag) {
  this(
      fieldsToAggregate,
      outputField,
      fn,
      combineTag,
      // The aggregation's output schema contains just the single output field.
      Schema.builder().addField(outputField).build(),
      null);
}
/** Floating-point fields (DOUBLE/FLOAT) must make the row coder non-deterministic. */
@Test(expected = NonDeterministicException.class)
public void testVerifyDeterministic() throws NonDeterministicException {
  Schema schema =
      Schema.builder()
          .addField("f1", FieldType.DOUBLE)
          .addField("f2", FieldType.FLOAT)
          .addField("f3", FieldType.INT32)
          .build();

  RowCoder.of(schema).verifyDeterministic();
}
/** Builds the combined output schema: one field per aggregation, in list order. */
private Schema getOutputSchema(List<FieldAggregation> fieldAggregations) {
  Schema.Builder combined = Schema.builder();
  fieldAggregations.forEach(aggregation -> combined.addField(aggregation.outputField));
  return combined.build();
}