/**
 * Converts an AVRO schema to a Beam row schema.
 *
 * <p>Every AVRO field is run through {@code unwrapNullableSchema}. When the unwrapped schema
 * differs from the schema declared on the field, the Beam field is added as nullable; otherwise
 * it is added as a regular field. Fields are added in the iteration order of
 * {@code schema.getFields()}.
 *
 * @param schema schema of type RECORD
 * @return the Beam schema built from the AVRO fields
 */
public static Schema toSchema(@Nonnull org.apache.avro.Schema schema) {
  Schema.Builder beamSchema = Schema.builder();
  for (org.apache.avro.Schema.Field avroField : schema.getFields()) {
    org.apache.avro.Schema declared = avroField.schema();
    org.apache.avro.Schema unwrapped = unwrapNullableSchema(declared);
    // Unwrapping only changes the schema when the declared one was a nullable wrapper.
    boolean wasNullable = !unwrapped.equals(declared);
    if (wasNullable) {
      beamSchema.addNullableField(avroField.name(), toFieldType(unwrapped));
    } else {
      beamSchema.addField(avroField.name(), toFieldType(unwrapped));
    }
  }
  return beamSchema.build();
}
/**
 * Computes the schema that results from applying a field access descriptor to an input schema.
 *
 * <p>When the descriptor selects all fields, the input schema is returned unchanged. Otherwise
 * the directly accessed fields are added first, followed by one row-typed field per nested
 * descriptor entry whose schema is computed recursively.
 *
 * @param inputSchema schema the descriptor's field ids refer to
 * @param fieldAccessDescriptor description of which fields (and nested fields) are accessed
 * @return the input schema itself, or a newly built schema containing only the accessed fields
 */
static Schema getOutputSchema(Schema inputSchema, FieldAccessDescriptor fieldAccessDescriptor) {
  if (fieldAccessDescriptor.allFields()) {
    return inputSchema;
  }

  Schema.Builder output = new Schema.Builder();

  // Top-level fields are copied over as-is.
  for (int accessedId : fieldAccessDescriptor.fieldIdsAccessed()) {
    output.addField(inputSchema.getField(accessedId));
  }

  // Nested selections: rebuild each field as a row of the recursively selected sub-schema,
  // carrying over the nullability of the original field.
  for (Map.Entry<Integer, FieldAccessDescriptor> entry :
      fieldAccessDescriptor.nestedFields().entrySet()) {
    Field parent = inputSchema.getField(entry.getKey());
    FieldAccessDescriptor childAccess = entry.getValue();
    Schema childSchema = getOutputSchema(parent.getType().getRowSchema(), childAccess);
    FieldType rowType = FieldType.row(childSchema);
    if (parent.getNullable()) {
      output.addNullableField(parent.getName(), rowType);
    } else {
      output.addField(parent.getName(), rowType);
    }
  }
  return output.build();
}
/**
 * Checks that a Beam schema made only of nullable primitive fields is converted into Calcite row
 * type fields that are all nullable and carry the expected SQL type names.
 */
@Test
public void testToCalciteRowTypeNullable() {
  final Schema nullableSchema =
      Schema.builder()
          .addNullableField("f1", Schema.FieldType.BYTE)
          .addNullableField("f2", Schema.FieldType.INT16)
          .addNullableField("f3", Schema.FieldType.INT32)
          .addNullableField("f4", Schema.FieldType.INT64)
          .addNullableField("f5", Schema.FieldType.FLOAT)
          .addNullableField("f6", Schema.FieldType.DOUBLE)
          .addNullableField("f7", Schema.FieldType.DECIMAL)
          .addNullableField("f8", Schema.FieldType.BOOLEAN)
          .addNullableField("f9", Schema.FieldType.BYTES)
          .addNullableField("f10", Schema.FieldType.STRING)
          .build();

  final Map<String, RelDataType> typesByName = calciteRowTypeFields(nullableSchema);

  assertEquals(10, typesByName.size());
  for (RelDataType type : typesByName.values()) {
    assertTrue(type.isNullable());
  }

  assertEquals(SqlTypeName.TINYINT, typesByName.get("f1").getSqlTypeName());
  assertEquals(SqlTypeName.SMALLINT, typesByName.get("f2").getSqlTypeName());
  assertEquals(SqlTypeName.INTEGER, typesByName.get("f3").getSqlTypeName());
  assertEquals(SqlTypeName.BIGINT, typesByName.get("f4").getSqlTypeName());
  assertEquals(SqlTypeName.FLOAT, typesByName.get("f5").getSqlTypeName());
  assertEquals(SqlTypeName.DOUBLE, typesByName.get("f6").getSqlTypeName());
  assertEquals(SqlTypeName.DECIMAL, typesByName.get("f7").getSqlTypeName());
  assertEquals(SqlTypeName.BOOLEAN, typesByName.get("f8").getSqlTypeName());
  assertEquals(SqlTypeName.VARBINARY, typesByName.get("f9").getSqlTypeName());
  assertEquals(SqlTypeName.VARCHAR, typesByName.get("f10").getSqlTypeName());
}
/**
 * Verifies {@code ORDER BY ... NULLS LAST} combined with {@code LIMIT}: of the five input rows,
 * the four expected rows are those that come first when ordering by {@code order_id} ascending
 * and {@code site_id} descending with nulls last.
 */
@Test
public void testOrderBy_nullsLast() throws Exception {
  Schema orderSchema =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT64)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addField("price", Schema.FieldType.DOUBLE)
          .build();

  // Source table: five rows, two of them with a null site_id.
  registerTable(
      "ORDER_DETAILS",
      TestBoundedTable.of(orderSchema)
          .addRows(
              1L, 2, 1.0,
              1L, null, 2.0,
              2L, 1, 3.0,
              2L, null, 4.0,
              5L, 5, 5.0));
  // Empty sink table with the same schema.
  registerTable("SUB_ORDER_RAM", TestBoundedTable.of(orderSchema));

  String query =
      "INSERT INTO SUB_ORDER_RAM(order_id, site_id, price) SELECT "
          + " order_id, site_id, price "
          + "FROM ORDER_DETAILS "
          + "ORDER BY order_id asc, site_id desc NULLS LAST limit 4";

  PCollection<Row> output = compilePipeline(query, pipeline);
  PAssert.that(output)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(orderSchema)
              .addRows(
                  1L, 2, 1.0,
                  1L, null, 2.0,
                  2L, 1, 3.0,
                  2L, null, 4.0)
              .getRows());
  pipeline.run().waitUntilFinish();
}
/**
 * Left outer join of a windowed aggregation over {@code ORDER_DETAILS} (left side) with
 * {@code ORDER_DETAILS1} (right side); the row without a match is expected to carry a null
 * {@code buyer}.
 */
@Test
public void testLeftOuterJoin() throws Exception {
  String query =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + "          GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " LEFT OUTER JOIN "
          + " ORDER_DETAILS1 o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> joined = compilePipeline(query, pipeline);

  // NOTE(review): presumably a debugging aid that prints the rows — confirm whether it is still
  // needed. Its output is not asserted on.
  joined.apply(ParDo.of(new BeamSqlOutputToConsoleFn("helloworld")));

  Schema expectedSchema =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("sum_site_id", Schema.FieldType.INT32)
          .addNullableField("buyer", Schema.FieldType.STRING)
          .build();

  // Rows are compared through their string form.
  PAssert.that(joined.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(expectedSchema)
              .addRows(
                  1, 3, "james",
                  2, 5, "bond",
                  3, 3, null)
              .getStringRows());
  pipeline.run();
}
/**
 * Right outer join of {@code ORDER_DETAILS1} with {@code ORDER_DETAILS2} on two conditions.
 * Columns from the left table are nullable in the expected schema, and two of the three expected
 * rows have nulls in them.
 */
@Test
public void testRightOuterJoin() throws Exception {
  String query =
      "SELECT *  "
          + "FROM ORDER_DETAILS1 o1"
          + " RIGHT OUTER JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> joined = compilePipeline(query, pipeline);

  Schema expectedSchema =
      Schema.builder()
          .addNullableField("order_id", Schema.FieldType.INT32)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addNullableField("price", Schema.FieldType.INT32)
          .addField("order_id0", Schema.FieldType.INT32)
          .addField("site_id0", Schema.FieldType.INT32)
          .addField("price0", Schema.FieldType.INT32)
          .build();

  PAssert.that(joined)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(expectedSchema)
              .addRows(
                  2, 3, 3, 1, 2, 3,
                  null, null, null, 2, 3, 3,
                  null, null, null, 3, 4, 5)
              .getRows());
  pipeline.run();
}
@Test public void testLeftOuterJoin() throws Exception { String sql = "SELECT * FROM " + "(select site_id as order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS " + " GROUP BY site_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 " + " LEFT OUTER JOIN " + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS " + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o2 " + " on " + " o1.order_id=o2.order_id"; // 1, 1 | 1, 3 // 2, 2 | NULL, NULL // ---- | ----- // 2, 2 | 2, 5 // 3, 3 | NULL, NULL PCollection<Row> rows = compilePipeline(sql, pipeline); PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn()))) .containsInAnyOrder( TestUtils.RowsBuilder.of( Schema.builder() .addField("order_id1", Schema.FieldType.INT32) .addField("sum_site_id", Schema.FieldType.INT32) .addNullableField("order_id", Schema.FieldType.INT32) .addNullableField("sum_site_id0", Schema.FieldType.INT32) .build()) .addRows(1, 1, 1, 3, 2, 2, null, null, 2, 2, 2, 5, 3, 3, null, null) .getStringRows()); pipeline.run(); }
/**
 * Right outer join with {@code ORDER_DETAILS1} on the left and a windowed aggregation over
 * {@code ORDER_DETAILS} on the right; the aggregated row without a match is expected to carry a
 * null {@code buyer}. Rows are compared through their string form.
 */
@Test
public void testRightOuterJoin() throws Exception {
  String query =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + " ORDER_DETAILS1 o2 "
          + " RIGHT OUTER JOIN "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + "          GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> joined = compilePipeline(query, pipeline);

  Schema expectedSchema =
      Schema.builder()
          .addField("order_id", Schema.FieldType.INT32)
          .addField("sum_site_id", Schema.FieldType.INT32)
          .addNullableField("buyer", Schema.FieldType.STRING)
          .build();

  PAssert.that(joined.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(expectedSchema)
              .addRows(
                  1, 3, "james",
                  2, 5, "bond",
                  3, 3, null)
              .getStringRows());
  pipeline.run();
}
/**
 * Selects an int column and a map column (renamed to {@code f_map}) from a two-element input and
 * checks that both rows come back with their maps intact.
 */
@Test
public void testSelectAll() {
  PCollection<Row> source = pCollectionOf2Elements();

  Schema.FieldType stringToInt =
      Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32);
  Schema expectedSchema =
      Schema.builder().addInt32Field("f_int").addNullableField("f_map", stringToInt).build();

  PCollection<Row> queried =
      source.apply(
          "sqlQuery",
          SqlTransform.query("SELECT f_int, f_intStringMap as f_map FROM PCOLLECTION"));

  Row firstExpected =
      Row.withSchema(expectedSchema)
          .addValues(1, ImmutableMap.of("key11", 11, "key22", 22))
          .build();
  Row secondExpected =
      Row.withSchema(expectedSchema)
          .addValues(2, ImmutableMap.of("key33", 33, "key44", 44, "key55", 55))
          .build();

  PAssert.that(queried).containsInAnyOrder(firstExpected, secondExpected);

  pipeline.run();
}
/**
 * Casts the string {@code "20181018"} to a SQL {@code DATE} and expects the resulting row to
 * hold midnight of 2018-10-18 in a nullable {@code f_date} field.
 */
@Test
public void testCastToDate2() {
  Row inputRow = Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build();
  PCollection<Row> source =
      pipeline.apply(
          Create.of(inputRow)
              .withSchema(
                  INPUT_ROW_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));

  Schema expectedSchema =
      Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build();

  PCollection<Row> queried =
      source.apply(
          SqlTransform.query(
              "SELECT f_int, \n"
                  + "   CAST( \n"
                  + "     f_string AS DATE) \n"
                  + "FROM PCOLLECTION"));

  // NOTE(review): this DateTime is built without an explicit time zone — confirm the expected
  // value does not depend on the JVM's default zone.
  Row expectedRow =
      Row.withSchema(expectedSchema).addValues(1, new DateTime(2018, 10, 18, 0, 0)).build();

  PAssert.that(queried).containsInAnyOrder(expectedRow);

  pipeline.run();
}
/**
 * Full outer join of {@code ORDER_DETAILS1} with {@code ORDER_DETAILS2} on two conditions. All
 * columns are nullable in the expected schema, and four of the five expected rows have nulls on
 * one side.
 */
@Test
public void testFullOuterJoin() throws Exception {
  String query =
      "SELECT *  "
          + "FROM ORDER_DETAILS1 o1"
          + " FULL OUTER JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> joined = compilePipeline(query, pipeline);

  Schema expectedSchema =
      Schema.builder()
          .addNullableField("order_id", Schema.FieldType.INT32)
          .addNullableField("site_id", Schema.FieldType.INT32)
          .addNullableField("price", Schema.FieldType.INT32)
          .addNullableField("order_id0", Schema.FieldType.INT32)
          .addNullableField("site_id0", Schema.FieldType.INT32)
          .addNullableField("price0", Schema.FieldType.INT32)
          .build();

  PAssert.that(joined)
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(expectedSchema)
              .addRows(
                  2, 3, 3, 1, 2, 3,
                  1, 2, 3, null, null, null,
                  3, 4, 5, null, null, null,
                  null, null, null, 2, 3, 3,
                  null, null, null, 3, 4, 5)
              .getRows());
  pipeline.run();
}
@Test public void testCountGroupByNullable() { String sql = "SELECT COUNT(f_int1) as c, f_int2 FROM PCOLLECTION GROUP BY f_int2"; PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql)); Schema schema = out.getSchema(); PAssert.that(out) .containsInAnyOrder( Row.withSchema(schema).addValues(0L, null).build(), Row.withSchema(schema).addValues(1L, 1).build(), Row.withSchema(schema).addValues(1L, 5).build(), Row.withSchema(schema).addValues(1L, 2).build()); assertEquals( Schema.builder() // COUNT() is never nullable, and calcite knows it .addInt64Field("c") .addNullableField("f_int2", Schema.FieldType.INT32) .build(), schema); pipeline.run(); }
/**
 * Infers a schema from a Java class.
 *
 * <p>The caller supplies the strategy used to list a class's fields, for example by
 * introspecting public member variables, public getter methods, or special annotations on the
 * class. Each listed entry becomes one schema field, nullable when the entry reports itself as
 * nullable.
 *
 * @param clazz the class to infer a schema for
 * @param getTypesForClass function extracting the list of field type descriptions from a class;
 *     it is also passed on to {@code fieldFromType}
 * @return the schema built from the extracted field types
 */
public static Schema schemaFromClass(
    Class<?> clazz, Function<Class, List<TypeInformation>> getTypesForClass) {
  Schema.Builder inferred = Schema.builder();
  List<TypeInformation> members = getTypesForClass.apply(clazz);
  for (TypeInformation member : members) {
    Schema.FieldType memberType = fieldFromType(member.getType(), getTypesForClass);
    if (member.isNullable()) {
      inferred.addNullableField(member.getName(), memberType);
    } else {
      inferred.addField(member.getName(), memberType);
    }
  }
  return inferred.build();
}
/**
 * Converts a Beam schema of nullable primitive fields to a Calcite row type and back, and
 * expects to get an equal schema.
 */
@Test
public void testRoundTripBeamNullableSchema() {
  final Schema original =
      Schema.builder()
          .addNullableField("f1", Schema.FieldType.BYTE)
          .addNullableField("f2", Schema.FieldType.INT16)
          .addNullableField("f3", Schema.FieldType.INT32)
          .addNullableField("f4", Schema.FieldType.INT64)
          .addNullableField("f5", Schema.FieldType.FLOAT)
          .addNullableField("f6", Schema.FieldType.DOUBLE)
          .addNullableField("f7", Schema.FieldType.DECIMAL)
          .addNullableField("f8", Schema.FieldType.BOOLEAN)
          .addNullableField("f9", Schema.FieldType.BYTES)
          .addNullableField("f10", Schema.FieldType.STRING)
          .build();

  final Schema roundTripped =
      CalciteUtils.toSchema(CalciteUtils.toCalciteRowType(original, dataTypeFactory));

  assertEquals(original, roundTripped);
}
}
/**
 * Builds the shared bounded input: seven rows over two nullable int columns ({@code f_int1},
 * {@code f_int2}) and one non-nullable int column ({@code f_int3}), several of them holding
 * nulls.
 */
@Before
public void setUp() {
  Schema inputSchema =
      Schema.builder()
          .addNullableField("f_int1", Schema.FieldType.INT32)
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .addInt32Field("f_int3")
          .build();

  // One addRows call per row: (f_int1, f_int2, f_int3).
  List<Row> inputRows =
      TestUtils.RowsBuilder.of(inputSchema)
          .addRows(1, 5, 1)
          .addRows(null, 1, 1)
          .addRows(2, 1, 1)
          .addRows(null, 1, 1)
          .addRows(null, null, 1)
          .addRows(null, null, 1)
          .addRows(3, 2, 1)
          .getRows();

  boundedInput =
      PBegin.in(pipeline)
          .apply(Create.of(inputRows).withSchema(inputSchema, identity(), identity()));
}
/**
 * Single-field schemas that differ in field type, field name, or nullability must be neither
 * equal nor equivalent.
 */
@Test
public void testPrimitiveNotEquivalent() {
  // Same name, different type.
  Schema int64Foo = Schema.builder().addInt64Field("foo").build();
  Schema stringFoo = Schema.builder().addStringField("foo").build();
  assertNotEquals(int64Foo, stringFoo);
  assertFalse(int64Foo.equivalent(stringFoo));

  // Same type, different name.
  Schema namedFoo = Schema.builder().addInt64Field("foo").build();
  Schema namedBar = Schema.builder().addInt64Field("bar").build();
  assertNotEquals(namedFoo, namedBar);
  assertFalse(namedFoo.equivalent(namedBar));

  // Same name and type, different nullability.
  Schema requiredFoo = Schema.builder().addInt64Field("foo").build();
  Schema nullableFoo = Schema.builder().addNullableField("foo", FieldType.INT64).build();
  assertNotEquals(requiredFoo, nullableFoo);
  assertFalse(requiredFoo.equivalent(nullableFoo));
}
/**
 * A JSON {@code null} for a nullable string field must deserialize into a row holding a null
 * value for that field.
 */
@Test
public void testParsesNulls() throws Exception {
  Schema rowSchema =
      Schema.builder()
          .addByteField("f_byte")
          .addNullableField("f_string", FieldType.STRING)
          .build();

  String json = "{\n" + "\"f_byte\" : 12,\n" + "\"f_string\" : null\n" + "}";

  RowJsonDeserializer rowDeserializer = RowJsonDeserializer.forSchema(rowSchema);
  Row actual = newObjectMapperWith(rowDeserializer).readValue(json, Row.class);

  Row expected = Row.withSchema(rowSchema).addValues((byte) 12, null).build();
  assertEquals(expected, actual);
}
/**
 * Looks up the key {@code 'key11'} in the map column of a two-element input; one expected row
 * holds 11 and the other holds null.
 */
@Test
public void testAccessMapElement() {
  PCollection<Row> source = pCollectionOf2Elements();

  Schema expectedSchema =
      Schema.builder().addNullableField("f_mapElem", Schema.FieldType.INT32).build();

  PCollection<Row> queried =
      source.apply(
          "sqlQuery", SqlTransform.query("SELECT f_intStringMap['key11'] FROM PCOLLECTION"));

  Row withValue = Row.withSchema(expectedSchema).addValues(11).build();
  Row withNull = Row.withSchema(expectedSchema).addValue(null).build();

  PAssert.that(queried).containsInAnyOrder(withValue, withNull);

  pipeline.run();
}
/**
 * Creates a schema with a single nullable field named {@code "fieldName"}.
 *
 * @param fieldType type of the single field
 * @return the one-field schema
 */
private Schema newSimpleSchemaWith(FieldType fieldType) {
  Schema.Builder single = Schema.builder();
  single.addNullableField("fieldName", fieldType);
  return single.build();
}
/**
 * Creates a mock table in the store, builds its {@code BeamSqlTable}, and checks that the result
 * is non-null and exposes a schema with nullable {@code id} (INT32) and {@code name} (STRING).
 */
@Test
public void testBuildBeamSqlTable() throws Exception {
  Table mocked = mockTable("hello");
  store.createTable(mocked);

  BeamSqlTable built = store.buildBeamSqlTable(mocked);
  assertNotNull(built);

  Schema expectedSchema =
      Schema.builder()
          .addNullableField("id", Schema.FieldType.INT32)
          .addNullableField("name", Schema.FieldType.STRING)
          .build();
  assertEquals(expectedSchema, built.getSchema());
}