/**
 * Unnests the given schema using the default naming policy ({@code CONCAT_FIELD_NAMES}).
 *
 * @param schema the schema to unnest
 * @return the unnested schema
 */
static Schema getUnnestedSchema(Schema schema) {
  // The recursive overload tracks the current field-name path in a mutable list; start it empty.
  return getUnnestedSchema(schema, Lists.<String>newArrayList(), CONCAT_FIELD_NAMES);
}
/** Returns the result of unnesting the given schema with the given naming policy. */
/** Unnests the incoming row via {@code unnestRow(row, outputSchema)} and emits the resulting row. */
@ProcessElement public void processElement(@Element Row row, OutputReceiver<Row> o) { o.output(unnestRow(row, outputSchema)); } }))
/** Checks that applying {@code Unnest} to rows of {@code SIMPLE_SCHEMA} yields the same rows. */
@Test
@Category(NeedsRunner.class)
public void testFlatSchema() {
  // Three rows: (0, "0"), (1, "1"), (2, "2").
  List<Row> flatRows =
      IntStream.rangeClosed(0, 2)
          .mapToObj(n -> Row.withSchema(SIMPLE_SCHEMA).addValues(n, Integer.toString(n)).build())
          .collect(Collectors.toList());

  PCollection<Row> input = pipeline.apply(Create.of(flatRows).withRowSchema(SIMPLE_SCHEMA));
  PCollection<Row> unnested = input.apply(Unnest.create());

  PAssert.that(unnested).containsInAnyOrder(flatRows);
  pipeline.run();
}
/**
 * Expects an {@link IllegalArgumentException} when rows of {@code NESTED_SCHEMA} are unnested with
 * the {@code Unnest.KEEP_NESTED_NAME} naming policy.
 */
@Test
@Category(NeedsRunner.class)
public void testClashingNamePolicy() {
  List<Row> leafRows =
      IntStream.rangeClosed(0, 2)
          .mapToObj(n -> Row.withSchema(SIMPLE_SCHEMA).addValues(n, Integer.toString(n)).build())
          .collect(Collectors.toList());

  thrown.expect(IllegalArgumentException.class);

  // Each nested row holds the same leaf row twice.
  List<Row> nestedRows =
      leafRows
          .stream()
          .map(leaf -> Row.withSchema(NESTED_SCHEMA).addValues(leaf, leaf).build())
          .collect(Collectors.toList());

  PCollection<Row> input = pipeline.apply(Create.of(nestedRows).withRowSchema(NESTED_SCHEMA));
  input.apply(Unnest.<Row>create().withFieldNameFunction(Unnest.KEEP_NESTED_NAME));
  pipeline.run();
}
}
/**
 * Unnests the given schema, naming each resulting field with the supplied policy.
 *
 * @param schema the schema to unnest
 * @param fn naming policy; it is applied to the list of field names leading to each non-composite
 *     field and returns that field's name in the unnested schema
 * @return the unnested schema
 */
static Schema getUnnestedSchema(Schema schema, SerializableFunction<List<String>, String> fn) {
  // The recursive overload tracks the current field-name path in a mutable list; start it empty.
  return getUnnestedSchema(schema, Lists.<String>newArrayList(), fn);
}
/**
 * Builds a row of {@code unnestedSchema} from the values of {@code input}.
 *
 * @param input the row to unnest
 * @param unnestedSchema the schema of the row to build
 * @return the row built from the values collected out of {@code input}
 */
static Row unnestRow(Row input, Schema unnestedSchema) {
  final Row.Builder flattened = Row.withSchema(unnestedSchema);
  // The private overload appends the values of input to the builder.
  unnestRow(input, flattened);
  return flattened.build();
}
/**
 * Checks that unnesting rows of {@code NESTED_SCHEMA2} with {@code Unnest.KEEP_NESTED_NAME}
 * produces {@code UNNESTED2_SCHEMA_ALTERNATE} and the expected values.
 */
@Test
@Category(NeedsRunner.class)
public void testAlternateNamePolicy() {
  List<Row> leafRows =
      IntStream.rangeClosed(0, 2)
          .mapToObj(n -> Row.withSchema(SIMPLE_SCHEMA).addValues(n, Integer.toString(n)).build())
          .collect(Collectors.toList());

  // Each input row wraps a single leaf row.
  List<Row> nestedRows =
      leafRows
          .stream()
          .map(leaf -> Row.withSchema(NESTED_SCHEMA2).addValues(leaf).build())
          .collect(Collectors.toList());

  PCollection<Row> input = pipeline.apply(Create.of(nestedRows).withRowSchema(NESTED_SCHEMA2));
  PCollection<Row> unnested =
      input.apply(Unnest.<Row>create().withFieldNameFunction(Unnest.KEEP_NESTED_NAME));
  assertEquals(UNNESTED2_SCHEMA_ALTERNATE, unnested.getSchema());

  List<Row> expected =
      leafRows
          .stream()
          .map(
              leaf ->
                  Row.withSchema(UNNESTED2_SCHEMA_ALTERNATE)
                      .addValues(leaf.getValue(0), leaf.getValue(1))
                      .build())
          .collect(Collectors.toList());

  PAssert.that(unnested).containsInAnyOrder(expected);
  pipeline.run();
}
/**
 * Creates a field aggregation.
 *
 * <p>When {@code inputSchema} is available, the field access descriptor is resolved against it,
 * the schema of the selected fields and its unnested form are computed, and {@code
 * needsUnnesting} records whether those two schemas differ. When {@code inputSchema} is {@code
 * null}, the descriptor is stored as given, both sub-schemas are {@code null} and no unnesting is
 * requested.
 *
 * @param fieldsToAggregate the fields to feed into the combine function
 * @param outputField the field that receives the aggregation result
 * @param fn the combine function to apply
 * @param combineTag the tuple tag associated with this aggregation
 * @param aggregationSchema the schema of the aggregation output
 * @param inputSchema the schema of the input, or {@code null} if it is not known yet
 */
FieldAggregation(
    FieldAccessDescriptor fieldsToAggregate,
    Field outputField,
    CombineFn<FieldT, AccumT, OutputT> fn,
    TupleTag<Object> combineTag,
    Schema aggregationSchema,
    @Nullable Schema inputSchema) {
  if (inputSchema != null) {
    this.fieldsToAggregate = fieldsToAggregate.resolve(inputSchema);
    this.inputSubSchema = Select.getOutputSchema(inputSchema, this.fieldsToAggregate);
    this.unnestedInputSubSchema = Unnest.getUnnestedSchema(inputSubSchema);
    // Unnesting is only required when the selected sub-schema differs from its unnested form.
    // Comparing against the full input schema would request a redundant unnest for almost every
    // selection, even when the selected fields are already flat.
    this.needsUnnesting = !inputSubSchema.equals(unnestedInputSubSchema);
  } else {
    this.fieldsToAggregate = fieldsToAggregate;
    this.inputSubSchema = null;
    this.unnestedInputSubSchema = null;
    this.needsUnnesting = false;
  }
  this.outputField = outputField;
  this.fn = fn;
  this.combineTag = combineTag;
  this.aggregationSchema = aggregationSchema;
}
/**
 * Converts the input to a row, selects the aggregated fields, unnests the selection when the
 * field aggregation requires it, and returns the value at index 0 of the resulting row.
 */
@Override
public OutputT apply(InputT input) {
  final Row inputRow = toRowFunction.apply(input);
  final Row projected =
      Select.selectRow(
          inputRow,
          fieldAggregation.fieldsToAggregate,
          inputRow.getSchema(),
          fieldAggregation.inputSubSchema);
  final Row result =
      fieldAggregation.needsUnnesting
          ? Unnest.unnestRow(projected, fieldAggregation.unnestedInputSubSchema)
          : projected;
  return result.getValue(0);
}
}
/**
 * Checks that unnesting rows of {@code NESTED_SCHEMA} with the default naming policy produces
 * {@code UNNESTED_SCHEMA} and the expected values.
 */
@Test
@Category(NeedsRunner.class)
public void testSimpleUnnesting() {
  List<Row> leafRows =
      IntStream.rangeClosed(0, 2)
          .mapToObj(n -> Row.withSchema(SIMPLE_SCHEMA).addValues(n, Integer.toString(n)).build())
          .collect(Collectors.toList());

  // Each input row holds the same leaf row twice.
  List<Row> nestedRows =
      leafRows
          .stream()
          .map(leaf -> Row.withSchema(NESTED_SCHEMA).addValues(leaf, leaf).build())
          .collect(Collectors.toList());

  PCollection<Row> input = pipeline.apply(Create.of(nestedRows).withRowSchema(NESTED_SCHEMA));
  PCollection<Row> unnested = input.apply(Unnest.create());
  assertEquals(UNNESTED_SCHEMA, unnested.getSchema());

  // The leaf values appear twice in the flattened row, once per nested field.
  List<Row> expected =
      leafRows
          .stream()
          .map(
              leaf ->
                  Row.withSchema(UNNESTED_SCHEMA)
                      .addValues(
                          leaf.getValue(0), leaf.getValue(1), leaf.getValue(0), leaf.getValue(1))
                      .build())
          .collect(Collectors.toList());

  PAssert.that(unnested).containsInAnyOrder(expected);
  pipeline.run();
}
/**
 * Computes the unnested schema from the input's schema and the configured field-name function,
 * unnests every element with a {@code ParDo}, and sets the unnested schema on the output.
 */
@Override
public PCollection<Row> expand(PCollection<T> input) {
  final Schema outputSchema = getUnnestedSchema(input.getSchema(), getFieldNameFunction());

  PCollection<Row> unnested =
      input.apply(
          ParDo.of(
              new DoFn<T, Row>() {
                @ProcessElement
                public void processElement(@Element Row row, OutputReceiver<Row> o) {
                  o.output(unnestRow(row, outputSchema));
                }
              }));
  return unnested.setRowSchema(outputSchema);
}
}
/**
 * Appends the values of {@code input} to {@code output} in field order, recursing into fields
 * whose type is composite so that only non-composite values are added.
 */
private static void unnestRow(Row input, Row.Builder output) {
  for (int idx = 0; idx < input.getSchema().getFieldCount(); idx++) {
    boolean composite =
        input.getSchema().getField(idx).getType().getTypeName().isCompositeType();
    if (!composite) {
      output.addValue(input.getValue(idx));
      continue;
    }
    // Composite field: add its values in place of the field itself.
    unnestRow(input.getRow(idx), output);
  }
}
/** A {@link PTransform} that unnests nested row. */
/**
 * Recursively unnests {@code schema}.
 *
 * <p>{@code nameComponents} holds the names of the fields on the path to the field currently
 * being visited. Each field's name is pushed before the field is processed and popped afterwards,
 * so the list is left as it was found when this method returns.
 *
 * @param schema the schema to unnest
 * @param nameComponents mutable list of the field names leading to {@code schema}
 * @param fn maps the name path of a non-composite field to its name in the unnested schema
 * @return a schema containing the renamed non-composite fields, in traversal order
 */
private static Schema getUnnestedSchema(
    Schema schema, List<String> nameComponents, SerializableFunction<List<String>, String> fn) {
  Schema.Builder flattened = Schema.builder();
  for (Field current : schema.getFields()) {
    nameComponents.add(current.getName());
    if (!current.getType().getTypeName().isCompositeType()) {
      // Non-composite field: keep everything but the name, which comes from the naming policy.
      String flatName = fn.apply(nameComponents);
      flattened.addField(current.toBuilder().setName(flatName).build());
    } else {
      // Composite field: splice in the already-unnested fields of its row schema.
      Schema inner = getUnnestedSchema(current.getType().getRowSchema(), nameComponents, fn);
      inner.getFields().forEach(flattened::addField);
    }
    nameComponents.remove(nameComponents.size() - 1);
  }
  return flattened.build();
}
/** Unnest a row. */