/** Checks that a nested field registered by numeric id resolves to an all-fields sub-descriptor. */
@Test
public void testNestedFieldById() {
  FieldAccessDescriptor unresolved =
      FieldAccessDescriptor.withFieldNames("field1")
          .withNestedField(1, FieldAccessDescriptor.withAllFields());
  FieldAccessDescriptor resolved = unresolved.resolve(NESTED_SCHEMA2);

  // The explicit nested access on field 1 takes the place of the plain access to "field1".
  assertTrue(resolved.fieldIdsAccessed().isEmpty());
  assertEquals(1, resolved.nestedFields().size());

  FieldAccessDescriptor nestedResolved = resolved.nestedFields().get(1);
  assertTrue(nestedResolved.allFields());
}
/**
 * Return a descriptor that accesses the specified fields.
 *
 * <p>By default, if the field is a nested row (or a container containing a row), all fields of
 * said rows are accessed. For finer-grained access to nested rows, call withNestedField and pass
 * in a recursive {@link FieldAccessDescriptor}.
 *
 * <p>This varargs form wraps the ids with {@code Arrays.asList} and delegates to the
 * collection-accepting {@code withFieldIds} overload.
 *
 * @param ids the ids of the fields to access
 * @return the descriptor produced by the delegated overload
 */
public static FieldAccessDescriptor withFieldIds(Integer... ids) {
  return withFieldIds(Arrays.asList(ids));
}
public FieldAccessDescriptor resolve(Schema schema) { Set<Integer> resolvedFieldIdsAccessed = resolveFieldIdsAccessed(schema); Map<Integer, FieldAccessDescriptor> resolvedNestedFieldsAccessed = resolveNestedFieldsAccessed(schema); checkState( !getAllFields() || resolvedNestedFieldsAccessed.isEmpty(), "nested fields cannot be set if allFields is also set"); // If a recursive access is set for any nested fields, remove those fields from // fieldIdsAccessed. resolvedFieldIdsAccessed.removeAll(resolvedNestedFieldsAccessed.keySet()); return builder() .setAllFields(getAllFields()) .setFieldIdsAccessed(resolvedFieldIdsAccessed) .setNestedFieldsAccessedById(resolvedNestedFieldsAccessed) .build(); }
/**
 * Resolves a nested descriptor against the schema associated with the given field.
 *
 * <p>The schema is obtained from {@code getFieldSchema(field)}.
 * NOTE(review): presumably that helper also unwraps container types (array/map of rows) to reach
 * the row schema; it is defined elsewhere, so confirm there.
 *
 * @param field the field whose nested contents are being accessed
 * @param subDescriptor the descriptor describing the access inside {@code field}
 * @return {@code subDescriptor} resolved against the field's schema
 */
private FieldAccessDescriptor resolvedNestedFieldsHelper(
    Field field, FieldAccessDescriptor subDescriptor) {
  return subDescriptor.resolve(getFieldSchema(field));
}
/** Checks that a nested access on "field1" of NESTED_ARRAY_SCHEMA resolves to field id 2. */
@Test
public void testArrayNestedField() {
  FieldAccessDescriptor inner = FieldAccessDescriptor.withFieldNames("field2");
  FieldAccessDescriptor outer =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", inner);

  FieldAccessDescriptor outerResolved = outer.resolve(NESTED_ARRAY_SCHEMA);
  assertTrue(outerResolved.fieldIdsAccessed().isEmpty());
  assertEquals(1, outerResolved.nestedFields().size());

  FieldAccessDescriptor innerResolved = outerResolved.nestedFields().get(1);
  assertEquals(Sets.newHashSet(2), innerResolved.fieldIdsAccessed());
}
/**
 * Return a descriptor that accesses the specified fields.
 *
 * <p>By default, if the field is a nested row (or a container containing a row), all fields of
 * said rows are accessed. For finer-grained access to nested rows, call withNestedField and pass
 * in a recursive {@link FieldAccessDescriptor}.
 *
 * <p>This varargs form wraps the names with {@code Arrays.asList} and delegates to the
 * collection-accepting {@code withFieldNames} overload.
 *
 * @param names the names of the fields to access
 * @return the descriptor produced by the delegated overload
 */
public static FieldAccessDescriptor withFieldNames(String... names) {
  return withFieldNames(Arrays.asList(names));
}
fieldAccessDescriptor = FieldAccessDescriptor.withAllFields(); } else { fieldAccessDescriptor.resolve(((SchemaCoder<?>) inputCoder).getSchema());
/** Selects two fields of the nested "field2" row and converts the result to a partial POJO. */
@Test
@Category(NeedsRunner.class)
public void testSelectNestedPartial() {
  FieldAccessDescriptor nestedSelection =
      FieldAccessDescriptor.withFieldNames("field1", "field3");
  FieldAccessDescriptor selection =
      FieldAccessDescriptor.create().withNestedField("field2", nestedSelection);

  PCollection<POJO2NestedPartial> pojos =
      pipeline
          .apply(Create.of(new POJO2()))
          .apply(Select.fieldAccess(selection))
          .apply(Convert.to(POJO2NestedPartial.class));

  PAssert.that(pojos).containsInAnyOrder(new POJO2NestedPartial());
  pipeline.run();
}
}
/** Checks that an all-fields descriptor still reports all fields after resolution. */
@Test
public void testAllFields() {
  FieldAccessDescriptor everything = FieldAccessDescriptor.withAllFields();
  FieldAccessDescriptor resolved = everything.resolve(SIMPLE_SCHEMA);
  assertTrue(resolved.allFields());
}
/** Checks that field names resolve to the expected field ids in SIMPLE_SCHEMA. */
@Test
public void testFieldNames() {
  FieldAccessDescriptor byName = FieldAccessDescriptor.withFieldNames("field0", "field2");
  FieldAccessDescriptor resolved = byName.resolve(SIMPLE_SCHEMA);
  assertEquals(Sets.newHashSet(0, 2), resolved.fieldIdsAccessed());
}
/** Selects every field of the nested "field2" row and converts the result to a POJO. */
@Test
@Category(NeedsRunner.class)
public void testSelectNestedAll() {
  FieldAccessDescriptor selection =
      FieldAccessDescriptor.create()
          .withNestedField("field2", FieldAccessDescriptor.withAllFields());

  PCollection<POJO2NestedAll> pojos =
      pipeline
          .apply(Create.of(new POJO2()))
          .apply(Select.fieldAccess(selection))
          .apply(Convert.to(POJO2NestedAll.class));

  PAssert.that(pojos).containsInAnyOrder(new POJO2NestedAll());
  pipeline.run();
}
@Override public PCollection<Row> expand(PCollection<T> input) { Schema inputSchema = input.getSchema(); verifyCompatibility(inputSchema); return input .apply( ParDo.of( new DoFn<T, Row>() { // TODO: This should be the same as resolved so that Beam knows which fields // are being accessed. Currently Beam only supports wildcard descriptors. // Once BEAM-4457 is fixed, fix this. @FieldAccess("filterFields") final FieldAccessDescriptor fieldAccessDescriptor = FieldAccessDescriptor.withAllFields(); @ProcessElement public void process( @FieldAccess("filterFields") Row input, OutputReceiver<Row> r) { Row output = castRow(input, inputSchema, outputSchema()); r.output(output); } })) .setRowSchema(outputSchema()); }
/** Checks that explicitly supplied field ids survive resolution unchanged. */
@Test
public void testFieldIds() {
  FieldAccessDescriptor byId = FieldAccessDescriptor.withFieldIds(1, 3);
  FieldAccessDescriptor resolved = byId.resolve(SIMPLE_SCHEMA);
  assertEquals(Sets.newHashSet(1, 3), resolved.fieldIdsAccessed());
}
/**
 * Computes the schema produced by applying a resolved field-access descriptor to a schema.
 *
 * <p>If the descriptor selects all fields, the input schema is returned as-is. Otherwise the
 * output contains the directly accessed fields, followed by one row-typed field per nested
 * access whose schema is computed recursively. Nullability of a nested field is carried over.
 *
 * @param inputSchema the schema being selected from
 * @param fieldAccessDescriptor the descriptor to apply; it is queried by field id
 * @return the schema of the selected fields
 * @throws IllegalArgumentException if a nested access targets a field that has no row schema
 */
static Schema getOutputSchema(Schema inputSchema, FieldAccessDescriptor fieldAccessDescriptor) {
  if (fieldAccessDescriptor.allFields()) {
    return inputSchema;
  }

  Schema.Builder builder = new Schema.Builder();
  for (int fieldId : fieldAccessDescriptor.fieldIdsAccessed()) {
    builder.addField(inputSchema.getField(fieldId));
  }

  for (Map.Entry<Integer, FieldAccessDescriptor> nested :
      fieldAccessDescriptor.nestedFields().entrySet()) {
    Field field = inputSchema.getField(nested.getKey());
    FieldAccessDescriptor nestedDescriptor = nested.getValue();

    // A nested selection is only built for row-typed fields here. Without a row schema the
    // recursion would either dereference null or yield a row type that has no schema, so fail
    // with a message that names the offending field instead.
    Schema nestedInputSchema = field.getType().getRowSchema();
    if (nestedInputSchema == null) {
      throw new IllegalArgumentException(
          "Cannot select nested fields of field '"
              + field.getName()
              + "' because its type "
              + field.getType()
              + " has no row schema");
    }

    FieldType nestedType = FieldType.row(getOutputSchema(nestedInputSchema, nestedDescriptor));
    if (field.getNullable()) {
      builder.addNullableField(field.getName(), nestedType);
    } else {
      builder.addField(field.getName(), nestedType);
    }
  }
  return builder.build();
}
fieldAccessDescriptor.withOrderByFieldInsertionOrder().resolve(schema); Schema currentKeySchema = Select.getOutputSchema(schema, resolved); if (keySchema == null) {
/**
 * Creates an aggregation over a set of input fields.
 *
 * <p>When an input schema is supplied, the field descriptor is resolved against it and the
 * selected sub-schema and its unnested form are computed up front. Without an input schema the
 * descriptor is stored unresolved and no schema information is derived.
 *
 * @param fieldsToAggregate the fields fed into the combine function
 * @param outputField the field that receives the aggregation result
 * @param fn the combine function to apply
 * @param combineTag the tag identifying this aggregation
 * @param aggregationSchema the schema associated with the aggregation output
 * @param inputSchema the schema of the input rows, or null if it is not known yet
 */
FieldAggregation(
    FieldAccessDescriptor fieldsToAggregate,
    Field outputField,
    CombineFn<FieldT, AccumT, OutputT> fn,
    TupleTag<Object> combineTag,
    Schema aggregationSchema,
    @Nullable Schema inputSchema) {
  if (inputSchema != null) {
    this.fieldsToAggregate = fieldsToAggregate.resolve(inputSchema);
    this.inputSubSchema = Select.getOutputSchema(inputSchema, this.fieldsToAggregate);
    this.unnestedInputSubSchema = Unnest.getUnnestedSchema(inputSubSchema);
    // Unnesting is only needed when the selected sub-schema differs from its unnested form.
    // Comparing against the full input schema would report true for every strict subset of
    // fields, even when the selection contains no nested rows.
    this.needsUnnesting = !inputSubSchema.equals(unnestedInputSubSchema);
  } else {
    this.fieldsToAggregate = fieldsToAggregate;
    this.inputSubSchema = null;
    this.unnestedInputSubSchema = null;
    this.needsUnnesting = false;
  }
  this.outputField = outputField;
  this.fn = fn;
  this.combineTag = combineTag;
  this.aggregationSchema = aggregationSchema;
}
/**
 * Return an empty {@link FieldAccessDescriptor}.
 *
 * <p>The descriptor is built from a fresh {@code builder()} with no further settings applied.
 *
 * @return a descriptor built with the builder's defaults
 */
public static FieldAccessDescriptor create() {
  return builder().build();
}
/** Checks that a nested access on "field1" of NESTED_MAP_SCHEMA resolves to field id 2. */
@Test
public void testMapNestedField() {
  FieldAccessDescriptor inner = FieldAccessDescriptor.withFieldNames("field2");
  FieldAccessDescriptor outer =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", inner);

  FieldAccessDescriptor outerResolved = outer.resolve(NESTED_MAP_SCHEMA);
  assertTrue(outerResolved.fieldIdsAccessed().isEmpty());
  assertEquals(1, outerResolved.nestedFields().size());

  FieldAccessDescriptor innerResolved = outerResolved.nestedFields().get(1);
  assertEquals(Sets.newHashSet(2), innerResolved.fieldIdsAccessed());
}
}
/**
 * Selects the given field names as the join key for the PCollection identified by {@code tag}.
 *
 * <p>Each PCollection in the input must have fields specified for the join key.
 *
 * @param tag the tag of the PCollection the key fields belong to
 * @param fieldNames the names of the fields to use
 * @return the result of {@code byFieldAccessDescriptor} for a descriptor over those names
 */
public static Inner byFieldNames(TupleTag<?> tag, String... fieldNames) {
  FieldAccessDescriptor joinKeyFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return byFieldAccessDescriptor(tag, joinKeyFields);
}
@Override public PCollection<Row> expand(PCollection<T> input) { Schema inputSchema = input.getSchema(); FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(inputSchema); Schema outputSchema = getOutputSchema(inputSchema, resolved); PCollection<Row> selected = input .apply( ParDo.of( new DoFn<T, Row>() { // TODO: This should be the same as resolved so that Beam knows which fields // are being accessed. Currently Beam only supports wildcard descriptors. // Once BEAM-4457 is fixed, fix this. @FieldAccess("filterFields") final FieldAccessDescriptor fieldAccessDescriptor = FieldAccessDescriptor.withAllFields(); @ProcessElement public void process( @FieldAccess("filterFields") Row row, OutputReceiver<Row> r) { r.output(selectRow(row, resolved, inputSchema, outputSchema)); } })) .setRowSchema(outputSchema); return selected; }