/**
 * Returns a descriptor that accesses the specified fields.
 *
 * <p>By default, if a field is a nested row (or a container containing a row), all fields of that
 * row are accessed. For finer-grained access to nested rows, call withNestedField and pass in a
 * recursive {@link FieldAccessDescriptor}.
 *
 * @param names names of the fields to access
 * @return the descriptor produced by the collection-based {@code withFieldNames} overload
 */
public static FieldAccessDescriptor withFieldNames(String... names) {
  // Wrap the varargs array as a list and hand off to the collection-based overload.
  FieldAccessDescriptor descriptor = withFieldNames(Arrays.asList(names));
  return descriptor;
}
/**
 * Chooses, by name, the fields to use for the PCollection identified by {@code tag}.
 *
 * <p>Each PCollection in the input must have fields specified for the join key.
 *
 * @param tag tag of the input PCollection the field names apply to
 * @param fieldNames names of the fields to select for that input
 * @return the result of {@code byFieldAccessDescriptor} for the tag and the selected fields
 */
public static Inner byFieldNames(TupleTag<?> tag, String... fieldNames) {
  FieldAccessDescriptor keyFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return byFieldAccessDescriptor(tag, keyFields);
}
/**
 * Selects the given field names for the input identified by {@code tag}.
 *
 * <p>Convenience overload: builds a {@link FieldAccessDescriptor} from the names and delegates to
 * {@code byFieldAccessDescriptor(tag, descriptor)}.
 *
 * @param tag tag of the input the field names apply to
 * @param fieldNames names of the fields to select for that input
 * @return whatever the descriptor-based overload returns for this tag
 */
public Inner byFieldNames(TupleTag<?> tag, String... fieldNames) {
  FieldAccessDescriptor selectedFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return byFieldAccessDescriptor(tag, selectedFields);
}
/**
 * Adds an aggregation over several input fields, identified by name.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldNames names of the input fields fed to the combine function
 * @param fn combine function applied to the selected fields
 * @param outputField field describing where the aggregated value is placed in the output
 * @return the result of the {@link FieldAccessDescriptor}-based {@code aggregateFields} overload
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsByFields<InputT> aggregateFields(
    List<String> inputFieldNames,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  // Translate the names into a descriptor, then reuse the descriptor-based overload.
  FieldAccessDescriptor inputFields = FieldAccessDescriptor.withFieldNames(inputFieldNames);
  return aggregateFields(inputFields, fn, outputField);
}
/**
 * Creates a transform that groups the elements of the input {@link PCollection} by the listed
 * fields. The output of this transform will be a {@link KV} keyed by a {@link Row} containing
 * the specified extracted fields. Further builder methods on the returned transform control how
 * the grouping is done.
 *
 * @param fieldNames names of the fields to group by
 * @return a new {@code ByFields} built from a descriptor of the named fields
 */
public static <T> ByFields<T> byFieldNames(String... fieldNames) {
  FieldAccessDescriptor groupingFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return new ByFields<>(groupingFields);
}
/**
 * Same as {@link #byFieldNames(String...)}, but takes the field names as an {@link Iterable}.
 *
 * @param fieldNames names of the fields to group by
 * @return a new {@code ByFields} built from a descriptor of the named fields
 */
public static <T> ByFields<T> byFieldNames(Iterable<String> fieldNames) {
  FieldAccessDescriptor groupingFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return new ByFields<>(groupingFields);
}
/**
 * Joins on the given field names.
 *
 * <p>The same field names are used in all input PCollections.
 *
 * @param fieldNames names of the fields to join by
 * @return the result of {@code byFieldAccessDescriptor} for a descriptor of the named fields
 */
public static Inner byFieldNames(String... fieldNames) {
  FieldAccessDescriptor joinFields = FieldAccessDescriptor.withFieldNames(fieldNames);
  return byFieldAccessDescriptor(joinFields);
}
/**
 * Adds an aggregation over several input fields, identified by name.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldNames names of the input fields fed to the combine function
 * @param fn combine function applied to the selected fields
 * @param outputField field describing where the aggregated value is placed in the output
 * @return the result of the {@link FieldAccessDescriptor}-based {@code aggregateFields} overload
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsGlobally<InputT> aggregateFields(
    List<String> inputFieldNames,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  // Translate the names into a descriptor, then reuse the descriptor-based overload.
  FieldAccessDescriptor inputFields = FieldAccessDescriptor.withFieldNames(inputFieldNames);
  return aggregateFields(inputFields, fn, outputField);
}
/**
 * Registers a multi-field aggregation whose inputs are given as a list of field names.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldNames names of the input fields passed to the combine function
 * @param fn combine function to run over the selected fields
 * @param outputField output field that receives the aggregated value
 * @return the result of the {@link FieldAccessDescriptor}-based {@code aggregateFields} overload
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsGlobally<InputT> aggregateFields(
    List<String> inputFieldNames,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  FieldAccessDescriptor fieldsToCombine =
      FieldAccessDescriptor.withFieldNames(inputFieldNames);
  CombineFieldsGlobally<InputT> combined = aggregateFields(fieldsToCombine, fn, outputField);
  return combined;
}
/**
 * Select a set of top-level field names from the row.
 *
 * @param names names of the fields to select
 * @return a {@code Select} built from a {@link FieldAccessDescriptor} of the given names
 */
public static <T> Select<T> fieldNames(String... names) {
  // Use the diamond rather than the raw `new Select(...)` so the result is a properly
  // parameterized Select<T> and no unchecked conversion is needed.
  return new Select<>(FieldAccessDescriptor.withFieldNames(names));
}
/**
 * Registers a multi-field aggregation whose inputs are given as a list of field names.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldNames names of the input fields passed to the combine function
 * @param fn combine function to run over the selected fields
 * @param outputField output field that receives the aggregated value
 * @return the result of the {@link FieldAccessDescriptor}-based {@code aggregateFields} overload
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsByFields<InputT> aggregateFields(
    List<String> inputFieldNames,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  FieldAccessDescriptor fieldsToCombine =
      FieldAccessDescriptor.withFieldNames(inputFieldNames);
  CombineFieldsByFields<InputT> combined = aggregateFields(fieldsToCombine, fn, outputField);
  return combined;
}
/**
 * Adds an aggregation over several input fields, identified by name, naming only the output
 * field.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * <p>Field types in the output schema will be inferred from the provided combine function.
 * Sometimes the field type cannot be inferred due to Java's type erasure. In that case, use the
 * overload that allows setting the output field type explicitly.
 *
 * @param inputFieldNames names of the input fields fed to the combine function
 * @param fn combine function applied to the selected fields
 * @param outputFieldName name of the output field that receives the aggregated value
 * @return the result of the {@link FieldAccessDescriptor}-based {@code aggregateFields} overload
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsByFields<InputT> aggregateFields(
    List<String> inputFieldNames,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    String outputFieldName) {
  // Translate the names into a descriptor, then reuse the descriptor-based overload.
  FieldAccessDescriptor inputFields = FieldAccessDescriptor.withFieldNames(inputFieldNames);
  return aggregateFields(inputFields, fn, outputFieldName);
}
/**
 * Adds an aggregation over a single input field, identified by name.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldName name of the input field fed to the combine function
 * @param fn combine function applied to that field
 * @param outputField field describing where the aggregated value is placed in the output
 * @return a new {@code CombineFieldsGlobally} wrapping the result of registering this
 *     aggregation on {@code schemaAggregateFn}
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsGlobally<InputT> aggregateField(
    String inputFieldName,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  // A single name still goes through the varargs factory to obtain a descriptor.
  FieldAccessDescriptor inputField = FieldAccessDescriptor.withFieldNames(inputFieldName);
  return new CombineFieldsGlobally<>(
      schemaAggregateFn.aggregateFields(inputField, fn, outputField));
}
/**
 * Adds an aggregation over a single input field, identified by name.
 *
 * <p>The union of all calls to aggregateField and aggregateFields will determine the output
 * schema.
 *
 * @param inputFieldName name of the input field fed to the combine function
 * @param fn combine function applied to that field
 * @param outputField field describing where the aggregated value is placed in the output
 * @return a new {@code CombineFieldsByFields} built from this object and a freshly created
 *     {@code SchemaAggregateFn} holding this aggregation
 */
public <CombineInputT, AccumT, CombineOutputT> CombineFieldsByFields<InputT> aggregateField(
    String inputFieldName,
    CombineFn<CombineInputT, AccumT, CombineOutputT> fn,
    Field outputField) {
  // A single name still goes through the varargs factory to obtain a descriptor.
  FieldAccessDescriptor inputField = FieldAccessDescriptor.withFieldNames(inputFieldName);
  return new CombineFieldsByFields<>(
      this,
      SchemaAggregateFn.<InputT>create().aggregateFields(inputField, fn, outputField));
}
/**
 * Resolves a descriptor with a nested descriptor on "field1" against NESTED_ARRAY_SCHEMA and
 * checks the resolved structure.
 */
@Test
public void testArrayNestedField() {
  FieldAccessDescriptor innerAccess = FieldAccessDescriptor.withFieldNames("field2");
  FieldAccessDescriptor outerAccess =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", innerAccess);

  FieldAccessDescriptor topLevel = outerAccess.resolve(NESTED_ARRAY_SCHEMA);
  // The top level is expected to carry no directly accessed ids, only one nested entry.
  assertTrue(topLevel.fieldIdsAccessed().isEmpty());
  assertEquals(1, topLevel.nestedFields().size());

  // The nested descriptor is looked up under key 1 and should access only id 2.
  FieldAccessDescriptor nested = topLevel.nestedFields().get(1);
  assertEquals(Sets.newHashSet(2), nested.fieldIdsAccessed());
}
/**
 * Resolves a descriptor with a nested descriptor on "field1" against NESTED_MAP_SCHEMA and
 * checks the resolved structure.
 */
@Test
public void testMapNestedField() {
  FieldAccessDescriptor innerAccess = FieldAccessDescriptor.withFieldNames("field2");
  FieldAccessDescriptor outerAccess =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", innerAccess);

  FieldAccessDescriptor topLevel = outerAccess.resolve(NESTED_MAP_SCHEMA);
  // The top level is expected to carry no directly accessed ids, only one nested entry.
  assertTrue(topLevel.fieldIdsAccessed().isEmpty());
  assertEquals(1, topLevel.nestedFields().size());

  // The nested descriptor is looked up under key 1 and should access only id 2.
  FieldAccessDescriptor nested = topLevel.nestedFields().get(1);
  assertEquals(Sets.newHashSet(2), nested.fieldIdsAccessed());
}
// NOTE(review): the brace below closes an enclosing scope whose opening is outside this view.
}
/**
 * Resolves a three-level chain of nested descriptors against NESTED_SCHEMA2 and checks each
 * resolved level in turn.
 */
@Test
public void testPartialAccessNestedField() {
  // Build the chain inside-out: each outer descriptor nests the previous one under "field1".
  FieldAccessDescriptor innermost = FieldAccessDescriptor.withFieldNames("field2");
  FieldAccessDescriptor middle =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", innermost);
  FieldAccessDescriptor outermost =
      FieldAccessDescriptor.withFieldNames("field1").withNestedField("field1", middle);

  FieldAccessDescriptor firstLevel = outermost.resolve(NESTED_SCHEMA2);
  assertTrue(firstLevel.fieldIdsAccessed().isEmpty());
  assertEquals(1, firstLevel.nestedFields().size());

  FieldAccessDescriptor secondLevel = firstLevel.nestedFields().get(1);
  assertTrue(secondLevel.fieldIdsAccessed().isEmpty());
  assertEquals(1, secondLevel.nestedFields().size());

  // Only the deepest level is expected to report a directly accessed id.
  FieldAccessDescriptor thirdLevel = secondLevel.nestedFields().get(1);
  assertEquals(Sets.newHashSet(2), thirdLevel.fieldIdsAccessed());
}
/** Checks that "field0" and "field2" resolve to ids 0 and 2 against SIMPLE_SCHEMA. */
@Test
public void testFieldNames() {
  FieldAccessDescriptor byName = FieldAccessDescriptor.withFieldNames("field0", "field2");
  FieldAccessDescriptor resolved = byName.resolve(SIMPLE_SCHEMA);
  assertEquals(Sets.newHashSet(0, 2), resolved.fieldIdsAccessed());
}
/**
 * Attaches an all-fields nested descriptor to "field1" by name, resolves against NESTED_SCHEMA2,
 * and checks that the nested descriptor found under key 1 reports all fields.
 */
@Test
public void testNestedFieldByName() {
  FieldAccessDescriptor unresolved =
      FieldAccessDescriptor.withFieldNames("field1")
          .withNestedField("field1", FieldAccessDescriptor.withAllFields());

  FieldAccessDescriptor resolved = unresolved.resolve(NESTED_SCHEMA2);
  assertTrue(resolved.fieldIdsAccessed().isEmpty());
  assertEquals(1, resolved.nestedFields().size());

  FieldAccessDescriptor nested = resolved.nestedFields().get(1);
  assertTrue(nested.allFields());
}
/**
 * Attaches an all-fields nested descriptor by field id 1, resolves against NESTED_SCHEMA2, and
 * checks that the nested descriptor found under key 1 reports all fields.
 */
@Test
public void testNestedFieldById() {
  FieldAccessDescriptor unresolved =
      FieldAccessDescriptor.withFieldNames("field1")
          .withNestedField(1, FieldAccessDescriptor.withAllFields());

  FieldAccessDescriptor resolved = unresolved.resolve(NESTED_SCHEMA2);
  assertTrue(resolved.fieldIdsAccessed().isEmpty());
  assertEquals(1, resolved.nestedFields().size());

  FieldAccessDescriptor nested = resolved.nestedFields().get(1);
  assertTrue(nested.allFields());
}