/**
 * Creates a transform that matches a single filepattern via {@link FileSystems#match} and emits
 * every matched resource (file or directory) as a {@link MatchResult.Metadata}.
 *
 * <p>Unless configured otherwise, the filepattern is matched exactly once and the output is a
 * bounded {@link PCollection}. To keep watching the filepattern for new matches, use {@link
 * MatchAll#continuously(Duration, TerminationCondition)}; the output is then an unbounded {@link
 * PCollection}.
 *
 * <p>A filepattern that matches no resources is handled according to {@link
 * EmptyMatchTreatment#DISALLOW} by default. Use {@link Match#withEmptyMatchTreatment} to change
 * this.
 *
 * <p>The emitted {@link MatchResult.Metadata} are deduplicated by filename. If the same filename
 * is observed several times with different metadata (for instance because the file is growing),
 * only the metadata from the first observation is emitted and later changes to that file are
 * ignored.
 *
 * @return a {@link Match} transform whose empty-match treatment is {@link
 *     EmptyMatchTreatment#DISALLOW}
 */
public static Match match() {
  MatchConfiguration defaultConfiguration =
      MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_FileIO_Match.Builder().setConfiguration(defaultConfiguration).build();
}
/**
 * A {@link PTransform} analogous to {@link #read}, except that it reads every file matched by
 * each filepattern of an input {@link PCollection}.
 *
 * <p>Works on both bounded and unbounded {@link PCollection PCollections}, which makes it
 * suitable for filepatterns arriving as a stream. Note, however, that each filepattern is
 * expanded only once, at the moment it is processed; it is not watched for files that appear
 * later. Similarly, each file is read once and is not watched for new entries.
 *
 * @return a {@link ReadAll} transform using {@link Compression#AUTO} and {@link
 *     EmptyMatchTreatment#ALLOW_IF_WILDCARD}
 */
public static ReadAll readAll() {
  MatchConfiguration wildcardTolerantMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_TextIO_ReadAll.Builder()
      .setMatchConfiguration(wildcardTolerantMatching)
      .setCompression(Compression.AUTO)
      .build();
}
/**
 * Variant of {@link #match} that matches every filepattern in a collection of filepatterns.
 *
 * <p>No deduplication happens across filepatterns: a resource matching several filepatterns is
 * produced once per matching filepattern.
 *
 * <p>A filepattern that matches no resources is handled according to {@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD} by default. Use {@link
 * MatchAll#withEmptyMatchTreatment} to change this.
 *
 * @return a {@link MatchAll} transform whose empty-match treatment is {@link
 *     EmptyMatchTreatment#ALLOW_IF_WILDCARD}
 */
public static MatchAll matchAll() {
  MatchConfiguration defaultConfiguration =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_FileIO_MatchAll.Builder().setConfiguration(defaultConfiguration).build();
}
/**
 * Reads Avro file(s) whose records conform to the given schema, producing {@link GenericRecord}
 * elements.
 *
 * <p>The returned transform starts with {@link EmptyMatchTreatment#DISALLOW} and with the
 * "matches many files" hint turned off.
 *
 * @param schema the Avro schema of the records to read
 * @return a {@link Read} transform producing {@link GenericRecord}s
 */
public static Read<GenericRecord> readGenericRecords(Schema schema) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Read.Builder<GenericRecord>()
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setMatchConfiguration(defaultMatching)
      .setHintMatchesManyFiles(false)
      .build();
}
/**
 * Reads records of the given type from an Avro file (or from multiple Avro files matching a
 * pattern).
 *
 * <p>The schema of the returned transform is pre-populated by reflecting over {@code
 * recordClass} ({@code ReflectData.get().getSchema(recordClass)}). The transform starts with
 * {@link EmptyMatchTreatment#DISALLOW} and with the "matches many files" hint turned off.
 *
 * @param recordClass the class of the records to read
 * @param <T> the record type
 * @return a {@link Read} transform producing elements of type {@code T}
 */
public static <T> Read<T> read(Class<T> recordClass) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Read.Builder<T>()
      .setRecordClass(recordClass)
      .setSchema(ReflectData.get().getSchema(recordClass))
      .setMatchConfiguration(defaultMatching)
      .setHintMatchesManyFiles(false)
      .build();
}
/**
 * Variant of {@link #readGenericRecords(Schema)} that reads every filepattern contained in the
 * input {@link PCollection}.
 *
 * <p>The returned transform starts with {@link EmptyMatchTreatment#ALLOW_IF_WILDCARD} and a
 * desired bundle size of 64 MiB.
 *
 * @param schema the Avro schema of the records to read
 * @return a {@link ReadAll} transform producing {@link GenericRecord}s
 */
public static ReadAll<GenericRecord> readAllGenericRecords(Schema schema) {
  // 64 MiB, expressed in bytes.
  final long desiredBundleSizeBytes = 64 * 1024 * 1024L;
  MatchConfiguration wildcardTolerantMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_AvroIO_ReadAll.Builder<GenericRecord>()
      .setMatchConfiguration(wildcardTolerantMatching)
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setDesiredBundleSizeBytes(desiredBundleSizeBytes)
      .build();
}
/** Like {@link #read}, but reads each filepattern in the input {@link PCollection}. */ public static <T> ReadAll<T> readAll(Class<T> recordClass) { return new AutoValue_AvroIO_ReadAll.Builder<T>() .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD)) .setRecordClass(recordClass) .setSchema(ReflectData.get().getSchema(recordClass)) // 64MB is a reasonable value that allows to amortize the cost of opening files, // but is not so large as to exhaust a typical runner's maximum amount of output per // ProcessElement call. .setDesiredBundleSizeBytes(64 * 1024 * 1024L) .build(); }
/**
 * Reads Avro file(s) whose records have an unspecified schema, converting each record to a
 * custom type with the supplied function.
 *
 * <p>The returned transform starts with {@link EmptyMatchTreatment#DISALLOW} and with the
 * "matches many files" hint turned off.
 *
 * @param parseFn function applied to each {@link GenericRecord} to produce the output element
 * @param <T> the type produced by {@code parseFn}
 * @return a {@link Parse} transform producing elements of type {@code T}
 */
public static <T> Parse<T> parseGenericRecords(SerializableFunction<GenericRecord, T> parseFn) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Parse.Builder<T>()
      .setParseFn(parseFn)
      .setMatchConfiguration(defaultMatching)
      .setHintMatchesManyFiles(false)
      .build();
}
/**
 * A {@link PTransform} that reads one or more text files and returns a bounded {@link
 * PCollection} with one element per line of the input files.
 *
 * <p>The returned transform starts with {@link Compression#AUTO}, {@link
 * EmptyMatchTreatment#DISALLOW}, and the "matches many files" hint turned off.
 *
 * @return a {@link Read} transform with the defaults described above
 */
public static Read read() {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_TextIO_Read.Builder()
      .setMatchConfiguration(defaultMatching)
      .setCompression(Compression.AUTO)
      .setHintMatchesManyFiles(false)
      .build();
}
/**
 * Variant of {@link #parseGenericRecords(SerializableFunction)} that reads every filepattern
 * contained in the input {@link PCollection}.
 *
 * <p>The returned transform starts with {@link EmptyMatchTreatment#ALLOW_IF_WILDCARD} and a
 * desired bundle size of 64 MiB.
 *
 * @param parseFn function applied to each {@link GenericRecord} to produce the output element
 * @param <T> the type produced by {@code parseFn}
 * @return a {@link ParseAll} transform producing elements of type {@code T}
 */
public static <T> ParseAll<T> parseAllGenericRecords(
    SerializableFunction<GenericRecord, T> parseFn) {
  // 64 MiB, expressed in bytes.
  final long desiredBundleSizeBytes = 64 * 1024 * 1024L;
  MatchConfiguration wildcardTolerantMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_AvroIO_ParseAll.Builder<T>()
      .setMatchConfiguration(wildcardTolerantMatching)
      .setParseFn(parseFn)
      .setDesiredBundleSizeBytes(desiredBundleSizeBytes)
      .build();
}