public AvroParquetOutputFormat() {
  super(new AvroWriteSupport<T>());
}
/**
 * Set the Avro schema to use for writing. The schema is translated into a Parquet
 * schema so that the records can be written in Parquet format. It is also
 * stored in the Parquet metadata so that records can be reconstructed as Avro
 * objects at read time without specifying a read schema.
 *
 * @param job a job
 * @param schema a schema for the data that will be written
 * @see org.apache.parquet.avro.AvroParquetInputFormat#setAvroReadSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
 */
public static void setSchema(Job job, Schema schema) {
  AvroWriteSupport.setSchema(ContextUtil.getConfiguration(job), schema);
}
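// Usage sketch, not part of the source above: wiring setSchema into MapReduce
// job setup. The "User" record schema (built with org.apache.avro.SchemaBuilder)
// and the output path are hypothetical placeholders for illustration.
public static Job newParquetWriteJob(Configuration conf) throws IOException {
  Schema schema = SchemaBuilder.record("User").fields()
      .requiredString("name")
      .requiredInt("age")
      .endRecord();
  Job job = Job.getInstance(conf);
  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setSchema(job, schema);              // translated and stored in file metadata
  FileOutputFormat.setOutputPath(job, new Path("/tmp/users")); // hypothetical path
  return job;
}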
@Override
public WriteContext init(Configuration conf) {
  String outputName = conf.get("crunch.namedoutput");
  if (outputName != null && !outputName.isEmpty()) {
    String schema = conf.get(PARQUET_AVRO_SCHEMA_PARAMETER + "." + outputName);
    setSchema(conf, new Schema.Parser().parse(schema));
  }
  return super.init(conf);
}
}
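// Sketch of the configuration this init method expects (key names taken from
// the code above; the named output "users" and its schema are placeholders):
//
//   conf.set("crunch.namedoutput", "users");
//   conf.set(PARQUET_AVRO_SCHEMA_PARAMETER + ".users", userSchema.toString());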
  recordConsumer.addBinary(fromAvroString(value));
  break;
case RECORD:
  writeRecord(type.asGroupType(), avroSchema, value);
  break;
case ENUM:
  // assumption: enum symbols are written as their UTF-8 string form, matching
  // upstream AvroWriteSupport (the extracted snippet had an empty case here,
  // which would silently drop enum values)
  recordConsumer.addBinary(Binary.fromString(value.toString()));
  break;
case MAP:
  writeMap(type.asGroupType(), avroSchema, (Map<CharSequence, ?>) value);
  break;
case UNION:
  writeUnion(type.asGroupType(), avroSchema, value);
  break;
/**
 * Calls an appropriate write method based on the value.
 * Value MUST not be null.
 *
 * @param type the Parquet type
 * @param avroSchema the Avro schema
 * @param value a non-null value to write
 */
private void writeValue(Type type, Schema avroSchema, Object value) {
  Schema nonNullAvroSchema = AvroSchemaConverter.getNonNull(avroSchema);
  LogicalType logicalType = nonNullAvroSchema.getLogicalType();
  if (logicalType != null) {
    Conversion<?> conversion = model.getConversionByClass(value.getClass(), logicalType);
    writeValueWithoutConversion(type, nonNullAvroSchema,
        convert(nonNullAvroSchema, logicalType, conversion, value));
  } else {
    writeValueWithoutConversion(type, nonNullAvroSchema, value);
  }
}
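// Sketch, assuming Avro 1.9+ and not part of the source above: how a conversion
// gets into `model` so that getConversionByClass can resolve it. With
// DateConversion registered, java.time.LocalDate values are converted back to
// the underlying int representation of the `date` logical type before writing.
GenericData model = new GenericData();
model.addLogicalTypeConversion(new TimeConversions.DateConversion());
// writeValue can now resolve the conversion for LocalDate values:
Conversion<?> conversion = model.getConversionByClass(LocalDate.class, LogicalTypes.date());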
@Override
public void write(T record) {
  if (rootLogicalType != null) {
    Conversion<?> conversion = model.getConversionByClass(record.getClass(), rootLogicalType);
    recordConsumer.startMessage();
    writeRecordFields(rootSchema, rootAvroSchema,
        convert(rootAvroSchema, rootLogicalType, conversion, record));
    recordConsumer.endMessage();
  } else {
    recordConsumer.startMessage();
    writeRecordFields(rootSchema, rootAvroSchema, record);
    recordConsumer.endMessage();
  }
}
@SuppressWarnings("unchecked")
public void write(IndexedRecord record) {
  write((T) record);
}
/**
 * Sets the {@link AvroDataSupplier} class that will be used. The data
 * supplier provides instances of {@link org.apache.avro.generic.GenericData}
 * that are used to deconstruct records.
 *
 * @param job a {@link Job} to configure
 * @param supplierClass a supplier class
 */
public static void setAvroDataSupplier(
    Job job, Class<? extends AvroDataSupplier> supplierClass) {
  AvroWriteSupport.setAvroDataSupplier(ContextUtil.getConfiguration(job), supplierClass);
}
}
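// Sketch of a custom supplier (hypothetical class name; parquet-avro also ships
// ReflectDataSupplier and SpecificDataSupplier). It hands AvroWriteSupport the
// GenericData implementation used to deconstruct records:
public class AllowNullReflectSupplier implements AvroDataSupplier {
  @Override
  public GenericData get() {
    // reflection-based model whose derived schemas make all fields nullable
    return ReflectData.AllowNull.get();
  }
}

// Registered on the job like so:
//   AvroParquetOutputFormat.setAvroDataSupplier(job, AllowNullReflectSupplier.class);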
@Override
public WriteContext init(Configuration configuration) {
  if (rootAvroSchema == null) {
    this.rootAvroSchema = new Schema.Parser().parse(configuration.get(AVRO_SCHEMA));
    this.rootSchema = new AvroSchemaConverter().convert(rootAvroSchema);
  }
  if (model == null) {
    this.model = getDataModel(configuration);
  }
  boolean writeOldListStructure = configuration.getBoolean(
      WRITE_OLD_LIST_STRUCTURE, WRITE_OLD_LIST_STRUCTURE_DEFAULT);
  if (writeOldListStructure) {
    this.listWriter = new TwoLevelListWriter();
  } else {
    this.listWriter = new ThreeLevelListWriter();
  }
  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put(AvroReadSupport.AVRO_SCHEMA_METADATA_KEY, rootAvroSchema.toString());
  return new WriteContext(rootSchema, extraMetaData);
}
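// Configuration sketch for the init method above (assumes the
// WRITE_OLD_LIST_STRUCTURE constant is publicly accessible; `schema` is a
// placeholder Avro schema in scope):
Configuration conf = new Configuration();
AvroWriteSupport.setSchema(conf, schema);                          // populates the AVRO_SCHEMA key read by init
conf.setBoolean(AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE, false); // select ThreeLevelListWriter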
@SuppressWarnings("unchecked")
private <T> WriteSupport<T> getWriteSupport(MessageType type) {
  if (writeSupport != null) {
    return (WriteSupport<T>) writeSupport;
  } else {
    return new AvroWriteSupport<>(
        type,
        ParquetAvro.parquetAvroSchema(AvroSchemaUtil.convert(schema, name)),
        ParquetAvro.DEFAULT_MODEL);
  }
}
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
  super.configure(job, sample);
  IndexedRecord record = (IndexedRecord) sample.getValue();
  AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
  ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
private static <T> WriteSupport<T> writeSupport(Schema avroSchema, GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter().convert(avroSchema), avroSchema, model);
}
private static <T> WriteSupport<T> writeSupport(Configuration conf, Schema avroSchema, GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter(conf).convert(avroSchema), avroSchema, model);
}
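// Hypothetical call site for the Configuration-aware helper above; passing the
// Configuration lets AvroSchemaConverter honor converter-related settings such
// as the list-structure flag. The "Point" schema and names are illustrative.
Schema avroSchema = SchemaBuilder.record("Point").fields()
    .requiredDouble("x")
    .requiredDouble("y")
    .endRecord();
WriteSupport<GenericRecord> support =
    writeSupport(new Configuration(), avroSchema, GenericData.get());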