MessageType projection = hasIds(fileSchema) ? pruneColumns(fileSchema, expectedSchema) : pruneColumnsFallback(fileSchema, expectedSchema); AvroSchemaUtil.convert(expectedSchema, projection.getName())); org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection( AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()), expectedSchema, ImmutableMap.of()); AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));
boolean hasIds = hasIds(fileSchema); MessageType typeWithIds = hasIds ? fileSchema : addFallbackIds(fileSchema); pruneColumns(fileSchema, expectedSchema) : pruneColumnsFallback(fileSchema, expectedSchema); this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds); this.rowGroups = reader.getRowGroups();
/**
 * Builds a {@code ParquetValueReader} of {@code Tuple} by visiting the expected schema's struct
 * together with the Parquet file schema.
 *
 * <p>A {@code ReadBuilder} is used when {@code hasIds(fileSchema)} is true; otherwise a
 * {@code FallbackReadBuilder} is used.
 *
 * @param fileSchema the Parquet message type that is visited and handed to the builder
 * @param expectedSchema the schema whose struct drives the visit
 * @param partitionValues values handed to the builder, keyed by integer
 * @return the reader returned by the visitor, cast to a reader of {@code Tuple}
 */
@SuppressWarnings("unchecked")
public static ParquetValueReader<Tuple> buildReader(
    MessageType fileSchema, Schema expectedSchema, Map<Integer, Object> partitionValues) {
  // Handle the no-ids case first; everything else goes through the regular builder.
  if (!hasIds(fileSchema)) {
    return (ParquetValueReader<Tuple>) TypeWithSchemaVisitor.visit(
        expectedSchema.asStruct(),
        fileSchema,
        new FallbackReadBuilder(fileSchema, partitionValues));
  }

  return (ParquetValueReader<Tuple>) TypeWithSchemaVisitor.visit(
      expectedSchema.asStruct(),
      fileSchema,
      new ReadBuilder(fileSchema, partitionValues));
}
/**
 * Creates a reader for the given read schema.
 *
 * <p>The message type passed to {@code buildReader} is obtained by converting
 * {@code readSchema}; only the name of {@code fileSchema} is used here.
 *
 * @param readSchema the schema to read; also passed to {@code buildReader} as the expected schema
 * @param fileSchema the file's message type, used here only for its name
 * @param partitionValues values forwarded unchanged to {@code buildReader}
 */
public PigParquetReader(
    Schema readSchema, MessageType fileSchema, Map<Integer, Object> partitionValues) {
  // Convert the read schema, reusing the file schema's message name.
  MessageType convertedReadSchema = convert(readSchema, fileSchema.getName());
  this.reader = buildReader(convertedReadSchema, readSchema, partitionValues);
}
/**
 * Builds a {@code ParquetValueReader} of {@code InternalRow} by visiting the expected schema's
 * struct together with the Parquet file schema.
 *
 * <p>A {@code ReadBuilder} is used when {@code hasIds(fileSchema)} is true; otherwise a
 * {@code FallbackReadBuilder} is used.
 *
 * @param expectedSchema the schema whose struct drives the visit
 * @param fileSchema the Parquet message type that is visited and handed to the builder
 * @return the reader returned by the visitor, cast to a reader of {@code InternalRow}
 */
@SuppressWarnings("unchecked")
public static ParquetValueReader<InternalRow> buildReader(
    Schema expectedSchema, MessageType fileSchema) {
  final ParquetValueReader<InternalRow> rowReader;

  if (hasIds(fileSchema)) {
    rowReader = (ParquetValueReader<InternalRow>) TypeWithSchemaVisitor.visit(
        expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema));
  } else {
    rowReader = (ParquetValueReader<InternalRow>) TypeWithSchemaVisitor.visit(
        expectedSchema.asStruct(), fileSchema, new FallbackReadBuilder(fileSchema));
  }

  return rowReader;
}
@Override public RecordMaterializer<T> prepareForRead(Configuration configuration, Map<String, String> fileMetadata, MessageType fileMessageType, ReadContext readContext) { // This is the type created in init that was based on the file's schema. The schema that this // will pass to the wrapped ReadSupport needs to match the expected schema's names. Rather than // renaming the file's schema, convert the expected schema to Parquet. This relies on writing // files with the correct schema. // TODO: this breaks when columns are reordered. MessageType readSchema = ParquetSchemaUtil.convert(expectedSchema, fileMessageType.getName()); return wrapped.prepareForRead(configuration, fileMetadata, readSchema, readContext); }
/**
 * Builds a {@code ParquetValueReader} of {@code GenericRecord} by visiting the expected schema's
 * struct together with the Parquet file schema.
 *
 * <p>A {@code ReadBuilder} is used when {@code hasIds(fileSchema)} is true; otherwise a
 * {@code FallbackReadBuilder} is used.
 *
 * @param expectedSchema the schema whose struct drives the visit
 * @param fileSchema the Parquet message type that is visited and handed to the builder
 * @return the reader returned by the visitor, cast to a reader of {@code GenericRecord}
 */
@SuppressWarnings("unchecked")
public static ParquetValueReader<GenericRecord> buildReader(
    Schema expectedSchema, MessageType fileSchema) {
  // Handle the no-ids case first; everything else goes through the regular builder.
  if (!hasIds(fileSchema)) {
    return (ParquetValueReader<GenericRecord>) TypeWithSchemaVisitor.visit(
        expectedSchema.asStruct(), fileSchema, new FallbackReadBuilder(fileSchema));
  }

  return (ParquetValueReader<GenericRecord>) TypeWithSchemaVisitor.visit(
      expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema));
}
@SuppressWarnings("unchecked") ParquetWriter(Configuration conf, OutputFile output, Schema schema, long rowGroupSize, Map<String, String> metadata, Function<MessageType, ParquetValueWriter<?>> createWriterFunc, CompressionCodecName codec) { this.output = output; this.targetRowGroupSize = rowGroupSize; this.metadata = ImmutableMap.copyOf(metadata); this.compressor = new CodecFactory(conf, props.getPageSizeThreshold()).getCompressor(codec); this.parquetSchema = convert(schema, "table"); this.model = (ParquetValueWriter<T>) createWriterFunc.apply(parquetSchema); try { this.writer = new ParquetFileWriter(ParquetIO.file(output, conf), parquetSchema, ParquetFileWriter.Mode.OVERWRITE, rowGroupSize, 0); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to create Parquet file"); } try { writer.start(); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to start Parquet file writer"); } startRowGroup(); }
MessageType type = ParquetSchemaUtil.convert(schema, name);
Schema fileSchema = ParquetSchemaUtil.convert(parquetType);
throw new RuntimeIOException(e); Schema fileSchema = ParquetSchemaUtil.convert(type); builder.useStatsFilter() .useDictionaryFilter()