meta("iceberg.schema", SchemaParser.toJson(schema)); forwardConfig("parquet.block.size", PARQUET_ROW_GROUP_SIZE_BYTES, PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT); forwardConfig("parquet.page.size", PARQUET_PAGE_SIZE_BYTES, PARQUET_PAGE_SIZE_BYTES_DEFAULT); forwardConfig("parquet.dictionary.page.size", PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT); set("parquet.avro.write-old-list-structure", "false"); MessageType type = ParquetSchemaUtil.convert(schema, name); PARQUET_ROW_GROUP_SIZE_BYTES, PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT)); return new com.netflix.iceberg.parquet.ParquetWriter<>( conf, file, schema, rowGroupSize, metadata, createWriterFunc, codec()); } else { return new ParquetWriteAdapter<>(new ParquetWriteBuilder<D>(ParquetIO.file(file)) .setConfig(config) .setKeyValueMetadata(metadata) .setWriteSupport(getWriteSupport(type)) .withCompressionCodec(codec())
String jsonSchema = convert(schema).json(); return Parquet.write(file) .writeSupport(new ParquetWriteSupport()) .set("org.apache.spark.sql.parquet.row.attributes", jsonSchema) .set("spark.sql.parquet.writeLegacyFormat", "false") .set("spark.sql.parquet.binaryAsString", "false") .set("spark.sql.parquet.int96AsTimestamp", "false") .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") .setAll(properties) .schema(schema) .build();
@BeforeClass public static void createInputFile() throws IOException { if (PARQUET_FILE.exists()) { Assert.assertTrue(PARQUET_FILE.delete()); } OutputFile outFile = Files.localOutput(PARQUET_FILE); try (FileAppender<Record> appender = Parquet.write(outFile) .schema(FILE_SCHEMA) .build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 50 records for (int i = 0; i < 50; i += 1) { builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0 builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats builder.set("_required", "req"); // required, always non-null builder.set("_all_nulls", null); // never non-null builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values builder.set("_no_nulls", ""); // optional, but always non-null appender.add(builder.build()); } } InputFile inFile = Files.localInput(PARQUET_FILE); try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile))) { Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size()); ROW_GROUP_METADATA = reader.getRowGroups().get(0); PARQUET_SCHEMA = reader.getFileMetaData().getSchema(); } PARQUET_FILE.deleteOnExit(); }
private InputFile writeFile(String location, String filename, List<Record> records) throws IOException { Path path = new Path(location, filename); FileFormat format = FileFormat.fromFileName(filename); Preconditions.checkNotNull(format, "Cannot determine format for file: %s", filename); switch (format) { case AVRO: try (FileAppender<Record> appender = Avro.write(fromPath(path, CONF)) .schema(SCHEMA) .createWriterFunc(DataWriter::create) .named(format.name()) .build()) { appender.addAll(records); } return HadoopInputFile.fromPath(path, CONF); case PARQUET: try (FileAppender<Record> appender = Parquet.write(fromPath(path, CONF)) .schema(SCHEMA) .createWriterFunc(GenericParquetWriter::buildWriter) .build()) { appender.addAll(records); } return HadoopInputFile.fromPath(path, CONF); default: throw new UnsupportedOperationException("Cannot write format: " + format); } }
protected void writeAndValidate(Schema schema) throws IOException { Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List<GenericData.Record> expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") .build()) { writer.addAll(expected); } try (CloseableIterable<InternalRow> reader = Parquet.read(Files.localInput(testFile)) .project(schema) .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) .build()) { Iterator<InternalRow> rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.next()); } Assert.assertFalse("Should not have extra rows", rows.hasNext()); } } }
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, GenericData.Record record) throws IOException { File file = temp.newFile(desc + ".parquet"); file.delete(); try (FileAppender<GenericData.Record> appender = Parquet.write(Files.localOutput(file)) .schema(writeSchema) .build()) { appender.add(record); } Iterable<GenericData.Record> records = Parquet.read(Files.localInput(file)) .project(readSchema) .callInit() .build(); return Iterables.getOnlyElement(records); } }
protected void writeAndValidate(Schema schema) throws IOException { List<Record> expected = RandomGenericData.generate(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); try (FileAppender<Record> appender = Parquet.write(Files.localOutput(testFile)) .schema(schema) .createWriterFunc(GenericParquetWriter::buildWriter) .build()) { appender.addAll(expected); } List<Record> rows; try (CloseableIterable<Record> reader = Parquet.read(Files.localInput(testFile)) .project(schema) .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) .build()) { rows = Lists.newArrayList(reader); } for (int i = 0; i < expected.size(); i += 1) { DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i)); } } }
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException { File file = temp.newFile(desc + ".parquet"); file.delete(); try (FileAppender<Record> appender = Parquet.write(Files.localOutput(file)) .schema(writeSchema) .createWriterFunc(GenericParquetWriter::buildWriter) .build()) { appender.add(record); } Iterable<Record> records = Parquet.read(Files.localInput(file)) .project(readSchema) .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(readSchema, fileSchema)) .build(); return Iterables.getOnlyElement(records); } }
public static WriteBuilder write(OutputFile file) { return new WriteBuilder(file); }
private File writeTestData(Schema schema, int n, int seed) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); try (FileAppender<Record> writer = Parquet.write(Files.localOutput(testFile)) .schema(schema) .build()) { writer.addAll(RandomData.generate(schema, n, seed)); } return testFile; } }
void forwardConfig(String parquetProperty, String icebergProperty, String defaultValue) { String value = config.getOrDefault(icebergProperty, defaultValue); if (value != null) { set(parquetProperty, value); } }