/**
 * Round-trips generated records through a local Parquet file and compares every row produced by
 * the reader built with {@code SparkParquetReaders.buildReader} against the record that was
 * written.
 *
 * <p>The test is skipped through {@code Assume} when the schema contains a map type whose key
 * type is not the string type, because Parquet Avro cannot write non-string map keys.
 *
 * @param schema schema used to generate the records, write the file, and project the read
 * @throws IOException if creating the temporary file, writing, or reading fails
 */
protected void writeAndValidate(Schema schema) throws IOException {
  Assume.assumeTrue(
      "Parquet Avro cannot write non-string map keys",
      TypeUtil.find(
          schema,
          type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()) == null);

  List<GenericData.Record> expectedRecords = RandomData.generateList(schema, 100, 0L);

  File parquetFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", parquetFile.delete());

  // Write phase: the appender is closed before the file is read back.
  try (FileAppender<GenericData.Record> appender = Parquet.write(Files.localOutput(parquetFile))
      .schema(schema)
      .named("test")
      .build()) {
    appender.addAll(expectedRecords);
  }

  // Read phase: walk the expected records and the rows read back in lock step.
  try (CloseableIterable<InternalRow> rowIterable = Parquet.read(Files.localInput(parquetFile))
      .project(schema)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type))
      .build()) {
    Iterator<InternalRow> actualRows = rowIterable.iterator();
    for (GenericData.Record expectedRecord : expectedRecords) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      assertEqualsUnsafe(schema.asStruct(), expectedRecord, actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
}
}
private CloseableIterable<Record> open(FileScanTask task) { InputFile input = ops.io().newInputFile(task.file().path().toString()); // TODO: join to partition data from the manifest file switch (task.file().format()) { case AVRO: Avro.ReadBuilder avro = Avro.read(input) .project(projection) .createReaderFunc(DataReader::create) .split(task.start(), task.length()); if (reuseContainers) { avro.reuseContainers(); } return avro.build(); case PARQUET: Parquet.ReadBuilder parquet = Parquet.read(input) .project(projection) .createReaderFunc(fileSchema -> buildReader(projection, fileSchema)) .split(task.start(), task.length()); if (reuseContainers) { parquet.reuseContainers(); } return parquet.build(); default: throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", task.file().format().name(), task.file().path())); } }
/**
 * Writes a single Avro record to a temporary Parquet file using {@code writeSchema}, then reads
 * the file back projected to {@code readSchema} and returns the only record it contains.
 *
 * @param desc description used as the base name of the temporary {@code .parquet} file
 * @param writeSchema schema the record is written with
 * @param readSchema schema used to project the read
 * @param record the record to write
 * @return the single record read back from the file
 * @throws IOException if creating the temporary file, writing, or reading fails
 */
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".parquet");
  // Fail loudly if the file handed out by temp.newFile cannot be removed, instead of letting the
  // write below run against a leftover file.
  Assert.assertTrue("Delete should succeed", file.delete());

  try (FileAppender<GenericData.Record> appender = Parquet.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  // The reader is closeable; close it once the single record has been pulled out so the
  // underlying file is not left open.
  try (CloseableIterable<GenericData.Record> records = Parquet.read(Files.localInput(file))
      .project(readSchema)
      .callInit()
      .build()) {
    return Iterables.getOnlyElement(records);
  }
}
}
/**
 * Round-trips generated generic records through a local Parquet file and checks that the records
 * read back match what was written, both in number and in content.
 *
 * @param schema schema used to generate the records, write the file, and project the read
 * @throws IOException if creating the temporary file, writing, or reading fails
 */
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomGenericData.generate(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> appender = Parquet.write(Files.localOutput(testFile))
      .schema(schema)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(expected);
  }

  List<Record> rows;
  try (CloseableIterable<Record> reader = Parquet.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema))
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  // Check the count first: without it extra rows would go unnoticed and missing rows would only
  // show up as an IndexOutOfBoundsException from rows.get(i) below.
  Assert.assertEquals("Should have expected number of rows", expected.size(), rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
}
/**
 * Writes a single generic record to a temporary Parquet file using {@code writeSchema}, then
 * reads the file back projected to {@code readSchema} and returns the only record it contains.
 *
 * @param desc description used as the base name of the temporary {@code .parquet} file
 * @param writeSchema schema the record is written with
 * @param readSchema schema used to project the read and to build the reader
 * @param record the record to write
 * @return the single record read back from the file
 * @throws IOException if creating the temporary file, writing, or reading fails
 */
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException {
  File file = temp.newFile(desc + ".parquet");
  // Fail loudly if the file handed out by temp.newFile cannot be removed, instead of letting the
  // write below run against a leftover file.
  Assert.assertTrue("Delete should succeed", file.delete());

  try (FileAppender<Record> appender = Parquet.write(Files.localOutput(file))
      .schema(writeSchema)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.add(record);
  }

  // The reader is closeable; close it once the single record has been pulled out so the
  // underlying file is not left open.
  try (CloseableIterable<Record> records = Parquet.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(readSchema, fileSchema))
      .build()) {
    return Iterables.getOnlyElement(records);
  }
}
}
/**
 * Starts building a reader for the given input file.
 *
 * @param file the input file to read
 * @return a new {@code ReadBuilder} created for {@code file}
 */
public static ReadBuilder read(InputFile file) {
  return new ReadBuilder(file);
}
/**
 * Builds an iterable of Spark rows over a Parquet file for one scan task.
 *
 * <p>The read is projected to {@code readSchema}, is given the task's start and length through
 * {@code split}, and is configured with the task's residual expression as its filter.
 *
 * @param location the Parquet input file
 * @param task scan task supplying the split range and the residual filter
 * @param readSchema schema used to project the read and to build the row reader
 * @return an iterable over the rows of the file
 */
private CloseableIterable<InternalRow> newParquetIterable(InputFile location, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(fileType -> SparkParquetReaders.buildReader(readSchema, fileType))
      .filter(task.residual());

  return builder.build();
}
}