@Override public List<ManifestFile> manifests() { if (manifests == null) { // if manifests isn't set, then the snapshotFile is set and should be read to get the list try (CloseableIterable<ManifestFile> files = Avro.read(manifestList) .rename("manifest_file", GenericManifestFile.class.getName()) .rename("partitions", GenericPartitionFieldSummary.class.getName()) .rename("r508", GenericPartitionFieldSummary.class.getName()) .project(ManifestFile.schema()) .reuseContainers(false) .build()) { this.manifests = Lists.newLinkedList(files); } catch (IOException e) { throw new RuntimeIOException(e, "Cannot read snapshot file: %s", manifestList.location()); } } return manifests; }
CloseableIterable<ManifestEntry> entries(Collection<String> columns) { if (entries != null) { // if this reader is an in-memory list or if the entries have been cached, return the list. return CloseableIterable.withNoopClose(entries); } FileFormat format = FileFormat.fromFileName(file.location()); Preconditions.checkArgument(format != null, "Unable to determine format of manifest: " + file); Schema schema = ManifestEntry.projectSchema(spec.partitionType(), columns); switch (format) { case AVRO: AvroIterable<ManifestEntry> reader = Avro.read(file) .project(schema) .rename("manifest_entry", ManifestEntry.class.getName()) .rename("partition", PartitionData.class.getName()) .rename("r102", PartitionData.class.getName()) .rename("data_file", GenericDataFile.class.getName()) .rename("r2", GenericDataFile.class.getName()) .reuseContainers() .build(); addCloseable(reader); return reader; default: throw new UnsupportedOperationException("Invalid format for manifest file: " + format); } }
/**
 * Opens a manifest file and initializes this reader's schema and partition spec from the
 * Avro file metadata. Entries are not read here; they are loaded lazily.
 */
private ManifestReader(InputFile file) {
  this.file = file;

  // open the file projecting only the "status" field to read the Avro key/value metadata
  try {
    try (AvroIterable<ManifestEntry> headers = Avro.read(file)
        .project(ManifestEntry.getSchema(Types.StructType.of()).select("status"))
        .build()) {
      this.metadata = headers.getMetadata();
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  this.schema = SchemaParser.fromJson(metadata.get("schema"));

  // older manifests may not carry a spec id; fall back to the initial spec id
  String specProperty = metadata.get("partition-spec-id");
  int specId = specProperty != null ? Integer.parseInt(specProperty) : TableMetadata.INITIAL_SPEC_ID;

  this.spec = PartitionSpecParser.fromJsonFields(schema, specId, metadata.get("partition-spec"));
  this.entries = null;
}
private CloseableIterable<Record> open(FileScanTask task) { InputFile input = ops.io().newInputFile(task.file().path().toString()); // TODO: join to partition data from the manifest file switch (task.file().format()) { case AVRO: Avro.ReadBuilder avro = Avro.read(input) .project(projection) .createReaderFunc(DataReader::create) .split(task.start(), task.length()); if (reuseContainers) { avro.reuseContainers(); } return avro.build(); case PARQUET: Parquet.ReadBuilder parquet = Parquet.read(input) .project(projection) .createReaderFunc(fileSchema -> buildReader(projection, fileSchema)) .split(task.start(), task.length()); if (reuseContainers) { parquet.reuseContainers(); } return parquet.build(); default: throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", task.file().format().name(), task.file().path())); } }
.createReaderFunc(SparkAvroReader::new) .project(schema) .build()) { rows = Lists.newArrayList(reader);
/**
 * Round-trip test: writes randomly generated records to a local Avro file, reads them
 * back with the same schema, and asserts each row matches the generated input.
 */
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomAvroData.generate(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  // write every generated record to the test file
  try (FileAppender<Record> appender = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record record : expected) {
      appender.add(record);
    }
  }

  // read the file back into memory
  List<Record> actual;
  try (AvroIterable<Record> reader = Avro.read(Files.localInput(testFile))
      .project(schema)
      .build()) {
    actual = Lists.newArrayList(reader);
  }

  // compare row by row against the generated data
  for (int row = 0; row < expected.size(); row += 1) {
    AvroTestHelpers.assertEquals(schema.asStruct(), expected.get(row), actual.get(row));
  }
}
}
/**
 * Round-trip test: writes randomly generated records to a local Avro file, reads them
 * back as Spark {@code InternalRow}s, and asserts each row matches the generated input.
 */
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomData.generateList(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  // write every generated record to the test file
  try (FileAppender<Record> appender = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record record : expected) {
      appender.add(record);
    }
  }

  // read the file back using the Spark value reader
  List<InternalRow> actual;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    actual = Lists.newArrayList(reader);
  }

  // compare row by row against the generated data
  for (int row = 0; row < expected.size(); row += 1) {
    assertEqualsUnsafe(schema.asStruct(), expected.get(row), actual.get(row));
  }
}
}
/**
 * Round-trip test: writes randomly generated records with the Iceberg generic writer,
 * reads them back with the generic reader, and asserts each row matches the input.
 */
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomGenericData.generate(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  // write every generated record using the generic data writer
  try (FileAppender<Record> appender = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .createWriterFunc(DataWriter::create)
      .named("test")
      .build()) {
    for (Record record : expected) {
      appender.add(record);
    }
  }

  // read the file back using the generic data reader
  List<Record> actual;
  try (AvroIterable<Record> reader = Avro.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(DataReader::create)
      .build()) {
    actual = Lists.newArrayList(reader);
  }

  // compare row by row against the generated data
  for (int row = 0; row < expected.size(); row += 1) {
    DataTestHelpers.assertEquals(schema.asStruct(), expected.get(row), actual.get(row));
  }
}
}
/**
 * Writes a single record with the write schema and reads it back projected with the
 * read schema, for schema-evolution tests.
 *
 * @param desc short test description used to name the temp file
 * @param writeSchema schema used to write the record
 * @param readSchema schema used to project the record on read
 * @param record the record to round-trip
 * @return the single record read back from the file
 * @throws IOException if the file cannot be prepared, written, or read
 */
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record)
    throws IOException {
  File file = temp.newFile(desc + ".avro");
  // fix: the original ignored delete()'s boolean result; fail loudly if the freshly
  // created temp file cannot be removed before the Avro writer recreates it
  if (file.exists() && !file.delete()) {
    throw new IOException("Failed to delete existing test file: " + file);
  }

  try (FileAppender<Record> appender = Avro.write(Files.localOutput(file))
      .schema(writeSchema)
      .createWriterFunc(DataWriter::create)
      .build()) {
    appender.add(record);
  }

  Iterable<Record> records = Avro.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(DataReader::create)
      .build();
  // NOTE(review): the iterable returned by build() appears closeable elsewhere in this
  // codebase but is never closed here — consider try-with-resources; TODO confirm type.
  return Iterables.getOnlyElement(records);
}
}
/**
 * Writes a single Avro generic record with the write schema and reads it back projected
 * with the read schema, for schema-evolution tests.
 *
 * @param desc short test description used to name the temp file
 * @param writeSchema schema used to write the record
 * @param readSchema schema used to project the record on read
 * @param record the record to round-trip
 * @return the single record read back from the file
 * @throws IOException if the file cannot be prepared, written, or read
 */
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                                          GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".avro");
  // fix: the original ignored delete()'s boolean result; fail loudly if the freshly
  // created temp file cannot be removed before the Avro writer recreates it
  if (file.exists() && !file.delete()) {
    throw new IOException("Failed to delete existing test file: " + file);
  }

  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  Iterable<GenericData.Record> records = Avro.read(Files.localInput(file))
      .project(readSchema)
      .build();
  // NOTE(review): the iterable returned by build() appears closeable elsewhere in this
  // codebase but is never closed here — consider try-with-resources; TODO confirm type.
  return Iterables.getOnlyElement(records);
}
}
/**
 * Builds a closeable iterable of Spark rows over the Avro data file for a scan task,
 * limited to the task's split range and projected to the read schema.
 */
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder builder = Avro.read(location);
  builder.reuseContainers();
  builder.project(readSchema);
  builder.split(task.start(), task.length());
  builder.createReaderFunc(SparkAvroReader::new);
  return builder.build();
}