@Override
public boolean hasNext() {
  while (true) {
    if (currentIterator.hasNext()) {
      return true;

    } else if (tasks.hasNext()) {
      // close the reader for the previous task before opening the next one
      if (currentCloseable != null) {
        try {
          currentCloseable.close();
        } catch (IOException e) {
          throw new RuntimeIOException(e, "Failed to close task");
        }
      }

      FileScanTask task = tasks.next();
      CloseableIterable<Record> reader = open(task);
      this.currentCloseable = reader;

      // apply any residual filter expression that file-level pruning could not eliminate
      if (task.residual() != null && task.residual() != Expressions.alwaysTrue()) {
        Evaluator filter = new Evaluator(projection.asStruct(), task.residual());
        this.currentIterator = filter(reader, filter::eval).iterator();
      } else {
        this.currentIterator = reader.iterator();
      }

    } else {
      return false;
    }
  }
}
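// A minimal sketch of the next() and close() methods that would pair with the
// hasNext() above; the field names match, but these bodies are assumptions
// about the surrounding class, not its actual code.
@Override
public Record next() {
  if (!hasNext()) { // hasNext() also advances to the next non-empty task
    throw new NoSuchElementException();
  }
  return currentIterator.next();
}

@Override
public void close() throws IOException {
  if (currentCloseable != null) {
    currentCloseable.close(); // release the reader for the last open task
  }
}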
private Iterator<InternalRow> open(FileScanTask task, Schema readSchema, Configuration conf) {
  InputFile location = HadoopInputFile.fromLocation(task.file().path(), conf);
  CloseableIterable<InternalRow> iter;
  switch (task.file().format()) {
    case ORC:
      // the ORC reader is both the iterator and the closeable resource
      SparkOrcReader reader = new SparkOrcReader(location, task, readSchema);
      this.currentCloseable = reader;
      return reader;

    case PARQUET:
      iter = newParquetIterable(location, task, readSchema);
      break;

    case AVRO:
      iter = newAvroIterable(location, task, readSchema);
      break;

    default:
      throw new UnsupportedOperationException(
          "Cannot read unknown format: " + task.file().format());
  }

  this.currentCloseable = iter;
  return iter.iterator();
}
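// A plausible sketch of the newParquetIterable helper called above, modeled on
// the Parquet read builder used in the test below; the split(...) call that
// restricts reading to this task's byte range is an assumption about the
// builder's API, as is passing readSchema through to SparkParquetReaders.
private CloseableIterable<InternalRow> newParquetIterable(InputFile location, FileScanTask task,
                                                          Schema readSchema) {
  return Parquet.read(location)
      .project(readSchema)
      .split(task.start(), task.length()) // assumed: read only this task's split
      .createReaderFunc(type -> SparkParquetReaders.buildReader(readSchema, type))
      .build();
}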
protected void writeAndValidate(Schema schema) throws IOException {
  // Parquet's Avro writer supports only string map keys, so skip schemas with other key types
  Assume.assumeTrue("Parquet Avro cannot write non-string map keys",
      null == TypeUtil.find(schema,
          type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));

  List<GenericData.Record> expected = RandomData.generateList(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    writer.addAll(expected);
  }

  try (CloseableIterable<InternalRow> reader = Parquet.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type))
      .build()) {
    Iterator<InternalRow> rows = reader.iterator();
    for (int i = 0; i < expected.size(); i += 1) {
      Assert.assertTrue("Should have expected number of rows", rows.hasNext());
      assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.next());
    }
    Assert.assertFalse("Should not have extra rows", rows.hasNext());
  }
}
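// A hypothetical concrete test built on writeAndValidate; the schema is
// illustrative and the field IDs are arbitrary, not taken from the real suite.
@Test
public void testSimpleStruct() throws IOException {
  writeAndValidate(new Schema(
      Types.NestedField.required(1, "id", Types.LongType.get()),
      Types.NestedField.optional(2, "data", Types.StringType.get())));
}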