/**
 * Produces the next {@link Page} from the temporary-file reader, or signals
 * end of data when the reader is exhausted.
 *
 * @return the next page, or {@code endOfData()} when no rows remain
 */
@Override
protected Page computeNext()
{
    try {
        // Bail out promptly if this task has been cancelled/interrupted.
        if (Thread.currentThread().isInterrupted()) {
            throw new InterruptedIOException();
        }
        int batchSize = reader.nextBatch();
        if (batchSize <= 0) {
            return endOfData();
        }
        Block[] blocks = new Block[types.size()];
        for (int i = 0; i < types.size(); i++) {
            // force-load each lazy block so the page is fully materialized
            blocks[i] = reader.readBlock(types.get(i), i).getLoadedBlock();
        }
        return new Page(batchSize, blocks);
    }
    catch (IOException e) {
        // BUGFIX: propagate the underlying IOException as the cause so the
        // original stack trace is not lost when this surfaces to the user
        throw new PrestoException(HIVE_WRITER_DATA_ERROR, "Failed to read temporary data", e);
    }
}
}
@Override public void close() { // some hive input formats are broken and bad things can happen if you close them multiple times if (closed) { return; } closed = true; try { stats.addMaxCombinedBytesPerRow(recordReader.getMaxCombinedBytesPerRow()); recordReader.close(); } catch (IOException e) { throw new UncheckedIOException(e); } }
this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); this.rowsToDelete = new BitSet(toIntExact(recordReader.getFileRowCount()));
/**
 * Computes the total memory retained by this OrcRecordReader: the fixed
 * instance overhead plus the stream readers and the currently loaded stripe.
 *
 * @return the total retained size in bytes
 */
@VisibleForTesting
long getRetainedSizeInBytes()
{
    long retainedBytes = INSTANCE_SIZE;
    retainedBytes += getStreamReaderRetainedSizeInBytes();
    retainedBytes += getCurrentStripeRetainedSizeInBytes();
    return retainedBytes;
}
private static void assertInitialRetainedSizes(OrcRecordReader reader, int rows) { assertEquals(reader.getReaderRowCount(), rows); assertEquals(reader.getReaderPosition(), 0); assertEquals(reader.getCurrentStripeRetainedSizeInBytes(), 0); // there will be object overheads assertGreaterThan(reader.getStreamReaderRetainedSizeInBytes(), 0L); // there will be object overheads assertGreaterThan(reader.getRetainedSizeInBytes(), 0L); assertEquals(reader.getSystemMemoryUsage(), 0); }
/**
 * Reads a multi-stripe file end to end and checks that reader/file row
 * counts and positions stay in lockstep across every batch.
 */
@Test
public void testEntireFile()
        throws Exception
{
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, ORC, OrcPredicate.TRUE, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getReaderRowCount(), 100);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFileRowCount(), reader.getReaderRowCount());
            assertEquals(reader.getFilePosition(), reader.getReaderPosition());
            // the file holds 5 stripes of 20 rows each
            for (int stripe = 0; stripe < 5; stripe++) {
                assertEquals(reader.nextBatch(), 20);
                assertEquals(reader.getReaderPosition(), stripe * 20L);
                assertEquals(reader.getFilePosition(), reader.getReaderPosition());
                assertCurrentBatch(reader, stripe);
            }
            // exhausted: nextBatch() reports -1 and the position is at EOF
            assertEquals(reader.nextBatch(), -1);
            assertEquals(reader.getReaderPosition(), 100);
            assertEquals(reader.getFilePosition(), reader.getReaderPosition());
        }
    }
}
assertInitialRetainedSizes(reader, rows); long stripeReaderRetainedSize = reader.getCurrentStripeRetainedSizeInBytes(); long streamReaderRetainedSize = reader.getStreamReaderRetainedSizeInBytes(); long readerRetainedSize = reader.getRetainedSizeInBytes(); long readerSystemMemoryUsage = reader.getSystemMemoryUsage(); int batchSize = reader.nextBatch(); if (batchSize == -1) { break; Block block = reader.readBlock(VARCHAR, 0); assertEquals(block.getPositionCount(), batchSize); assertGreaterThan(reader.getCurrentStripeRetainedSizeInBytes(), stripeReaderRetainedSize); assertEquals(reader.getStreamReaderRetainedSizeInBytes() - streamReaderRetainedSize, 0L); assertGreaterThan(reader.getRetainedSizeInBytes() - readerRetainedSize, 0L); assertGreaterThan(reader.getSystemMemoryUsage() - readerSystemMemoryUsage, 0L); reader.close();
assertEquals(reader.getReaderRowCount(), 2); assertEquals(reader.getFileRowCount(), 2); assertEquals(reader.getSplitLength(), file.length()); assertEquals(reader.nextBatch(), 2); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 2); for (int i = 0; i < 2; i++) { Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertFalse(reader.getUserMetadata().containsKey(OrcFileMetadata.KEY)); assertEquals(reader.getReaderRowCount(), 1); assertEquals(reader.getFileRowCount(), 1); assertEquals(reader.getSplitLength(), newFile.length()); assertEquals(reader.nextBatch(), 1); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 1); assertEquals(column0.isNull(0), false); assertEquals(BIGINT.getLong(column0, 0), 123L); Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 1);
assertEquals(recordReader.getReaderPosition(), 0); assertEquals(recordReader.getFilePosition(), 0); for (int batchSize = toIntExact(recordReader.nextBatch()); batchSize >= 0; batchSize = toIntExact(recordReader.nextBatch())) { if (skipStripe && rowsProcessed < 10000) { assertEquals(advance(iterator, batchSize), batchSize); Block block = recordReader.readBlock(type, 0); assertEquals(recordReader.getReaderPosition(), rowsProcessed); assertEquals(recordReader.getFilePosition(), rowsProcessed); rowsProcessed += batchSize; assertEquals(recordReader.getReaderPosition(), rowsProcessed); assertEquals(recordReader.getFilePosition(), rowsProcessed);
/**
 * Reads the first batch of column 0 from the given temp file as a RowBlock.
 *
 * @param tempFile   the ORC file to read
 * @param readerType the type to read column 0 as
 * @return the first batch of column 0
 * @throws IOException if the file cannot be read
 */
private RowBlock read(TempFile tempFile, Type readerType)
        throws IOException
{
    DataSize dataSize = new DataSize(1, MEGABYTE);
    OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), dataSize, dataSize, dataSize, true);
    OrcReader orcReader = new OrcReader(orcDataSource, ORC, dataSize, dataSize, dataSize, dataSize);
    Map<Integer, Type> includedColumns = new HashMap<>();
    includedColumns.put(0, readerType);
    // BUGFIX: try-with-resources guarantees the record reader (and its data
    // source) is closed even when nextBatch()/readBlock() throws; the
    // original leaked the reader on any exception
    try (OrcRecordReader recordReader = orcReader.createRecordReader(includedColumns, OrcPredicate.TRUE, UTC, newSimpleAggregatedMemoryContext(), OrcReader.INITIAL_BATCH_SIZE)) {
        recordReader.nextBatch();
        return (RowBlock) recordReader.readBlock(readerType, 0);
    }
}
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize, DataSize tinyStripeThreshold) throws IOException { OrcReader orcReader = new OrcReader(orcDataSource, ORC, maxMergeDistance, maxReadSize, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE)); // 1 for reading file footer assertEquals(orcDataSource.getReadCount(), 1); List<StripeInformation> stripes = orcReader.getFooter().getStripes(); // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode. assertGreaterThanOrEqual(stripes.size(), 3); //verify wrapped by CachingOrcReader assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class); OrcRecordReader orcRecordReader = orcReader.createRecordReader( ImmutableMap.of(0, VARCHAR), (numberOfRows, statisticsByColumnIndex) -> true, HIVE_STORAGE_TIME_ZONE, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE); int positionCount = 0; while (true) { int batchSize = orcRecordReader.nextBatch(); if (batchSize <= 0) { break; } Block block = orcRecordReader.readBlock(VARCHAR, 0); positionCount += block.getPositionCount(); } assertEquals(positionCount, POSITION_COUNT); }
/**
 * Loads the column block for this lazy block exactly once; subsequent calls
 * are no-ops. Must be invoked while the reader is still on the batch this
 * loader was created for.
 */
@Override
public final void load(LazyBlock lazyBlock)
{
    if (loaded) {
        return;
    }
    // a stale loader must never read from a later batch
    checkState(batchId == expectedBatchId);
    try {
        lazyBlock.setBlock(recordReader.readBlock(type, columnIndex));
    }
    catch (IOException e) {
        throw new PrestoException(RAPTOR_ERROR, e);
    }
    loaded = true;
}
}
int batchSize = recordReader.nextBatch(); if (batchSize <= 0) { close(); return null; long filePosition = recordReader.getFilePosition();
static void validateFile( OrcWriteValidation writeValidation, OrcDataSource input, List<Type> types, DateTimeZone hiveStorageTimeZone, OrcEncoding orcEncoding) throws OrcCorruptionException { ImmutableMap.Builder<Integer, Type> readTypes = ImmutableMap.builder(); for (int columnIndex = 0; columnIndex < types.size(); columnIndex++) { readTypes.put(columnIndex, types.get(columnIndex)); } try { OrcReader orcReader = new OrcReader(input, orcEncoding, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(16, MEGABYTE), Optional.of(writeValidation)); try (OrcRecordReader orcRecordReader = orcReader.createRecordReader(readTypes.build(), OrcPredicate.TRUE, hiveStorageTimeZone, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE)) { while (orcRecordReader.nextBatch() >= 0) { // ignored } } } catch (IOException e) { throw new OrcCorruptionException(e, input.getId(), "Validation failed"); } } }
private static void assertClosedRetainedSizes(OrcRecordReader reader) { assertEquals(reader.getCurrentStripeRetainedSizeInBytes(), 0); // after close() we still account for the StreamReader instance sizes. assertGreaterThan(reader.getStreamReaderRetainedSizeInBytes(), 0L); // after close() we still account for the StreamReader instance sizes. assertGreaterThan(reader.getRetainedSizeInBytes(), 0L); assertEquals(reader.getSystemMemoryUsage(), 0); } }
ImmutableList.Builder<StripeInformation> stripes = ImmutableList.builder(); ImmutableList.Builder<Long> stripeFilePositions = ImmutableList.builder(); if (predicate.matches(numberOfRows, getStatisticsByColumnOrdinal(root, fileStats))) { if (splitContainsStripe(splitOffset, splitLength, stripe) && isStripeIncluded(root, stripe, info.getStats(), predicate)) { stripes.add(stripe); stripeFilePositions.add(fileRowCount); this.stripeFilePositions = stripeFilePositions.build(); orcDataSource = wrapWithCacheIfTinyStripes(orcDataSource, this.stripes, maxMergeDistance, tinyStripeThreshold); this.orcDataSource = orcDataSource; this.splitLength = splitLength; writeValidation); streamReaders = createStreamReaders(orcDataSource, types, hiveStorageTimeZone, presentColumnsAndTypes.build(), streamReadersSystemMemoryContext); maxBytesPerCell = new long[streamReaders.length]; nextBatchSize = initialBatchSize;
/**
 * Marks this source closed and closes the underlying record reader,
 * translating any I/O failure into a PrestoException.
 */
@Override
public void close()
{
    closed = true;
    try {
        recordReader.close();
    }
    catch (IOException ioException) {
        throw new PrestoException(RAPTOR_ERROR, ioException);
    }
}
DataSize tinyStripeThreshold = new DataSize(8, Unit.MEGABYTE); OrcDataSource actual = wrapWithCacheIfTinyStripes( FakeOrcDataSource.INSTANCE, ImmutableList.of(), assertInstanceOf(actual, CachingOrcDataSource.class); actual = wrapWithCacheIfTinyStripes( FakeOrcDataSource.INSTANCE, ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10)), assertInstanceOf(actual, CachingOrcDataSource.class); actual = wrapWithCacheIfTinyStripes( FakeOrcDataSource.INSTANCE, ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 10, 10, 10)), assertInstanceOf(actual, CachingOrcDataSource.class); actual = wrapWithCacheIfTinyStripes( FakeOrcDataSource.INSTANCE, ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10)), actual = wrapWithCacheIfTinyStripes( FakeOrcDataSource.INSTANCE, ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20 + 1, 10, 10)),
/**
 * Walks a multi-stripe file batch by batch, asserting that the reader and
 * file positions track each other from start through end of data.
 */
@Test
public void testEntireFile()
        throws Exception
{
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, ORC, OrcPredicate.TRUE, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getReaderRowCount(), 100);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFileRowCount(), reader.getReaderRowCount());
            assertEquals(reader.getFilePosition(), reader.getReaderPosition());
            // expect 5 batches of 20 rows (one per stripe)
            for (int batch = 0; batch < 5; batch++) {
                assertEquals(reader.nextBatch(), 20);
                assertEquals(reader.getReaderPosition(), batch * 20L);
                assertEquals(reader.getFilePosition(), reader.getReaderPosition());
                assertCurrentBatch(reader, batch);
            }
            // end of data: -1 and position fixed at the total row count
            assertEquals(reader.nextBatch(), -1);
            assertEquals(reader.getReaderPosition(), 100);
            assertEquals(reader.getFilePosition(), reader.getReaderPosition());
        }
    }
}
assertInitialRetainedSizes(reader, rows); long stripeReaderRetainedSize = reader.getCurrentStripeRetainedSizeInBytes(); long streamReaderRetainedSize = reader.getStreamReaderRetainedSizeInBytes(); long readerRetainedSize = reader.getRetainedSizeInBytes(); long readerSystemMemoryUsage = reader.getSystemMemoryUsage(); int batchSize = reader.nextBatch(); if (batchSize == -1) { break; Block block = reader.readBlock(BIGINT, 0); assertEquals(block.getPositionCount(), batchSize); assertGreaterThan(reader.getCurrentStripeRetainedSizeInBytes(), stripeReaderRetainedSize); assertEquals(reader.getStreamReaderRetainedSizeInBytes() - streamReaderRetainedSize, 0L); assertGreaterThan(reader.getRetainedSizeInBytes() - readerRetainedSize, 0L); assertGreaterThan(reader.getSystemMemoryUsage() - readerSystemMemoryUsage, 0L); reader.close();