@Override
public void close()
        throws IOException
{
    // Delegate shutdown to the underlying ORC writer, which flushes remaining data
    // and closes its sink.
    orcWriter.close();
}
public void writePage(Page page)
{
    try {
        orcWriter.write(page);
    }
    catch (IOException e) {
        // This method's signature does not declare IOException, so rethrow unchecked.
        throw new UncheckedIOException(e);
    }
}
@Override
public long getSystemMemoryUsage()
{
    // Shallow size of this wrapper object plus everything retained by the ORC writer.
    return INSTANCE_SIZE + orcWriter.getRetainedBytes();
}
/**
 * Writes a single column named "test" to {@code outputFile} with the given format and
 * compression, then re-reads the file and validates it against the write-time checksums
 * (validation mode BOTH).
 *
 * Fix: the original built an unused {@code metadata} ImmutableMap.Builder (dead code —
 * {@code ImmutableMap.of()} was passed to the writer instead), and split the writer
 * declaration from its initialization; both removed/merged here.
 */
private static void writeOrcColumnPresto(File outputFile, Format format, CompressionKind compression, Type type, Iterator<?> values, OrcWriterStats stats)
        throws Exception
{
    OrcWriter writer = new OrcWriter(
            new OutputStreamOrcDataSink(new FileOutputStream(outputFile)),
            ImmutableList.of("test"),
            ImmutableList.of(type),
            format.getOrcEncoding(),
            compression,
            new OrcWriterOptions(),
            ImmutableMap.of(),
            HIVE_STORAGE_TIME_ZONE,
            true,
            BOTH,
            stats);

    // Materialize all values into a single page; 1024 is only an initial capacity hint.
    BlockBuilder blockBuilder = type.createBlockBuilder(null, 1024);
    while (values.hasNext()) {
        writeValue(type, blockBuilder, values.next());
    }
    writer.write(new Page(blockBuilder.build()));
    writer.close();

    // Re-read the closed file and verify it matches what was recorded during the write.
    writer.validate(new FileOrcDataSource(outputFile, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true));
}
// NOTE(review): truncated fragment — the OrcWriter constructor call continues beyond
// this view; the remaining arguments are not visible here.
orcWriter = new OrcWriter( orcDataSink, columnNames,
@Override public void commit() { try { orcWriter.close(); } catch (IOException | UncheckedIOException e) { try { rollbackAction.call(); } catch (Exception ignored) { // ignore } throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); } if (validationInputFactory.isPresent()) { try { try (OrcDataSource input = validationInputFactory.get().get()) { long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime(); orcWriter.validate(input); validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime; } } catch (IOException | UncheckedIOException e) { throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e); } } }
// NOTE(review): mid-method fragment — after buffering a chunk, fold the delta in
// retained size into the shared stats so memory accounting tracks the writer's
// current footprint.
writeChunk(chunk); long recordedSizeInBytes = getRetainedBytes(); stats.updateSizeInBytes(recordedSizeInBytes - previouslyRecordedSizeInBytes); previouslyRecordedSizeInBytes = recordedSizeInBytes;
// NOTE(review): mid-method fragment of stripe finalization — closes the current row
// group, serializes the stripe footer, records stripe statistics/information for the
// eventual file footer (and for optional write validation), tracks the retained size
// of closed stripes, and reports the written stripe to the stats object.
// toDenseList presumably pads per-column entries to orcTypes.size() — TODO confirm.
finishRowGroup(); StripeFooter stripeFooter = new StripeFooter(allStreams, toDenseList(columnEncodings, orcTypes.size())); Slice footer = metadataWriter.writeStripeFooter(stripeFooter); outputData.add(createDataOutput(footer)); StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size())); recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics)); StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length()); ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics); closedStripes.add(closedStripe); closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes(); recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows())); stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
// NOTE(review): constructor fragment — captures encoding/compression and size/row-count
// limits from options, mirroring each setting into the optional write-validation
// recorder, then seeds the memory-usage stats with the writer's initial retained size.
// The addMetadataProperty call references `entry`, so it presumably sits inside a loop
// over the metadata map in the full constructor — TODO confirm against the enclosing code.
this.orcEncoding = requireNonNull(orcEncoding, "orcEncoding is null"); this.compression = requireNonNull(compression, "compression is null"); recordValidation(validation -> validation.setCompression(compression)); this.stripeMaxRowCount = options.getStripeMaxRowCount(); this.rowGroupMaxRowCount = options.getRowGroupMaxRowCount(); recordValidation(validation -> validation.setRowGroupMaxRowCount(rowGroupMaxRowCount)); this.maxCompressionBufferSize = toIntExact(options.getMaxCompressionBufferSize().toBytes()); recordValidation(validation -> validation.setColumnNames(columnNames)); recordValidation(validation -> validation.addMetadataProperty(entry.getKey(), utf8Slice(entry.getValue()))); this.previouslyRecordedSizeInBytes = getRetainedBytes(); stats.updateSizeInBytes(previouslyRecordedSizeInBytes);
private void flushStripe(FlushReason flushReason) throws IOException { List<OrcDataOutput> outputData = new ArrayList<>(); long stripeStartOffset = orcDataSink.size(); // add header to first stripe (this is not required but nice to have) if (closedStripes.isEmpty()) { outputData.add(createDataOutput(MAGIC)); stripeStartOffset += MAGIC.length(); } // add stripe data outputData.addAll(bufferStripeData(stripeStartOffset, flushReason)); // if the file is being closed, add the file footer if (flushReason == CLOSED) { outputData.addAll(bufferFileFooter()); } // write all data orcDataSink.write(outputData); // open next stripe columnWriters.forEach(ColumnWriter::reset); dictionaryCompressionOptimizer.reset(); rowGroupRowCount = 0; stripeRowCount = 0; bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum()); }
// NOTE(review): fragment — these calls appear to come from separate flush paths
// (a row-group boundary plus the three stripe-flush triggers: row limit, byte limit,
// and dictionary memory full); verify the enclosing control flow before relying on
// this ordering.
finishRowGroup(); flushStripe(MAX_ROWS); flushStripe(MAX_BYTES); flushStripe(DICTIONARY_FULL);
@Override
public void close()
        throws IOException
{
    // idempotent: repeated calls are no-ops
    if (closed) {
        return;
    }
    closed = true;

    // release this writer's contribution to the shared memory stats before the final flush
    stats.updateSizeInBytes(-previouslyRecordedSizeInBytes);
    previouslyRecordedSizeInBytes = 0;

    // CLOSED flush writes the last stripe plus the file footer, then the sink is closed
    flushStripe(CLOSED);
    orcDataSink.close();
}
/**
 * Writes a single column named "test" to {@code outputFile} with the given format and
 * compression, then re-reads the file and validates it against the write-time checksums
 * (validation mode BOTH).
 *
 * Fix: the original built an unused {@code metadata} ImmutableMap.Builder (dead code —
 * {@code ImmutableMap.of()} was passed to the writer instead), and split the writer
 * declaration from its initialization; both removed/merged here.
 */
private static void writeOrcColumnPresto(File outputFile, Format format, CompressionKind compression, Type type, Iterator<?> values, OrcWriterStats stats)
        throws Exception
{
    OrcWriter writer = new OrcWriter(
            new OutputStreamOrcDataSink(new FileOutputStream(outputFile)),
            ImmutableList.of("test"),
            ImmutableList.of(type),
            format.getOrcEncoding(),
            compression,
            new OrcWriterOptions(),
            ImmutableMap.of(),
            HIVE_STORAGE_TIME_ZONE,
            true,
            BOTH,
            stats);

    // Materialize all values into a single page; 1024 is only an initial capacity hint.
    BlockBuilder blockBuilder = type.createBlockBuilder(null, 1024);
    while (values.hasNext()) {
        writeValue(type, blockBuilder, values.next());
    }
    writer.write(new Page(blockBuilder.build()));
    writer.close();

    // Re-read the closed file and verify it matches what was recorded during the write.
    writer.validate(new FileOrcDataSource(outputFile, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true));
}
/**
 * Builds an LZ4-compressed ORC writer over the given sink, naming each column by its
 * zero-based index and enabling full write validation (BOTH).
 */
private static OrcWriter createOrcFileWriter(OrcDataSink sink, List<Type> types)
{
    // column names are simply "0", "1", ... for each type
    List<String> columnNames = IntStream.range(0, types.size())
            .mapToObj(Integer::toString)
            .collect(toImmutableList());

    // a 0-byte max string statistics limit presumably disables string statistics — TODO confirm
    return new OrcWriter(
            sink,
            columnNames,
            types,
            ORC,
            LZ4,
            new OrcWriterOptions()
                    .withMaxStringStatisticsLimit(new DataSize(0, BYTE))
                    .withStripeMinSize(new DataSize(64, MEGABYTE))
                    .withDictionaryMaxMemory(new DataSize(1, MEGABYTE)),
            ImmutableMap.of(),
            UTC,
            false,
            OrcWriteValidationMode.BOTH,
            new OrcWriterStats());
}
}
// NOTE(review): mid-method fragment — after buffering a chunk, fold the delta in
// retained size into the shared stats so memory accounting tracks the writer's
// current footprint.
writeChunk(chunk); long recordedSizeInBytes = getRetainedBytes(); stats.updateSizeInBytes(recordedSizeInBytes - previouslyRecordedSizeInBytes); previouslyRecordedSizeInBytes = recordedSizeInBytes;
// NOTE(review): mid-method fragment of stripe finalization — closes the current row
// group, serializes the stripe footer, records stripe statistics/information for the
// eventual file footer (and for optional write validation), tracks the retained size
// of closed stripes, and reports the written stripe to the stats object.
// toDenseList presumably pads per-column entries to orcTypes.size() — TODO confirm.
finishRowGroup(); StripeFooter stripeFooter = new StripeFooter(allStreams, toDenseList(columnEncodings, orcTypes.size())); Slice footer = metadataWriter.writeStripeFooter(stripeFooter); outputData.add(createDataOutput(footer)); StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size())); recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics)); StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length()); ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics); closedStripes.add(closedStripe); closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes(); recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows())); stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
// NOTE(review): constructor fragment — captures encoding/compression and size/row-count
// limits from options, mirroring each setting into the optional write-validation
// recorder, then seeds the memory-usage stats with the writer's initial retained size.
// The addMetadataProperty call references `entry`, so it presumably sits inside a loop
// over the metadata map in the full constructor — TODO confirm against the enclosing code.
this.orcEncoding = requireNonNull(orcEncoding, "orcEncoding is null"); this.compression = requireNonNull(compression, "compression is null"); recordValidation(validation -> validation.setCompression(compression)); this.stripeMaxRowCount = options.getStripeMaxRowCount(); this.rowGroupMaxRowCount = options.getRowGroupMaxRowCount(); recordValidation(validation -> validation.setRowGroupMaxRowCount(rowGroupMaxRowCount)); this.maxCompressionBufferSize = toIntExact(options.getMaxCompressionBufferSize().toBytes()); recordValidation(validation -> validation.setColumnNames(columnNames)); recordValidation(validation -> validation.addMetadataProperty(entry.getKey(), utf8Slice(entry.getValue()))); this.previouslyRecordedSizeInBytes = getRetainedBytes(); stats.updateSizeInBytes(previouslyRecordedSizeInBytes);
private void flushStripe(FlushReason flushReason) throws IOException { List<OrcDataOutput> outputData = new ArrayList<>(); long stripeStartOffset = orcDataSink.size(); // add header to first stripe (this is not required but nice to have) if (closedStripes.isEmpty()) { outputData.add(createDataOutput(MAGIC)); stripeStartOffset += MAGIC.length(); } // add stripe data outputData.addAll(bufferStripeData(stripeStartOffset, flushReason)); // if the file is being closed, add the file footer if (flushReason == CLOSED) { outputData.addAll(bufferFileFooter()); } // write all data orcDataSink.write(outputData); // open next stripe columnWriters.forEach(ColumnWriter::reset); dictionaryCompressionOptimizer.reset(); rowGroupRowCount = 0; stripeRowCount = 0; bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum()); }