/**
 * Creates a simulator over the supplied dictionary columns and builds the
 * {@code DictionaryCompressionOptimizer} that will be driven by it.
 *
 * @param stripeMinBytes forwarded to the optimizer only; not retained by the simulator
 * @param stripeMaxBytes retained and also forwarded to the optimizer
 * @param stripeMaxRowCount retained and also forwarded to the optimizer
 * @param dictionaryMemoryMaxBytes forwarded to the optimizer only
 * @param otherColumnsBytesPerRow retained by the simulator
 * @param dictionaryColumns columns to simulate; copied into an immutable set
 */
public DataSimulator(
        int stripeMinBytes,
        int stripeMaxBytes,
        int stripeMaxRowCount,
        int dictionaryMemoryMaxBytes,
        int otherColumnsBytesPerRow,
        TestDictionaryColumn... dictionaryColumns) {
    // Snapshot the columns first; the optimizer below is built from this same set.
    this.dictionaryColumns = ImmutableSet.copyOf(dictionaryColumns);

    this.otherColumnsBytesPerRow = otherColumnsBytesPerRow;
    this.stripeMaxRowCount = stripeMaxRowCount;
    this.stripeMaxBytes = stripeMaxBytes;

    this.optimizer = new DictionaryCompressionOptimizer(
            this.dictionaryColumns,
            stripeMinBytes,
            stripeMaxBytes,
            stripeMaxRowCount,
            dictionaryMemoryMaxBytes);
}
/**
 * Returns the estimated index size: the per-value estimate for the current
 * dictionary entry count multiplied by the number of non-null values.
 *
 * <p>Must not be called once the column is direct encoded; {@code checkState}
 * rejects that case. {@code toIntExact} rejects a product that does not fit in an int.
 */
@Override
public int getIndexBytes() {
    checkState(!directEncoded);
    long estimatedIndexBytes = estimateIndexBytesPerValue(dictionary.getEntryCount()) * getNonNullValueCount();
    return toIntExact(estimatedIndexBytes);
}
/**
 * Final optimization pass: runs {@code convertLowCompressionStreams} for the
 * given buffered size.
 *
 * @param bufferedBytes number of bytes currently buffered, as reported by the caller
 */
public void finalOptimize(int bufferedBytes) {
    // The updated byte count returned by the conversion pass is not needed here.
    convertLowCompressionStreams(bufferedBytes);
}
private int convertLowCompressionStreams(int bufferedBytes) { // convert all low compression column to direct for (DictionaryColumnManager dictionaryWriter : ImmutableList.copyOf(directConversionCandidates)) { if (dictionaryWriter.getCompressionRatio() < DICTIONARY_MIN_COMPRESSION_RATIO) { int columnBufferedBytes = toIntExact(dictionaryWriter.getBufferedBytes()); OptionalInt directBytes = tryConvertToDirect(dictionaryWriter, getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - columnBufferedBytes; if (bufferedBytes >= stripeMaxBytes) { return bufferedBytes; } } } } return bufferedBytes; }
bufferedBytes = convertLowCompressionStreams(bufferedBytes); DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount); int selectDictionaryColumnBufferedBytes = toIntExact(projection.getColumnToConvert().getBufferedBytes()); OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes; double currentCompressionRatio = currentCompressionRatio(nonDictionaryBufferedBytes); while (!directConversionCandidates.isEmpty() && bufferedBytes < stripeMaxBytes) { DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount); if (projection.getPredictedFileCompressionRatio() < currentCompressionRatio) { return; OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes;
dictionaryCompressionOptimizer.finalOptimize(bufferedBytes); closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes(); recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows())); stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
private void flushStripe(FlushReason flushReason) throws IOException { List<OrcDataOutput> outputData = new ArrayList<>(); long stripeStartOffset = orcDataSink.size(); // add header to first stripe (this is not required but nice to have) if (closedStripes.isEmpty()) { outputData.add(createDataOutput(MAGIC)); stripeStartOffset += MAGIC.length(); } // add stripe data outputData.addAll(bufferStripeData(stripeStartOffset, flushReason)); // if the file is being closed, add the file footer if (flushReason == CLOSED) { outputData.addAll(bufferFileFooter()); } // write all data orcDataSink.write(outputData); // open next stripe columnWriters.forEach(ColumnWriter::reset); dictionaryCompressionOptimizer.reset(); rowGroupRowCount = 0; stripeRowCount = 0; bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum()); }
/**
 * Runs the optimizer's final optimization pass using the simulator's current
 * buffered byte count, which {@code toIntExact} requires to fit in an int.
 */
public void finalOptimize() {
    long currentBufferedBytes = getBufferedBytes();
    optimizer.finalOptimize(toIntExact(currentBufferedBytes));
}
/**
 * Asks the optimizer whether it considers itself full at the simulator's
 * current buffered byte count.
 *
 * @return the result of {@code optimizer.isFull} for the current buffered bytes
 */
public boolean isDictionaryMemoryFull() {
    boolean full = optimizer.isFull(getBufferedBytes());
    return full;
}
bufferedBytes = convertLowCompressionStreams(bufferedBytes); DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount); int selectDictionaryColumnBufferedBytes = toIntExact(projection.getColumnToConvert().getBufferedBytes()); OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes; double currentCompressionRatio = currentCompressionRatio(nonDictionaryBufferedBytes); while (!directConversionCandidates.isEmpty() && bufferedBytes < stripeMaxBytes) { DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount); if (projection.getPredictedFileCompressionRatio() < currentCompressionRatio) { return; OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes;
dictionaryCompressionOptimizer.finalOptimize(bufferedBytes); closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes(); recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows())); stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
/**
 * Returns the simulator to its initial state: zeroes the row count, resets
 * the optimizer, then resets every dictionary column.
 */
public void reset() {
    rowCount = 0;
    optimizer.reset();
    dictionaryColumns.forEach(TestDictionaryColumn::reset);
}
/**
 * Runs the optimizer's final optimization pass using the simulator's current
 * buffered byte count, which {@code toIntExact} requires to fit in an int.
 */
public void finalOptimize() {
    long currentBufferedBytes = getBufferedBytes();
    optimizer.finalOptimize(toIntExact(currentBufferedBytes));
}
/**
 * Asks the optimizer whether it considers itself full at the simulator's
 * current buffered byte count.
 *
 * @return the result of {@code optimizer.isFull} for the current buffered bytes
 */
public boolean isDictionaryMemoryFull() {
    boolean full = optimizer.isFull(getBufferedBytes());
    return full;
}
private int convertLowCompressionStreams(int bufferedBytes) { // convert all low compression column to direct for (DictionaryColumnManager dictionaryWriter : ImmutableList.copyOf(directConversionCandidates)) { if (dictionaryWriter.getCompressionRatio() < DICTIONARY_MIN_COMPRESSION_RATIO) { int columnBufferedBytes = toIntExact(dictionaryWriter.getBufferedBytes()); OptionalInt directBytes = tryConvertToDirect(dictionaryWriter, getMaxDirectBytes(bufferedBytes)); if (directBytes.isPresent()) { bufferedBytes = bufferedBytes + directBytes.getAsInt() - columnBufferedBytes; if (bufferedBytes >= stripeMaxBytes) { return bufferedBytes; } } } } return bufferedBytes; }
/**
 * Advances the simulation in steps of 1024 rows until something changes.
 *
 * <p>Before each step the loop stops if any of the following holds, checked
 * in this order: the optimizer reports full, the buffered bytes reach
 * {@code stripeMaxBytes}, the row count reaches {@code stripeMaxRowCount}, or
 * the direct-column flags differ from those captured on entry. Each step
 * advances every dictionary column to the new row count and then runs the
 * optimizer.
 */
public void advanceToNextStateChange() {
    List<Boolean> flagsOnEntry = getDirectColumnFlags();

    while (true) {
        if (optimizer.isFull(getBufferedBytes())) {
            break;
        }
        if (getBufferedBytes() >= stripeMaxBytes) {
            break;
        }
        if (getRowCount() >= stripeMaxRowCount) {
            break;
        }
        if (!flagsOnEntry.equals(getDirectColumnFlags())) {
            break;
        }

        rowCount += 1024;
        for (TestDictionaryColumn column : dictionaryColumns) {
            column.advanceTo(rowCount);
        }
        optimizer.optimize(toIntExact(getBufferedBytes()), getRowCount());
    }
}
/**
 * Returns the estimated index size: the per-value estimate for the current
 * dictionary entry count multiplied by the number of non-null values.
 *
 * <p>Must not be called once the column is direct encoded; {@code checkState}
 * rejects that case. {@code toIntExact} rejects a product that does not fit in an int.
 */
@Override
public int getIndexBytes() {
    checkState(!directEncoded);
    long estimatedIndexBytes = estimateIndexBytesPerValue(dictionary.getEntryCount()) * getNonNullValueCount();
    return toIntExact(estimatedIndexBytes);
}
private void flushStripe(FlushReason flushReason) throws IOException { List<OrcDataOutput> outputData = new ArrayList<>(); long stripeStartOffset = orcDataSink.size(); // add header to first stripe (this is not required but nice to have) if (closedStripes.isEmpty()) { outputData.add(createDataOutput(MAGIC)); stripeStartOffset += MAGIC.length(); } // add stripe data outputData.addAll(bufferStripeData(stripeStartOffset, flushReason)); // if the file is being closed, add the file footer if (flushReason == CLOSED) { outputData.addAll(bufferFileFooter()); } // write all data orcDataSink.write(outputData); // open next stripe columnWriters.forEach(ColumnWriter::reset); dictionaryCompressionOptimizer.reset(); rowGroupRowCount = 0; stripeRowCount = 0; bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum()); }