/**
 * {@link VectorizedOrcAcidRowBatchReader} is always used for vectorized reads of acid tables.
 * In some cases this cannot be used from the LLAP IO elevator because
 * {@link RecordReader#getRowNumber()} is not (currently) available there but is required to
 * generate ROW__IDs for "original" files.
 * @param hasDeletes - if there are any deletes that apply to this split
 * todo: HIVE-17944
 */
static boolean canUseLlapForAcid(OrcSplit split, boolean hasDeletes, Configuration conf) {
  if (!split.isOriginal()) {
    return true;
  }
  VectorizedRowBatchCtx rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
  if (rbCtx == null) {
    throw new IllegalStateException("Could not create VectorizedRowBatchCtx for " + split.getPath());
  }
  return !needSyntheticRowIds(split.isOriginal(), hasDeletes, areRowIdsProjected(rbCtx));
}
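// Illustrative sketch, not part of the source above: how a caller might combine
// canUseLlapForAcid() with the vectorized reader. The useLlapElevator(...) helper and the
// exact VectorizedOrcAcidRowBatchReader constructor shape are assumptions for illustration only.
org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> chooseAcidReader(
    OrcSplit split, boolean hasDeletes, JobConf conf, Reporter reporter) throws IOException {
  if (canUseLlapForAcid(split, hasDeletes, conf)) {
    // LLAP IO can serve this split; no synthetic ROW__IDs are needed.
    return useLlapElevator(split, conf, reporter);   // hypothetical LLAP IO hook
  }
  // "Original" files that need synthetic ROW__IDs (or have deletes) take the regular path.
  return new VectorizedOrcAcidRowBatchReader(split, conf, reporter);   // constructor shape assumed
}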
private static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException {
  Path path = orcSplit.getPath();
  Path root;
  if (orcSplit.hasBase()) {
    if (orcSplit.isOriginal()) {
      root = path.getParent();
    } else {
      root = path.getParent().getParent();
    }
  } else {
    root = path;
  }
  return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas());
}
/**
 * Returns whether it is possible to create a valid instance of this class for a given split.
 * @param conf is the job configuration
 * @param inputSplit the split to check
 * @return true if it is possible, else false.
 */
public static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf, InputSplit inputSplit) {
  if (!(inputSplit instanceof OrcSplit)) {
    return false; // must be an instance of OrcSplit.
  }
  // First check if we are reading any original files in the split.
  // To simplify the vectorization logic, the vectorized acid row batch reader does not handle
  // original files for now as they have a different schema than a regular ACID file.
  final OrcSplit split = (OrcSplit) inputSplit;
  if (AcidUtils.getAcidOperationalProperties(conf).isSplitUpdate() && !split.isOriginal()) {
    // When split-update is turned on for ACID, a more optimized vectorized batch reader
    // can be created. But still only possible when we are *NOT* reading any originals.
    return true;
  }
  return false; // no split-update or possibly reading originals!
}
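// Illustrative sketch (assumption, not from the source above): the check is typically consulted
// before constructing the reader, with a fall-back to the non-vectorized ACID path when a
// vectorized reader cannot be created for the split. createRowModeAcidReader(...) is hypothetical.
org.apache.hadoop.mapred.RecordReader<?, ?> createAcidReader(JobConf conf, InputSplit inputSplit,
    Reporter reporter) throws IOException {
  if (VectorizedOrcAcidRowBatchReader.canCreateVectorizedAcidRowBatchReaderOnSplit(conf, inputSplit)) {
    return new VectorizedOrcAcidRowBatchReader(inputSplit, conf, reporter);   // constructor shape assumed
  }
  return createRowModeAcidReader(conf, inputSplit, reporter);   // hypothetical fallback helper
}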
static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException {
  Path path = orcSplit.getPath();
  Path root;
  if (orcSplit.hasBase()) {
    if (orcSplit.isOriginal()) {
      root = orcSplit.getRootDir();
    } else {
      root = path.getParent().getParent(); //todo: why not just use getRootDir()?
      assert root.equals(orcSplit.getRootDir()) : "root mismatch: baseDir=" + orcSplit.getRootDir() +
          " path.p.p=" + root;
    }
  } else {
    throw new IllegalStateException("Split w/o base w/Acid 2.0??: " + path);
  }
  return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas());
}
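// Layout note (illustrative, not from the source): a non-original ACID file lives two levels
// below the table/partition root, e.g. /warehouse/t/delta_0000001_0000001_0000/bucket_00000,
// so path.getParent() is the delta (or base) directory and path.getParent().getParent() is the
// root; that is why the assert expects it to equal getRootDir(). For an "original" file such as
// /warehouse/t/000000_0 the root is simply the file's parent directory.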
public OrcNewSplit(OrcSplit inner) throws IOException {
  super(inner.getPath(), inner.getStart(), inner.getLength(), inner.getLocations());
  this.orcTail = inner.getOrcTail();
  this.hasFooter = inner.hasFooter();
  this.isOriginal = inner.isOriginal();
  this.hasBase = inner.hasBase();
  this.deltas.addAll(inner.getDeltas());
}
SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, Reader.Options readerOptions) throws IOException {
  final Path[] deleteDeltas = getDeleteDeltaDirsFromSplit(orcSplit);
  if (deleteDeltas.length > 0) {
    int bucket = AcidUtils.parseBucketId(orcSplit.getPath());
    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
    this.validWriteIdList = (txnString == null) ?
        new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
    LOG.debug("Using SortMergedDeleteEventRegistry");
    OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isDeleteReader(true);
    assert !orcSplit.isOriginal() : "If this now supports Original splits, set up mergeOptions properly";
    this.deleteRecords = new OrcRawRecordMerger(conf, true, null, false, bucket, validWriteIdList,
        readerOptions, deleteDeltas, mergerOptions);
    this.deleteRecordKey = new OrcRawRecordMerger.ReaderKey();
    this.deleteRecordValue = this.deleteRecords.createValue();
    // Initialize the first value in the delete reader.
    this.isDeleteRecordAvailable = this.deleteRecords.next(deleteRecordKey, deleteRecordValue);
  } else {
    this.isDeleteRecordAvailable = false;
    this.deleteRecordKey = null;
    this.deleteRecordValue = null;
    this.deleteRecords = null;
  }
}
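// Illustrative sketch (assumption): how the one-record look-ahead kept in deleteRecordKey /
// deleteRecordValue can be consumed in sort-merge fashion while scanning data rows. The
// isDeleted(...) helper name is hypothetical; compareRow(...) is assumed to compare only the
// row identity (write id, bucket, row id), ignoring the current transaction.
private boolean isDeleted(RecordIdentifier rowId) throws IOException {
  // advance the delete stream until it is at or past the current row
  while (isDeleteRecordAvailable && deleteRecordKey.compareRow(rowId) < 0) {
    isDeleteRecordAvailable = deleteRecords.next(deleteRecordKey, deleteRecordValue);
  }
  // the row is deleted if some delete event carries exactly this ROW__ID
  return isDeleteRecordAvailable && deleteRecordKey.compareRow(rowId) == 0;
}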
    OrcFile.readerOptions(conf));
if (orcSplit.isOriginal()) {
if (split.isOriginal()) {
  root = path.getParent();
} else {

        new ValidReadTxnList(txnString);
final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(),
    bucket, validTxnList, readOptions, deltas);
return new RowReader<OrcStruct>() {
"delta_0000001_0000010_0000/bucket_00000", splits.get(0).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal());
if (split.isOriginal()) {
  mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath());

    new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket,
        validWriteIdList, readOptions, deltas, mergerOptions);
return new RowReader<OrcStruct>() {
isOriginal = orcSplit.isOriginal();
if (isOriginal) {
  recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertTrue(splits.get(1).isOriginal()); assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertTrue(splits.get(1).isOriginal()); assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(1)).getSplits(); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00001", splits.get(1).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); assertEquals(1, splits.size()); assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(1)).getSplits(); assertEquals(1, splits.size()); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
assertEquals(root.toUri().toString() + File.separator + "000000_0",
    splits.get(0).getPath().toUri().toString());
assertTrue(splits.get(0).isOriginal());
assertTrue(splits.get(1).isOriginal());
assertTrue(splits.get(2).isOriginal());
"base_10000002/bucket_00001", splits.get(0).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal());
"delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); assertFalse(splits.get(2).isOriginal());
private static boolean hasDeltas(OrcSplit orcSplit) throws IOException {
  final Path path = orcSplit.getPath();
  final Path root;
  // If the split has a base, extract the base file size, bucket and root path info.
  if (orcSplit.hasBase()) {
    if (orcSplit.isOriginal()) {
      root = path.getParent();
    } else {
      root = path.getParent().getParent();
    }
  } else {
    root = path;
  }
  final Path[] deltas = AcidUtils.deserializeDeltas(root, orcSplit.getDeltas());
  return deltas.length > 0;
}
long offset = fSplit.getStart();
long length = fSplit.getLength();
options.schema(fSplit.isOriginal() ? hiveReader.getSchema()
    : hiveReader.getSchema().getChildren().get(TRANS_ROW_COLUMN_INDEX));
options.range(offset, length);

boolean[] include = OrcInputFormat.genIncludedColumns(types, jobConf, fSplit.isOriginal());

if (!fSplit.isOriginal()) {

if (!fSplit.isOriginal()) {
  selectedColNames = ArrayUtils.addAll(new String[]{"row"}, selectedColNames);
}

final ORCScanFilter orcScanFilter = (ORCScanFilter) filter;
final SearchArgument sarg = orcScanFilter.getSarg();
options.searchArgument(sarg, OrcInputFormat.getSargColumnNames(selectedColNames, types,
    options.getInclude(), fSplit.isOriginal()));

hiveBatch = createVectorizedRowBatch(partitionOI, fSplit.isOriginal());
copiers = HiveORCCopiers.createCopiers(projectedColOrdinals, vectors, hiveBatch, fSplit.isOriginal());
public OrcNewSplit(OrcSplit inner) throws IOException {
  super(inner.getPath(), inner.getStart(), inner.getLength(), inner.getLocations());
  this.fileMetaInfo = inner.getFileMetaInfo();
  this.hasFooter = inner.hasFooter();
  this.isOriginal = inner.isOriginal();
  this.hasBase = inner.hasBase();
  this.deltas.addAll(inner.getDeltas());
}
if (split.isOriginal() && split.getDeltas().isEmpty()) {
  if (vectorMode) {
    return createVectorizedReader(inputSplit, conf, reporter);