/**
 * Obtain the latest file slice whose base instant time is one of the instants in the
 * supplied commit range.
 *
 * @param commitRange commit instant times that define the range of interest
 * @return the first (i.e. latest, given slice ordering) matching file slice, if any
 */
public Optional<FileSlice> getLatestFileSliceInRange(List<String> commitRange) {
  for (FileSlice candidate : (Iterable<FileSlice>) getAllFileSlices()::iterator) {
    if (commitRange.contains(candidate.getBaseInstantTime())) {
      return Optional.of(candidate);
    }
  }
  return Optional.empty();
}
/**
 * A FileSlice is considered committed, if one of the following is true - There is a committed
 * data file - There are some log files, that are based off a commit or delta commit
 */
private boolean isFileSliceCommitted(FileSlice slice) {
  final String baseInstant = slice.getBaseInstantTime();
  // NOTE(review): assumes lastInstant is present when this is called — TODO confirm callers
  // only invoke this with a non-empty timeline.
  final String maxCommitTime = lastInstant.get().getTimestamp();
  // The base instant must be known to the timeline (or predate it entirely).
  if (!timeline.containsOrBeforeTimelineStarts(baseInstant)) {
    return false;
  }
  // And it must not be newer than the last completed instant.
  return HoodieTimeline.compareTimestamps(baseInstant, maxCommitTime,
      HoodieTimeline.LESSER_OR_EQUAL);
}
/**
 * Obtain the latest file slice, upto a commitTime i.e <= maxCommitTime
 *
 * @param maxCommitTime inclusive upper bound on the slice's base instant time
 * @return latest file slice at or before the given time, if any
 */
public Optional<FileSlice> getLatestFileSliceBeforeOrOn(String maxCommitTime) {
  for (FileSlice slice : (Iterable<FileSlice>) getAllFileSlices()::iterator) {
    boolean atOrBefore = HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(),
        maxCommitTime, HoodieTimeline.LESSER_OR_EQUAL);
    if (atOrBefore) {
      return Optional.of(slice);
    }
  }
  return Optional.empty();
}
/**
 * Returns true if the file-group is under pending-compaction and the file-slice' baseInstant matches
 * compaction Instant
 *
 * @param fileSlice File Slice
 * @return true when the slice's base instant equals the pending compaction's instant time
 */
private boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) {
  Pair<String, CompactionOperation> pending = fileIdToPendingCompaction.get(fileSlice.getFileId());
  if (pending == null) {
    // No pending compaction registered for this file-group.
    return false;
  }
  return fileSlice.getBaseInstantTime().equals(pending.getKey());
}
/**
 * Obtain the latest file slice, upto a commitTime i.e < maxInstantTime
 *
 * @param maxInstantTime Max Instant Time (exclusive upper bound)
 * @return latest file slice strictly before the given time, if any
 */
public Optional<FileSlice> getLatestFileSliceBefore(String maxInstantTime) {
  for (FileSlice slice : (Iterable<FileSlice>) getAllFileSlices()::iterator) {
    boolean strictlyBefore = HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(),
        maxInstantTime, HoodieTimeline.LESSER);
    if (strictlyBefore) {
      return Optional.of(slice);
    }
  }
  return Optional.empty();
}
/** * Gets the latest version < commitTime. This version file could still be used by queries. */ private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant commitTime) { for (FileSlice file : fileSliceList) { String fileCommitTime = file.getBaseInstantTime(); if (HoodieTimeline .compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) { // fileList is sorted on the reverse, so the first commit we find <= commitTime is the // one we want return fileCommitTime; } } // There is no version of this file which is <= commitTime return null; }
/** * Gets the latest version < commitTime. This version file could still be used by queries. */ private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant commitTime) { for (FileSlice file : fileSliceList) { String fileCommitTime = file.getBaseInstantTime(); if (HoodieTimeline .compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) { // fileList is sorted on the reverse, so the first commit we find <= commitTime is the // one we want return fileCommitTime; } } // There is no version of this file which is <= commitTime return null; }
/** * Determine if file slice needed to be preserved for pending compaction * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) { CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId()); if (null != op) { // If file slice's instant time is newer or same as that of operation, do not clean return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(), HoodieTimeline.GREATER_OR_EQUAL); } return false; } }
/** * Determine if file slice needed to be preserved for pending compaction * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) { CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId()); if (null != op) { // If file slice's instant time is newer or same as that of operation, do not clean return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(), HoodieTimeline.GREATER_OR_EQUAL); } return false; } }
/** * If the file-slice is because of pending compaction instant, this method merges the file-slice with the one before * the compaction instant time * @param fileGroup File Group for which the file slice belongs to * @param fileSlice File Slice which needs to be merged * @return */ private FileSlice getMergedFileSlice(HoodieFileGroup fileGroup, FileSlice fileSlice) { // if the file-group is under construction, pick the latest before compaction instant time. if (fileIdToPendingCompaction.containsKey(fileSlice.getFileId())) { String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getKey(); if (fileSlice.getBaseInstantTime().equals(compactionInstantTime)) { Optional<FileSlice> prevFileSlice = fileGroup.getLatestFileSliceBefore(compactionInstantTime); if (prevFileSlice.isPresent()) { return mergeCompactionPendingFileSlices(fileSlice, prevFileSlice.get()); } } } return fileSlice; }
/** * Helper to merge last 2 file-slices. These 2 file-slices do not have compaction done yet. * * @param lastSlice Latest File slice for a file-group * @param penultimateSlice Penultimate file slice for a file-group in commit timeline order */ private static FileSlice mergeCompactionPendingFileSlices(FileSlice lastSlice, FileSlice penultimateSlice) { FileSlice merged = new FileSlice(penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId()); if (penultimateSlice.getDataFile().isPresent()) { merged.setDataFile(penultimateSlice.getDataFile().get()); } // Add Log files from penultimate and last slices penultimateSlice.getLogFiles().forEach(merged::addLogFile); lastSlice.getLogFiles().forEach(merged::addLogFile); return merged; }
/** * With async compaction, it is possible to see partial/complete data-files due to inflight-compactions, * Ignore those data-files * @param fileSlice File Slice * @return */ private FileSlice filterDataFileAfterPendingCompaction(FileSlice fileSlice) { if (isFileSliceAfterPendingCompaction(fileSlice)) { // Data file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. FileSlice transformed = new FileSlice(fileSlice.getBaseInstantTime(), fileSlice.getFileId()); fileSlice.getLogFiles().forEach(transformed::addLogFile); return transformed; } return fileSlice; }
/**
 * Generate compaction operation from file-slice
 *
 * @param partitionPath Partition path
 * @param fileSlice File Slice
 * @param metricsCaptureFunction Metrics Capture function
 * @return Compaction Operation
 */
public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice,
    Optional<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
  HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder();
  builder.setPartitionPath(partitionPath);
  builder.setFileId(fileSlice.getFileId());
  builder.setBaseInstantTime(fileSlice.getBaseInstantTime());
  builder.setDeltaFilePaths(
      fileSlice.getLogFiles().map(lf -> lf.getPath().toString()).collect(Collectors.toList()));
  // Data file and metrics are optional; set them only when present.
  fileSlice.getDataFile().ifPresent(df -> builder.setDataFilePath(df.getPath()));
  metricsCaptureFunction
      .ifPresent(fn -> builder.setMetrics(fn.apply(Pair.of(partitionPath, fileSlice))));
  return builder.build();
}
baseInstantTime = fileSlice.get().getBaseInstantTime(); } else {
private void executeCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException { client.compact(compactionInstantTime); List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg); assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent()); assertFalse("Verify all file-slices have base-instant same as compaction instant", fileSliceList.stream().filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)) .findAny().isPresent()); assertFalse("Verify all file-slices have data-files", fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent()); if (hasDeltaCommitAfterPendingCompaction) { assertFalse("Verify all file-slices have atleast one log-file", fileSliceList.stream().filter(fs -> fs.getLogFiles().count() == 0).findAny().isPresent()); } else { assertFalse("Verify all file-slices have no log-files", fileSliceList.stream().filter(fs -> fs.getLogFiles().count() > 0).findAny().isPresent()); } // verify that there is a commit table = HoodieTable.getHoodieTable( new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg, jsc); HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants(); String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); assertEquals("Expect compaction instant time to be the latest commit time", latestCompactionCommitTime, compactionInstantTime); assertEquals("Must contain expected records", expectedNumRecs, HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); }
baseInstantTime = fileSlice.get().getBaseInstantTime(); } else {
/**
 * HELPER METHODS FOR TESTING
 **/
private void validateDeltaCommit(String latestDeltaCommit,
    final Map<String, Pair<String, HoodieCompactionOperation>> fileIdToCompactionOperation, HoodieWriteConfig cfg)
    throws IOException {
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
  HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg, jsc);
  // Each latest slice must either match its pending compaction operation or predate the delta commit.
  for (FileSlice fileSlice : getCurrentLatestFileSlices(table, cfg)) {
    Pair<String, HoodieCompactionOperation> opPair = fileIdToCompactionOperation.get(fileSlice.getFileId());
    if (opPair == null) {
      assertTrue("Expect baseInstant to be less than or equal to latestDeltaCommit",
          fileSlice.getBaseInstantTime().compareTo(latestDeltaCommit) <= 0);
      continue;
    }
    System.out.println("FileSlice :" + fileSlice);
    assertTrue("Expect baseInstant to match compaction Instant",
        fileSlice.getBaseInstantTime().equals(opPair.getKey()));
    assertTrue("Expect atleast one log file to be present where the latest delta commit was written",
        fileSlice.getLogFiles().count() > 0);
    assertFalse("Expect no data-file to be present", fileSlice.getDataFile().isPresent());
  }
}
/**
 * Validates if generated compaction operation matches with input file slice and partition path
 *
 * @param slice File Slice
 * @param op HoodieCompactionOperation
 * @param expPartitionPath Partition path
 */
private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op,
    String expPartitionPath) {
  Assert.assertEquals("Partition path is correct", expPartitionPath, op.getPartitionPath());
  Assert.assertEquals("Same base-instant", slice.getBaseInstantTime(), op.getBaseInstantTime());
  Assert.assertEquals("Same file-id", slice.getFileId(), op.getFileId());
  // Data file is optional; compare its path only when the slice carries one.
  slice.getDataFile().ifPresent(
      df -> Assert.assertEquals("Same data-file", df.getPath(), op.getDataFilePath()));
  // Delta file paths must line up positionally with the slice's log files.
  List<String> paths = slice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList());
  for (int idx = 0; idx < paths.size(); idx++) {
    Assert.assertEquals("Log File Index " + idx, paths.get(idx), op.getDeltaFilePaths().get(idx));
  }
  Assert.assertEquals("Metrics set", metrics, op.getMetrics());
}
}
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); Set<HoodieLogFile> expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .flatMap(fs -> fs.getLogFiles()) .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())).forEach(fs -> { Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent()); .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count()))
assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime()); List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList()); assertEquals("Correct number of log-files shows up in file-slice", 2, logFiles.size()); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime()); logFiles = fileSlice.getLogFiles().collect(Collectors.toList()); assertEquals("Correct number of log-files shows up in file-slice", 2, logFiles.size()); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime()); logFiles = fileSlice.getLogFiles().collect(Collectors.toList()); assertEquals("Correct number of log-files shows up in file-slice", 2, logFiles.size());