/**
 * Generate compaction operation from file-slice
 *
 * @param partitionPath Partition path
 * @param fileSlice File Slice
 * @param metricsCaptureFunction Metrics Capture function
 * @return Compaction Operation
 */
public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice,
    Optional<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
  // Collect delta (log) file paths up-front for readability.
  List<String> deltaFilePaths = fileSlice.getLogFiles()
      .map(logFile -> logFile.getPath().toString())
      .collect(Collectors.toList());

  HoodieCompactionOperation.Builder opBuilder = HoodieCompactionOperation.newBuilder();
  opBuilder.setPartitionPath(partitionPath);
  opBuilder.setFileId(fileSlice.getFileId());
  opBuilder.setBaseInstantTime(fileSlice.getBaseInstantTime());
  opBuilder.setDeltaFilePaths(deltaFilePaths);
  // The base (data) file may be absent, e.g. a purely log-backed slice.
  if (fileSlice.getDataFile().isPresent()) {
    opBuilder.setDataFilePath(fileSlice.getDataFile().get().getPath());
  }
  // Optionally capture metrics for this (partition, slice) pair.
  if (metricsCaptureFunction.isPresent()) {
    opBuilder.setMetrics(metricsCaptureFunction.get().apply(Pair.of(partitionPath, fileSlice)));
  }
  return opBuilder.build();
}
/** * Helper to merge last 2 file-slices. These 2 file-slices do not have compaction done yet. * * @param lastSlice Latest File slice for a file-group * @param penultimateSlice Penultimate file slice for a file-group in commit timeline order */ private static FileSlice mergeCompactionPendingFileSlices(FileSlice lastSlice, FileSlice penultimateSlice) { FileSlice merged = new FileSlice(penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId()); if (penultimateSlice.getDataFile().isPresent()) { merged.setDataFile(penultimateSlice.getDataFile().get()); } // Add Log files from penultimate and last slices penultimateSlice.getLogFiles().forEach(merged::addLogFile); lastSlice.getLogFiles().forEach(merged::addLogFile); return merged; }
/**
 * Add a new log file into the group.
 *
 * <p>Creates the containing file-slice for the log file's base commit time if it does not exist
 * yet, then registers the log file against that slice.
 *
 * @param logFile log file to add
 */
public void addLogFile(HoodieLogFile logFile) {
  // computeIfAbsent replaces the containsKey/put pair: one lookup, slice created on first use.
  fileSlices.computeIfAbsent(logFile.getBaseCommitTime(),
      baseCommitTime -> new FileSlice(baseCommitTime, id)).addLogFile(logFile);
}
/** * With async compaction, it is possible to see partial/complete data-files due to inflight-compactions, * Ignore those data-files * @param fileSlice File Slice * @return */ private FileSlice filterDataFileAfterPendingCompaction(FileSlice fileSlice) { if (isFileSliceAfterPendingCompaction(fileSlice)) { // Data file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. FileSlice transformed = new FileSlice(fileSlice.getBaseInstantTime(), fileSlice.getFileId()); fileSlice.getLogFiles().forEach(transformed::addLogFile); return transformed; } return fileSlice; }
/**
 * Returns true if the file-group is under pending-compaction and the file-slice' baseInstant matches
 * compaction Instant
 *
 * @param fileSlice File Slice
 * @return true when a compaction is pending for this file-id and its instant equals the slice's base instant
 */
private boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) {
  Pair<String, CompactionOperation> pendingCompaction = fileIdToPendingCompaction.get(fileSlice.getFileId());
  if (pendingCompaction == null) {
    // No compaction pending for this file-group.
    return false;
  }
  return fileSlice.getBaseInstantTime().equals(pendingCompaction.getKey());
}
FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); List<HoodieLogFile> logFilesToRepair = merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)) .sorted(HoodieLogFile.getBaseInstantAndLogVersionComparator().reversed()) .collect(Collectors.toList()); FileSlice fileSliceForCompaction = fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion) .orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1); String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension()) .orElse(HoodieLogFile.DELTA_EXTENSION); String parentPath = fileSliceForCompaction.getDataFile().map(df -> new Path(df.getPath()).getParent().toString()) .orElse(fileSliceForCompaction.getLogFiles().findFirst().map(lf -> lf.getPath().getParent().toString()).get()); for (HoodieLogFile toRepair : logFilesToRepair) { int version = maxUsedVersion + 1;
String lastVersion = fileSliceList.get(0).getBaseInstantTime(); String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); Optional<HoodieDataFile> aFile = aSlice.getDataFile(); String fileCommitTime = aSlice.getBaseInstantTime(); if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) { if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()) .collect(Collectors.toList()));
Optional<HoodieDataFile> dataFile = nextSlice.getDataFile(); if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) { FileSlice nextSlice = fileSliceIterator.next(); if (!isFileSliceNeededForPendingCompaction(nextSlice)) { if (nextSlice.getDataFile().isPresent()) { HoodieDataFile dataFile = nextSlice.getDataFile().get(); deletePaths.add(dataFile.getFileStatus().getPath().toString()); deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()) .collect(Collectors.toList()));
FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get(); final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf))) .reduce((x, y) -> x > y ? x : y).orElse(0); List<HoodieLogFile> logFilesToBeMoved = merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList()); return logFilesToBeMoved.stream().map(lf -> { Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0,
.filter(fileSlice1 -> fileSlice1.getFileId().equals(fileId)).findFirst(); baseInstantTime = fileSlice.get().getBaseInstantTime(); } else { fileSlice = Optional.of(new FileSlice(baseInstantTime, this.fileId)); logger.info("New InsertHandle for partition :" + partitionPath);
/**
 * Obtain the latest file slice whose base instant lies within the given commit range.
 *
 * @param commitRange commit instant times of interest
 * @return first matching slice in the slice ordering of {@code getAllFileSlices()}, if any
 */
public Optional<FileSlice> getLatestFileSliceInRange(List<String> commitRange) {
  return getAllFileSlices()
      .filter(fileSlice -> {
        // Keep only slices created by a commit inside the requested range.
        return commitRange.contains(fileSlice.getBaseInstantTime());
      })
      .findFirst();
}
/**
 * Generate input for compaction plan tests
 */
private Pair<List<Pair<String, FileSlice>>, HoodieCompactionPlan> buildCompactionPlan() {
  // Slice with neither a data file nor log files.
  FileSlice emptyFileSlice = new FileSlice("000", "empty1");

  // Slice with both a data file and two log files.
  FileSlice fileSlice = new FileSlice("000", "noData1");
  fileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog.parquet"));
  fileSlice.addLogFile(new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1))));
  fileSlice.addLogFile(new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2))));

  // Slice with a data file only.
  FileSlice noLogFileSlice = new FileSlice("000", "noLog1");
  noLogFileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog.parquet"));

  // Slice with log files only.
  FileSlice noDataFileSlice = new FileSlice("000", "noData1");
  noDataFileSlice.addLogFile(new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1))));
  noDataFileSlice.addLogFile(new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2))));

  // Pair each slice with the default partition, preserving the original ordering.
  List<Pair<String, FileSlice>> input =
      Arrays.asList(emptyFileSlice, noDataFileSlice, fileSlice, noLogFileSlice).stream()
          .map(slice -> Pair.of(DEFAULT_PARTITION_PATHS[0], slice))
          .collect(Collectors.toList());
  return Pair.of(input, CompactionUtils.buildFromFileSlices(input, Optional.empty(), Optional.of(metricsCaptureFn)));
}
/**
 * Create a log writer for this file-id, continuing the log version sequence of the given slice.
 *
 * @param fileSlice latest file slice for the file-group, if one exists
 * @param baseCommitTime base commit time the new log file is associated with
 * @return a writer positioned at the latest existing log version (builder rolls it forward)
 * @throws IOException on filesystem errors while creating the writer
 * @throws InterruptedException if interrupted while creating the writer
 */
private Writer createLogWriter(Optional<FileSlice> fileSlice, String baseCommitTime)
    throws IOException, InterruptedException {
  // Continue from the highest existing log version; fall back to the base version when the slice
  // is absent or has no log files yet. (Previously fileSlice.get() was called unchecked and would
  // throw NoSuchElementException for an empty Optional.)
  int latestLogVersion = HoodieLogFile.LOGFILE_BASE_VERSION;
  if (fileSlice.isPresent()) {
    latestLogVersion = fileSlice.get().getLogFiles().map(logFile -> logFile.getLogVersion())
        .max(Comparator.naturalOrder()).orElse(HoodieLogFile.LOGFILE_BASE_VERSION);
  }
  return HoodieLogFormat.newWriterBuilder()
      .onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath))
      .withFileId(fileId).overBaseCommit(baseCommitTime)
      .withLogVersion(latestLogVersion)
      .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
}
@Override public Stream<FileSlice> getLatestUnCompactedFileSlices(String partitionPath) { return getAllFileGroups(partitionPath) .map(fileGroup -> { FileSlice fileSlice = fileGroup.getLatestFileSlice().get(); // if the file-group is under compaction, pick the latest before compaction instant time. if (isFileSliceAfterPendingCompaction(fileSlice)) { String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getLeft(); return fileGroup.getLatestFileSliceBefore(compactionInstantTime); } return Optional.of(fileSlice); }) .map(Optional::get); }
/**
 * Stream of committed data files, sorted reverse commit time
 */
public Stream<HoodieDataFile> getAllDataFiles() {
  // Keep only slices that actually carry a data file, then unwrap.
  return getAllFileSlices()
      .map(FileSlice::getDataFile)
      .filter(Optional::isPresent)
      .map(Optional::get);
}
/**
 * Add a new datafile into the file group.
 *
 * <p>Creates the file-slice for the data file's commit time if it does not exist yet, then
 * attaches the data file to that slice.
 *
 * @param dataFile data file to add
 */
public void addDataFile(HoodieDataFile dataFile) {
  // computeIfAbsent replaces the containsKey/put pair: one lookup, slice created on first use.
  fileSlices.computeIfAbsent(dataFile.getCommitTime(),
      commitTime -> new FileSlice(commitTime, id)).setDataFile(dataFile);
}
/**
 * Potentially add a new file-slice by adding base-instant time
 * A file-slice without any data-file and log-files can exist (if a compaction just got requested)
 *
 * @param baseInstantTime base instant time at which the (possibly empty) slice is anchored
 */
public void addNewFileSliceAtInstant(String baseInstantTime) {
  // computeIfAbsent replaces the containsKey/put pair and only constructs the slice when absent.
  fileSlices.computeIfAbsent(baseInstantTime, instantTime -> new FileSlice(instantTime, id));
}
java.util.Optional<FileSlice> fileSliceOptional = fileSystemView.getLatestUnCompactedFileSlices(operation.getPartitionPath()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst(); if (fileSliceOptional.isPresent()) { FileSlice fs = fileSliceOptional.get(); java.util.Optional<HoodieDataFile> df = fs.getDataFile(); if (operation.getDataFilePath().isPresent()) { String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath() "Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath()); Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet()); Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream() .map(dp -> {
String lastVersion = fileSliceList.get(0).getBaseInstantTime(); String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); Optional<HoodieDataFile> aFile = aSlice.getDataFile(); String fileCommitTime = aSlice.getBaseInstantTime(); if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) { deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()) .collect(Collectors.toList()));
Optional<HoodieDataFile> dataFile = nextSlice.getDataFile(); if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) { FileSlice nextSlice = fileSliceIterator.next(); if (!isFileSliceNeededForPendingCompaction(nextSlice)) { if (nextSlice.getDataFile().isPresent()) { HoodieDataFile dataFile = nextSlice.getDataFile().get(); deletePaths.add(dataFile.getFileStatus().getPath().toString()); deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()) .collect(Collectors.toList()));