/**
 * Returns the files to be cleaned for the given partitionPath, based on the configured
 * cleaning policy.
 */
public List<String> getDeletePaths(String partitionPath) throws IOException {
  HoodieCleaningPolicy policy = config.getCleanerPolicy();
  List<String> deletePaths;
  if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
    deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath);
  } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
    deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath);
  } else {
    throw new IllegalArgumentException("Unknown cleaning policy: " + policy.name());
  }
  logger.info(deletePaths.size() + " patterns used to delete in partition path: " + partitionPath);
  return deletePaths;
}
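/*
 * Illustration only, not part of the original class: a minimal sketch of how a caller
 * might gather the delete patterns for several partitions before distributing the actual
 * deletes. The method name is hypothetical and a java.util.HashMap import is assumed;
 * the real driver logic lives in cleanPartitionPaths further below.
 */
private Map<String, List<String>> collectDeletePathsSketch(List<String> partitionPaths)
    throws IOException {
  Map<String, List<String>> deletePathsByPartition = new HashMap<>();
  for (String partitionPath : partitionPaths) {
    // Each partition is resolved independently under the configured cleaning policy.
    deletePathsByPartition.put(partitionPath, getDeletePaths(partitionPath));
  }
  return deletePathsByPartition;
}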
/**
 * Returns the earliest commit to retain, based on the cleaning policy.
 */
public Optional<HoodieInstant> getEarliestCommitToRetain() {
  Optional<HoodieInstant> earliestCommitToRetain = Optional.empty();
  int commitsRetained = config.getCleanerCommitsRetained();
  if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
      && commitTimeline.countInstants() > commitsRetained) {
    earliestCommitToRetain =
        commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
  }
  return earliestCommitToRetain;
}
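/*
 * Illustration only: the retention arithmetic used above, pulled out with plain ints.
 * With N instants on the commit timeline and K commits retained, nthInstant(N - K)
 * yields the earliest instant that must be kept; for example, N = 5 and K = 2 give
 * index 3, so the last two commits survive and anything older becomes a cleaning
 * candidate. The method name is hypothetical.
 */
private static int earliestRetainedIndexSketch(int totalInstants, int commitsRetained) {
  // Mirrors commitTimeline.countInstants() - commitsRetained in getEarliestCommitToRetain().
  return totalInstants - commitsRetained;
}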
/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the
 * number of files cleaned. Handles skew in the partitions to clean by using files, rather
 * than partitions, as the unit of task distribution.
 *
 * @throws IllegalArgumentException if an unknown cleaning policy is configured
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    List<String> partitionsToClean = FSUtils.getAllPartitionPaths(fs,
        getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up: " + partitionsToClean + ", with policy "
        + config.getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean here. It is already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
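/*
 * Illustration only, hypothetical caller code: invoke clean(jsc) and summarize the
 * per-partition stats it returns. The accessors getPartitionPath(),
 * getSuccessDeleteFiles() and getFailedDeleteFiles() are assumptions, chosen to match
 * the builder fields set in cleanPartitionPaths below.
 */
private void summarizeCleanSketch(JavaSparkContext jsc) {
  for (HoodieCleanStat stat : clean(jsc)) {
    // One stat per partition, whether or not anything was deleted there.
    logger.info(stat.getPartitionPath() + ": "
        + stat.getSuccessDeleteFiles().size() + " files deleted, "
        + stat.getFailedDeleteFiles().size() + " failed");
  }
}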
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skew
      .mapPartitionsToPair(deleteFilesFunc(this))
      // merge partition-level clean stats
      .reduceByKey(
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
              .merge(e2))
      .collect();

  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
      .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));

  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
  // Return a HoodieCleanStat for every partition passed, even if nothing was deleted there.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.getOrDefault(partitionPath,
        new PartitionCleanStat(partitionPath));
    return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
  }).collect(Collectors.toList());
}
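/*
 * Illustration only: the skew-handling pattern above in miniature, with simple types.
 * flatMapToPair explodes each partition into per-file (partition, 1) pairs, repartition
 * spreads those pairs evenly across tasks regardless of how lopsided the partitions are,
 * and reduceByKey folds the results back into one value per partition. All names here
 * are hypothetical; this assumes Spark 2.x (where PairFlatMapFunction returns an
 * Iterator) and java.util.ArrayList/java.util.HashMap imports. The method is static so
 * the lambda captures only the serializable map, not the enclosing class.
 */
private static Map<String, Integer> countFilesPerPartitionSketch(JavaSparkContext jsc,
    HashMap<String, List<String>> filesByPartition, int parallelism) {
  return jsc.parallelize(new ArrayList<>(filesByPartition.keySet()), parallelism)
      .flatMapToPair(partition -> filesByPartition.get(partition).stream()
          .map(file -> new Tuple2<>(partition, 1)).iterator())
      .repartition(parallelism) // file-level pairs, so a huge partition no longer skews one task
      .reduceByKey(Integer::sum) // fold back to one count per partition
      .collectAsMap();
}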