/**
 * Reports the dataset root itself as the only version of the dataset.
 *
 * @param dataset dataset to inspect; it is cast to {@link FileSystemDataset}
 * @return a list holding one {@link TimestampedDatasetVersion} built from the modification time of the
 *         dataset root and the dataset root path
 * @throws IOException if the file status of the dataset root cannot be read
 */
@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
  final FileSystemDataset target = (FileSystemDataset) dataset;
  final long rootModifiedMillis = this.fs.getFileStatus(target.datasetRoot()).getModificationTime();
  final TimestampedDatasetVersion rootVersion =
      new TimestampedDatasetVersion(new DateTime(rootModifiedMillis), target.datasetRoot());
  return Lists.newArrayList(rootVersion);
}
} // closes the enclosing class (header not visible in this excerpt)
/**
 * Reports the dataset root as the single version of the dataset.
 *
 * @param dataset dataset to inspect; it is cast to {@link FileSystemDataset}
 * @return a list holding one {@link FileStatusDatasetVersion} wrapping the file status of the dataset root
 * @throws IOException if the file status of the dataset root cannot be read
 */
@Override
public Collection<FileStatusDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
  final FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  final FileStatusDatasetVersion rootVersion =
      new FileStatusDatasetVersion(this.fs.getFileStatus(fsDataset.datasetRoot()));
  return Lists.newArrayList(rootVersion);
}
} // closes the enclosing class (header not visible in this excerpt)
/**
 * Builds the version collection for a dataset from the file status of its root.
 *
 * @param dataset dataset to inspect; it is cast to {@link FileSystemDataset}
 * @return a list with exactly one {@link FileStatusDatasetVersion}, created from the status of the dataset root
 * @throws IOException if looking up the status of the dataset root fails
 */
@Override
public Collection<FileStatusDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
  final FileSystemDataset target = (FileSystemDataset) dataset;
  final FileStatusDatasetVersion onlyVersion =
      new FileStatusDatasetVersion(this.fs.getFileStatus(target.datasetRoot()));
  return Lists.newArrayList(onlyVersion);
}
} // closes the enclosing class (header not visible in this excerpt)
/**
 * Deletes any empty paths in <code>possiblyEmptyDirectories</code> all the way up to the
 * {@link FileSystemDataset#datasetRoot()}, by delegating each candidate to
 * {@link PathUtils#deleteEmptyParentDirectories}.
 *
 * <p>Nothing is done unless empty-directory deletion is enabled and the instance is not in simulate mode.
 *
 * @param possiblyEmptyDirectories directories that may have become empty
 * @param fsDataset dataset whose root is passed as the limit for the cleanup
 * @throws IOException if the underlying delete operation fails
 */
public void cleanEmptyDirectories(final Set<Path> possiblyEmptyDirectories, final FileSystemDataset fsDataset)
    throws IOException {
  // Guard clause: skip all work when cleanup is disabled or we are only simulating.
  if (!this.deleteEmptyDirectories || this.simulate) {
    return;
  }
  for (final Path candidate : possiblyEmptyDirectories) {
    PathUtils.deleteEmptyParentDirectories(fs, fsDataset.datasetRoot(), candidate);
  }
}
} // closes the enclosing class (header not visible in this excerpt)
/**
 * Lists the most nested paths under the dataset root and reports each one as a version, timestamped with
 * its modification time.
 *
 * <p>Listing failures are treated as best-effort: the problem is logged (including the causing exception)
 * and an empty collection is returned instead of propagating the error.
 *
 * @param dataset dataset to inspect; it is cast to {@link FileSystemDataset}
 * @return one {@link TimestampedDatasetVersion} per path returned by
 *         {@link FileListUtils#listMostNestedPathRecursively}, or an empty list if listing failed
 */
@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) {
  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  try {
    List<TimestampedDatasetVersion> timestampedVersions = Lists.newArrayList();
    for (FileStatus fileStatus : FileListUtils.listMostNestedPathRecursively(this.fs, fsDataset.datasetRoot())) {
      timestampedVersions.add(
          new TimestampedDatasetVersion(new DateTime(fileStatus.getModificationTime()), fileStatus.getPath()));
    }
    return timestampedVersions;
  } catch (IOException e) {
    // Pass the exception to the logger so the root cause and stack trace are not lost.
    LOGGER.warn("Failed to get ModifiedTimeStamp for candidate dataset version at " + fsDataset.datasetRoot()
        + ". Ignoring.", e);
    return Lists.newArrayList();
  }
}
} // closes the enclosing class (header not visible in this excerpt)
/**
 * Find dataset versions under the root of the input {@link Dataset}. Dataset versions are paths below the
 * dataset root that match {@link #globVersionPattern()}, each representing a single manageable unit in the dataset.
 * See {@link gobblin.data.management.retention.DatasetCleaner} for more information.
 *
 * <p>Each matching path is relativized against the dataset root and handed to {@code getDatasetVersion};
 * matches for which that call returns {@code null} are skipped.
 *
 * @param dataset dataset whose root directory contains all versions; it is cast to {@link FileSystemDataset}.
 * @return Collection of the dataset versions found; empty if the glob matched nothing or the file system
 *         reported no result for the pattern.
 * @throws IOException if globbing the file system fails
 */
@Override
public Collection<T> findDatasetVersions(Dataset dataset) throws IOException {
  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  Path versionGlobStatus = new Path(fsDataset.datasetRoot(), globVersionPattern());
  FileStatus[] dataSetVersionPaths = this.fs.globStatus(versionGlobStatus);

  List<T> dataSetVersions = Lists.newArrayList();
  // globStatus may return null (e.g. a pattern without wildcards pointing at a missing path);
  // treat that as "no versions" instead of failing with a NullPointerException.
  if (dataSetVersionPaths == null) {
    return dataSetVersions;
  }

  for (FileStatus dataSetVersionPath : dataSetVersionPaths) {
    T datasetVersion = getDatasetVersion(
        PathUtils.relativizePath(dataSetVersionPath.getPath(), fsDataset.datasetRoot()), dataSetVersionPath);
    if (datasetVersion != null) {
      dataSetVersions.add(datasetVersion);
    }
  }
  return dataSetVersions;
}
/**
 * Wires the input and output paths of the given job for the given dataset.
 * Refer to {@link MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job)}
 *
 * <p>Inputs are the granular input paths derived from the dataset root. The output path is assembled from
 * the configured {@link MRCompactor#COMPACTION_JOB_DIR} and the parsed dataset name, destination
 * sub-directory and time string; it is deleted recursively before being set on the job.
 *
 * @param job job to configure
 * @param dataset dataset whose root supplies the inputs and whose parsed path parts form the output location
 * @throws IOException if resolving the input paths or deleting the output directory fails
 */
protected void configureInputAndOutputPaths(Job job, FileSystemDataset dataset) throws IOException {
  // Inputs: every granular path found under the dataset root.
  this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
  for (Path inputPath : this.mapReduceInputPaths) {
    FileInputFormat.addInputPath(job, inputPath);
  }

  // Output: <job dir>/<dataset name>/<destination sub dir>/<time string>.
  final String outputBaseDir = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
  final CompactionPathParser.CompactionParserResult parsed = new CompactionPathParser(this.state).parse(dataset);
  this.mrOutputPath =
      concatPaths(outputBaseDir, parsed.getDatasetName(), parsed.getDstSubDir(), parsed.getTimeString());

  // Delete the output directory before handing it to the job.
  log.info("Cleaning temporary MR output directory: " + mrOutputPath);
  this.fs.delete(this.mrOutputPath, true);
  FileOutputFormat.setOutputPath(job, this.mrOutputPath);
}
public boolean verify (FileSystemDataset dataset) { try { CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset); DateTime folderTime = result.getTime(); DateTimeZone timeZone = DateTimeZone.forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE)); DateTime current = new DateTime(timeZone); PeriodFormatter formatter = new PeriodFormatterBuilder().appendMonths().appendSuffix("m").appendDays().appendSuffix("d").appendHours() .appendSuffix("h").toFormatter(); // get earliest time String maxTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO); Period maxTimeAgo = formatter.parsePeriod(maxTimeAgoStr); DateTime earliest = current.minus(maxTimeAgo); // get latest time String minTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO); Period minTimeAgo = formatter.parsePeriod(minTimeAgoStr); DateTime latest = current.minus(minTimeAgo); if (earliest.isBefore(folderTime) && latest.isAfter(folderTime)) { log.info("{} falls in the user defined time range", dataset.datasetRoot()); return true; } } catch (Exception e) { log.error("{} cannot be verified because of {}", dataset.datasetRoot(), e.toString()); } return false; }