@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  FileStatus status = this.fs.getFileStatus(fsDataset.datasetRoot());
  return Lists.newArrayList(
      new TimestampedDatasetVersion(new DateTime(status.getModificationTime()), fsDataset.datasetRoot()));
}
}
/**
 * Serializes a {@link FileSystemDataset} to a {@link State}.
 *
 * @param dataset the dataset to serialize
 * @param state the state used to save the {@link gobblin.dataset.Dataset}
 */
public void save(FileSystemDataset dataset, State state) {
  state.setProp(SERIALIZE_COMPACTION_FILE_PATH_NAME, dataset.datasetURN());
}
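/**
 * A minimal sketch of the matching deserialization, assuming a dataset wrapper that can be
 * rebuilt from a single root path. The load() helper and the SimpleFileSystemDataset type
 * are assumptions for illustration, not confirmed by the snippet above.
 */
public FileSystemDataset load(State state) {
  // Rebuild the dataset from the URN that save() persisted.
  Path path = new Path(state.getProp(SERIALIZE_COMPACTION_FILE_PATH_NAME));
  return new SimpleFileSystemDataset(path); // hypothetical Path-backed FileSystemDataset impl
}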
@Override
public Collection<FileStatusDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
  return Lists.newArrayList(
      new FileStatusDatasetVersion(this.fs.getFileStatus(((FileSystemDataset) dataset).datasetRoot())));
}
}
public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
  boolean renamingRequired = this.state.getPropAsBoolean(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED,
      MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);
  if (renamingRequired) {
    Collection<Path> paths = configurator.getMapReduceInputPaths();
    for (Path path : paths) {
      Path newPath = new Path(path.getParent(), path.getName() + MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX);
      log.info("[{}] Renaming {} to {}", dataset.datasetURN(), path, newPath);
      fs.rename(path, newPath);
    }

    // Submit events if directories were renamed
    if (eventSubmitter != null) {
      Map<String, String> eventMetadataMap = ImmutableMap.of(
          CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
          CompactionSlaEventHelper.RENAME_DIR_PATHS, Joiner.on(',').join(paths));
      this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_MARK_DIR_EVENT, eventMetadataMap);
    }
  }
}
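/**
 * Illustrative sketch only: a later run could use the suffix added above to skip inputs that
 * were already compacted. This helper is hypothetical and not part of the class above.
 */
private boolean isAlreadyRenamed(Path path) {
  // Directories marked by onCompactionJobComplete() carry the rename suffix.
  return path.getName().endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX);
}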
private void parseTimeAndDatasetName(FileSystemDataset dataset, CompactionParserResult rst) {
  // Strip the common base directory from the dataset path.
  String commonBase = rst.getSrcBaseDir();
  String fullPath = dataset.datasetURN();
  int startPos = fullPath.indexOf(commonBase) + commonBase.length();
  String relative = StringUtils.removeStart(fullPath.substring(startPos), "/");

  // The source sub dir separates the dataset name from the time string.
  int delimiterStart = StringUtils.indexOf(relative, rst.getSrcSubDir());
  if (delimiterStart == -1) {
    throw new StringIndexOutOfBoundsException(
        "Cannot find sub dir " + rst.getSrcSubDir() + " in " + relative);
  }
  int delimiterEnd = relative.indexOf("/", delimiterStart);
  String datasetName = StringUtils.removeEnd(relative.substring(0, delimiterStart), "/");
  String timeString = StringUtils.removeEnd(relative.substring(delimiterEnd + 1), "/");

  rst.datasetName = datasetName;
  rst.timeString = timeString;
  rst.time = getTime(timeString);
}
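// Worked example of the parsing above, with hypothetical values: srcBaseDir = "/data/input",
// srcSubDir = "hourly", datasetURN = "/data/input/myTopic/hourly/2017/01/01".
//   relative       = "myTopic/hourly/2017/01/01"   (common base stripped)
//   delimiterStart = 8                             (index of "hourly")
//   datasetName    = "myTopic"                     (text before the sub dir)
//   timeString     = "2017/01/01"                  (text after the sub dir)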
/**
 * Deletes any empty paths in <code>possiblyEmptyDirectories</code> all the way up to the
 * {@link FileSystemDataset#datasetRoot()}.
 */
public void cleanEmptyDirectories(final Set<Path> possiblyEmptyDirectories, final FileSystemDataset fsDataset)
    throws IOException {
  if (this.deleteEmptyDirectories && !this.simulate) {
    for (Path parentDirectory : possiblyEmptyDirectories) {
      PathUtils.deleteEmptyParentDirectories(fs, fsDataset.datasetRoot(), parentDirectory);
    }
  }
}
}
public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
  if (state.contains(ConfigurationKeys.HIVE_REGISTRATION_POLICY)) {
    HiveRegister hiveRegister = HiveRegister.get(state);
    HiveRegistrationPolicy hiveRegistrationPolicy = HiveRegistrationPolicyBase.getPolicy(state);
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);

    List<String> paths = new ArrayList<>();
    for (HiveSpec spec : hiveRegistrationPolicy.getHiveSpecs(new Path(result.getDstAbsoluteDir()))) {
      hiveRegister.register(spec);
      paths.add(spec.getPath().toUri().toASCIIString());
      log.info("Hive registration is done for {}", result.getDstAbsoluteDir());
    }

    // Submit events for Hive registration
    if (eventSubmitter != null) {
      Map<String, String> eventMetadataMap = ImmutableMap.of(
          CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
          CompactionSlaEventHelper.HIVE_REGISTRATION_PATHS, Joiner.on(',').join(paths));
      this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_HIVE_REGISTRATION_EVENT, eventMetadataMap);
    }
  }
}
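// Illustrative only: registration above is gated on the job config. A hypothetical setup might
// point ConfigurationKeys.HIVE_REGISTRATION_POLICY at a policy implementation; the class name
// below is an example value, not confirmed by the snippet above.
State state = new State();
state.setProp(ConfigurationKeys.HIVE_REGISTRATION_POLICY, "gobblin.hive.policy.HiveRegistrationPolicyBase");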
@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) {
  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  try {
    List<TimestampedDatasetVersion> timestampedVersions = Lists.newArrayList();
    for (FileStatus fileStatus : FileListUtils.listMostNestedPathRecursively(this.fs, fsDataset.datasetRoot())) {
      timestampedVersions.add(
          new TimestampedDatasetVersion(new DateTime(fileStatus.getModificationTime()), fileStatus.getPath()));
    }
    return timestampedVersions;
  } catch (IOException e) {
    LOGGER.warn("Failed to get modification timestamp for candidate dataset version at "
        + fsDataset.datasetRoot() + ". Ignoring.");
    return Lists.newArrayList();
  }
}
}
double newRecords = helper.calculateRecordCount(Lists.newArrayList(new Path(dataset.datasetURN())));
double oldRecords = InputRecordCountHelper.readRecordCount(helper.getFs(), new Path(result.getDstAbsoluteDir()));
log.info("Dataset {} : previous records {}, current records {}", dataset.datasetURN(), oldRecords, newRecords);

// First compaction for this dataset: no previous record count, so always proceed.
if (oldRecords == 0) {
  return true;
}
// Re-compact only when record growth exceeds the configured threshold.
if ((newRecords - oldRecords) / oldRecords > threshold) {
  log.info("Dataset {} records exceeded the threshold {}", dataset.datasetURN(), threshold);
  return true;
}
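// Worked example with hypothetical numbers: oldRecords = 1000 and newRecords = 1100 give a
// growth ratio of (1100 - 1000) / 1000 = 0.1; with a configured threshold of 0.05 the check
// passes and the dataset is re-compacted, while newRecords = 1030 (ratio 0.03) would not.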
/**
 * Finds dataset versions in the input {@link org.apache.hadoop.fs.Path}. Dataset versions are subdirectories of the
 * input {@link org.apache.hadoop.fs.Path} representing a single manageable unit in the dataset.
 * See {@link gobblin.data.management.retention.DatasetCleaner} for more information.
 *
 * @param dataset {@link org.apache.hadoop.fs.Path} to the directory containing all versions of a dataset.
 * @return Collection of {@link gobblin.data.management.version.DatasetVersion}s, one for each dataset version found.
 * @throws IOException
 */
@Override
public Collection<T> findDatasetVersions(Dataset dataset) throws IOException {
  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  Path versionGlobStatus = new Path(fsDataset.datasetRoot(), globVersionPattern());
  FileStatus[] dataSetVersionPaths = this.fs.globStatus(versionGlobStatus);

  List<T> dataSetVersions = Lists.newArrayList();
  for (FileStatus dataSetVersionPath : dataSetVersionPaths) {
    T datasetVersion =
        getDatasetVersion(PathUtils.relativizePath(dataSetVersionPath.getPath(), fsDataset.datasetRoot()),
            dataSetVersionPath);
    if (datasetVersion != null) {
      dataSetVersions.add(datasetVersion);
    }
  }
  return dataSetVersions;
}
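/**
 * Illustrative sketch only: a concrete subclass might glob three levels of subdirectories for a
 * daily yyyy/MM/dd layout. The pattern and layout below are assumptions, not part of the class above.
 */
@Override
public Path globVersionPattern() {
  // Matches version directories laid out as <datasetRoot>/yyyy/MM/dd
  return new Path("*/*/*");
}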
Map<String, String> eventMetadataMap = ImmutableMap.of(
    CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
    CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(newTotalRecords));
this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_RECORD_COUNT_EVENT, eventMetadataMap);
/**
 * Refer to {@link MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job)}.
 */
protected void configureInputAndOutputPaths(Job job, FileSystemDataset dataset) throws IOException {
  this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
  for (Path path : mapReduceInputPaths) {
    FileInputFormat.addInputPath(job, path);
  }

  String mrOutputBase = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
  CompactionPathParser parser = new CompactionPathParser(this.state);
  CompactionPathParser.CompactionParserResult rst = parser.parse(dataset);
  this.mrOutputPath = concatPaths(mrOutputBase, rst.getDatasetName(), rst.getDstSubDir(), rst.getTimeString());

  log.info("Cleaning temporary MR output directory: " + mrOutputPath);
  this.fs.delete(mrOutputPath, true);
  FileOutputFormat.setOutputPath(job, mrOutputPath);
}
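// Worked example with hypothetical values: mrOutputBase = "/tmp/compaction-job",
// datasetName = "myTopic", dstSubDir = "daily", timeString = "2017/01/01" would yield
// mrOutputPath = "/tmp/compaction-job/myTopic/daily/2017/01/01".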
public boolean verify(FileSystemDataset dataset) {
  try {
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    DateTime folderTime = result.getTime();
    DateTimeZone timeZone = DateTimeZone.forID(
        this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
    DateTime current = new DateTime(timeZone);
    PeriodFormatter formatter = new PeriodFormatterBuilder()
        .appendMonths().appendSuffix("m")
        .appendDays().appendSuffix("d")
        .appendHours().appendSuffix("h")
        .toFormatter();

    // Get the earliest allowed time
    String maxTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO,
        TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
    Period maxTimeAgo = formatter.parsePeriod(maxTimeAgoStr);
    DateTime earliest = current.minus(maxTimeAgo);

    // Get the latest allowed time
    String minTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO,
        TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
    Period minTimeAgo = formatter.parsePeriod(minTimeAgoStr);
    DateTime latest = current.minus(minTimeAgo);

    if (earliest.isBefore(folderTime) && latest.isAfter(folderTime)) {
      log.info("{} falls in the user defined time range", dataset.datasetRoot());
      return true;
    }
  } catch (Exception e) {
    log.error("{} cannot be verified because of {}", dataset.datasetRoot(), e.toString());
  }
  return false;
}
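// Illustrative only: the suffix-based period format accepted by the formatter above, using
// hypothetical values ("m" = months, "d" = days, "h" = hours).
Period threeDays = formatter.parsePeriod("3d");    // 3 days
Period mixed = formatter.parsePeriod("1m2d4h");    // 1 month, 2 days, 4 hours
// With maxTimeAgo = "3d" and minTimeAgo = "1d", only folders dated between (now - 3 days)
// and (now - 1 day) pass verification.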