/**
 * The job dataset may already exist if the ApplicationMaster was restarted.
 */
@SuppressWarnings("unchecked")
private static <E> Dataset<E> loadOrCreateJobDataset(JobContext jobContext) {
  Dataset<Object> dataset = load(jobContext).getDataset();
  String jobDatasetName = getJobDatasetName(jobContext);
  DatasetRepository repo = getDatasetRepository(jobContext);
  if (repo.exists(TEMP_NAMESPACE, jobDatasetName)) {
    Dataset<E> tempDataset = repo.load(TEMP_NAMESPACE, jobDatasetName,
        DatasetKeyOutputFormat.<E>getType(jobContext));
    try {
      Compatibility.checkCompatible(dataset.getDescriptor(),
          tempDataset.getDescriptor());
      return tempDataset;
    } catch (RuntimeException ex) {
      // swallow: the leftover job dataset is incompatible with the target,
      // so fall through and create a fresh one
    }
  }
  return repo.create(TEMP_NAMESPACE, jobDatasetName,
      copy(dataset.getDescriptor()),
      DatasetKeyOutputFormat.<E>getType(jobContext));
}
private static <E> Dataset<E> loadOrCreateTaskAttemptDataset(TaskAttemptContext taskContext) {
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  DatasetRepository repo = getDatasetRepository(taskContext);
  Dataset<E> jobDataset = loadJobDataset(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    return repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
  } else {
    return repo.create(TEMP_NAMESPACE, taskAttemptDatasetName,
        copy(jobDataset.getDescriptor()));
  }
}
private static void deleteJobDataset(JobContext jobContext) {
  DatasetRepository repo = getDatasetRepository(jobContext);
  repo.delete(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
private static void deleteTaskAttemptDataset(TaskAttemptContext taskContext) {
  DatasetRepository repo = getDatasetRepository(taskContext);
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName);
  }
}
public DatasetTarget(URI uri) {
  this.uri = uri;
  Configuration temp = emptyConf();
  // use appendTo since handleExisting checks for existing data
  DatasetKeyOutputFormat.configure(temp).appendTo(uri);
  this.formatBundle = outputBundle(temp);
}
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;

  if (usePerTaskAttemptDatasets(target, conf)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
    return new DatasetRecordWriter<E>(working, copyRecords);
  } else {
    return new DatasetRecordWriter<E>(working, copyRecords);
  }
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  // pass conf to match the two-argument usePerTaskAttemptDatasets
  // used by getRecordWriter above
  return usePerTaskAttemptDatasets(view, conf) ?
      new MergeOutputCommitter<E>() : new NullOutputCommitter();
}
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (repo instanceof TemporaryDatasetRepositoryAccessor) {
    Dataset<Object> dataset = load(jobContext).getDataset();
    String namespace = dataset.getNamespace();
    repo = ((TemporaryDatasetRepositoryAccessor) repo)
        .getTemporaryRepository(namespace, getJobDatasetName(jobContext));
  }
  return repo;
}
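/*
 * Illustrative only: the lookup above is the same call a user can make
 * directly against a repository URI. The URI below is an assumption for
 * the sketch, not a value taken from this codebase.
 */
DatasetRepository repo =
    DatasetRepositories.repositoryFor("repo:hdfs:/tmp/data");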
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable) target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
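/*
 * A minimal sketch of how callers select the write modes handled above,
 * using the DatasetKeyOutputFormat.ConfigBuilder methods; the dataset URI
 * and job name are hypothetical.
 */
Job job = Job.getInstance(new Configuration(), "write-events");
// DEFAULT: checkOutputSpecs fails if the target is non-empty or signaled ready
DatasetKeyOutputFormat.configure(job).writeTo("dataset:hive:default/events");
// APPEND: existing data is kept
DatasetKeyOutputFormat.configure(job).appendTo("dataset:hive:default/events");
// OVERWRITE: existing data is replaced (or deleted when replace isn't supported)
DatasetKeyOutputFormat.configure(job).overwrite("dataset:hive:default/events");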
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
public DatasetTarget(View<E> view) {
  this.view = view;
  Configuration temp = emptyConf();
  // use appendTo since handleExisting checks for existing data
  DatasetKeyOutputFormat.configure(temp).appendTo(view);
  this.formatBundle = outputBundle(temp);
  this.uri = view.getDataset().getUri();
}
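/*
 * A hedged usage sketch: wiring this target into a Crunch pipeline. In
 * practice the target is usually obtained via CrunchDatasets.asTarget
 * rather than constructed directly; Driver, inputView, and outputView are
 * assumed names for this example.
 */
Pipeline pipeline = new MRPipeline(Driver.class, conf);
PCollection<GenericData.Record> records =
    pipeline.read(CrunchDatasets.asSource(inputView));
pipeline.write(records, CrunchDatasets.asTarget(outputView),
    Target.WriteMode.APPEND);
pipeline.done();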
private static <E> Dataset<E> loadJobDataset(JobContext jobContext) {
  DatasetRepository repo = getDatasetRepository(jobContext);
  return repo.load(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
@SuppressWarnings("deprecation") private Job createJob() throws Exception { Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).writeTo(outputDataset).withType(GenericData.Record.class); return job; }