/**
 * Configures the given {@code Configuration} to use
 * {@code DatasetKeyInputFormat} and returns a builder for adding
 * further input options.
 *
 * @param conf a {@code Configuration}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  // register the input format class before handing back the builder
  setInputFormatClass(conf);
  ConfigBuilder builder = new ConfigBuilder(conf);
  return builder;
}
/**
 * Returns a builder used to attach output options to the given
 * {@code Configuration}.
 *
 * @param conf a {@code Configuration}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  ConfigBuilder builder = new ConfigBuilder(conf);
  return builder;
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to append to the
 * given {@link Dataset} or {@link View} instance, leaving any existing
 * data intact.
 *
 * @param view a dataset or view
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder appendTo(View<?> view) {
  setAppend();
  return writeTo(view);
}
@Test
@SuppressWarnings("deprecation")
public void testJobAppend() throws Exception {
  populateInputDataset();
  // pre-populate the output so the append mode has existing data to preserve;
  // unlike the overwrite test, this data is retained and new records are added to it
  populateOutputDataset();
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  DatasetKeyOutputFormat.configure(job).appendTo(outputDataset).withType(GenericData.Record.class);
  Assert.assertTrue(job.waitForCompletion(true));
  // true: expect the pre-populated records plus the job's output
  checkOutput(true);
}
/**
 * Builds a word-count style MapReduce {@code Job} that reads from
 * {@code inputDataset} and writes to {@code outputDataset}.
 */
@SuppressWarnings("deprecation")
private Job createJob() throws Exception {
  Job job = new Job();
  // input side: read generic records from the input dataset
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class);
  // map phase
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  // reduce phase
  job.setReducerClass(GenericStatsReducer.class);
  // output side: write generic records to the output dataset
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset).withType(GenericData.Record.class);
  return job;
}
@Test public void testSetupJobIsIdempotent() { DatasetKeyOutputFormat.MergeOutputCommitter<Object> outputCommitter = new DatasetKeyOutputFormat.MergeOutputCommitter<Object>(); Configuration conf = DefaultConfiguration.get(); DatasetKeyOutputFormat.configure(conf).appendTo(outputDataset); JobID jobId = new JobID("jt", 42); JobContext context = Hadoop.JobContext.ctor.newInstance(conf, jobId); // setup the job outputCommitter.setupJob(context); // call setup again to simulate an ApplicationMaster restart outputCommitter.setupJob(context); }
/**
 * Chooses an {@code OutputCommitter} for this task attempt: a merge
 * committer when per-task-attempt datasets are in use, otherwise a
 * no-op committer.
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  if (usePerTaskAttemptDatasets(view, conf)) {
    return new MergeOutputCommitter<E>();
  }
  return new NullOutputCommitter();
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance after removing any
 * existing data.
 * <p>
 * The underlying dataset implementation must support
 * {@link View#deleteAll} for the {@code view} or the job will fail.
 *
 * @param view a dataset or view
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder overwrite(View<?> view) {
  setOverwrite();
  return writeTo(view);
}
/**
 * Removes this task attempt's temporary dataset, if one was created.
 */
private static void deleteTaskAttemptDataset(TaskAttemptContext taskContext) {
  DatasetRepository repo = getDatasetRepository(taskContext);
  String datasetName = getTaskAttemptDatasetName(taskContext);
  // nothing to clean up when the attempt never created its dataset
  if (!repo.exists(TEMP_NAMESPACE, datasetName)) {
    return;
  }
  repo.delete(TEMP_NAMESPACE, datasetName);
}
@Test @SuppressWarnings("deprecation") public void testJobOverwrite() throws Exception { populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); checkOutput(false); }
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to append to the
 * given {@link Dataset} or {@link View} instance, leaving any existing
 * data intact.
 *
 * @param view a dataset or view
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder appendTo(View<?> view) {
  setAppend();
  return writeTo(view);
}
/**
 * Chooses an {@code OutputCommitter} for this task attempt: a merge
 * committer when per-task-attempt datasets are in use, otherwise a
 * no-op committer.
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  if (usePerTaskAttemptDatasets(view)) {
    return new MergeOutputCommitter<E>();
  } else {
    return new NullOutputCommitter();
  }
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance after removing any
 * existing data.
 * <p>
 * The underlying dataset implementation must support
 * {@link View#deleteAll} for the {@code view} or the job will fail.
 *
 * @param view a dataset or view
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder overwrite(View<?> view) {
  setOverwrite();
  return writeTo(view);
}
/**
 * Registers {@code DatasetKeyInputFormat} on the given
 * {@code Configuration} and returns a builder for adding further
 * input options.
 *
 * @param conf a {@code Configuration}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  // the input format class must be recorded in the configuration itself
  setInputFormatClass(conf);
  return new ConfigBuilder(conf);
}
/**
 * Creates a builder for attaching output options to the given
 * {@code Configuration}.
 *
 * @param conf a {@code Configuration}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  return new ConfigBuilder(conf);
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to append to the
 * dataset or view identified by {@code uri}, without disturbing data
 * that is already present.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". For more information, see
 * {@link Datasets}.
 *
 * @param uri a dataset or view URI
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder appendTo(URI uri) {
  // flag append mode before recording the write target
  setAppend();
  return writeTo(uri);
}
/**
 * Configures the {@code Job} to use the {@code DatasetKeyOutputFormat}
 * and returns a builder for adding further output options.
 *
 * @param job the {@code Job} to configure
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Job job) {
  job.setOutputFormatClass(DatasetKeyOutputFormat.class);
  ConfigBuilder builder = new ConfigBuilder(job);
  return builder;
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to append to the
 * dataset or view named by the given URI, leaving existing data in place.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". For more information, see
 * {@link Datasets}.
 *
 * @param uri a dataset or view URI
 * @return this for method chaining
 *
 * @since 0.16.0
 */
public ConfigBuilder appendTo(URI uri) {
  setAppend();
  return writeTo(uri);
}