private static <E> boolean usePerTaskAttemptDatasets(View<E> target) {
  // new API output committers are not called properly in Hadoop 1
  return !Hadoop.isHadoop1() && target.getDataset() instanceof Mergeable;
}
private static void setInputFormatClass(Configuration conf) {
  if (Hadoop.isHadoop1()) {
    conf.set("mapreduce.inputformat.class",
        DatasetKeyInputFormat.class.getName());
  } else {
    // build a job with an empty conf
    Job fakeJob = Hadoop.Job.newInstance.invoke(new Configuration(false));
    fakeJob.setInputFormatClass(DatasetKeyInputFormat.class);
    // then copy any created entries into the real conf
    for (Map.Entry<String, String> entry : fakeJob.getConfiguration()) {
      conf.set(entry.getKey(), entry.getValue());
    }
  }
}
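// A minimal sketch (not from this source) applying the same fake-Job trick to
// the output format; setOutputFormatClass is the standard Job method, and the
// "mapreduce.outputformat.class" key mirrors the Hadoop 1 input-side property.
private static void setOutputFormatClass(Configuration conf) {
  if (Hadoop.isHadoop1()) {
    conf.set("mapreduce.outputformat.class",
        DatasetKeyOutputFormat.class.getName());
  } else {
    // set the class on a throwaway job backed by an empty conf, then copy
    // the entries it created into the real conf
    Job fakeJob = Hadoop.Job.newInstance.invoke(new Configuration(false));
    fakeJob.setOutputFormatClass(DatasetKeyOutputFormat.class);
    for (Map.Entry<String, String> entry : fakeJob.getConfiguration()) {
      conf.set(entry.getKey(), entry.getValue());
    }
  }
}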
private static <E> boolean usePerTaskAttemptDatasets(View<E> target,
                                                     Configuration conf) {
  // for performance, skip the intermediate task attempt and job output
  // datasets and write directly to the target dataset when the file system
  // does not support efficient renaming (such as S3)
  if (!FileSystemUtil.supportsRename(
      URI.create(target.getUri().getSchemeSpecificPart()), conf)) {
    return false;
  }
  // new API output committers are not called properly in Hadoop 1
  return !Hadoop.isHadoop1() && target.getDataset() instanceof Mergeable;
}
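// A hedged sketch (not from this source) of what a rename-support probe like
// FileSystemUtil.supportsRename could look like; the scheme list below is an
// illustrative assumption, chosen because blob stores such as S3 implement
// rename as copy-then-delete rather than as a cheap metadata operation.
static boolean supportsEfficientRename(URI uri, Configuration conf) {
  try {
    String scheme = FileSystem.get(uri, conf).getUri().getScheme();
    return !("s3".equals(scheme) || "s3n".equals(scheme) || "s3a".equals(scheme));
  } catch (IOException e) {
    throw new RuntimeException("Cannot determine the file system for " + uri, e);
  }
}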
@SuppressWarnings("unchecked") private static boolean canReplace(View<?> view) { if (Hadoop.isHadoop1()) { // can't use replace because it is called in the OutputCommitter. return false; } Dataset<?> dataset = view.getDataset(); return (dataset instanceof Replaceable && ((Replaceable<View<?>>) dataset).canReplace(view)); } }
@SuppressWarnings("unchecked") private static boolean canReplace(View<?> view) { if (Hadoop.isHadoop1()) { // can't use replace because it is called in the OutputCommitter. return false; } Dataset<?> dataset = view.getDataset(); return (dataset instanceof Replaceable && ((Replaceable<View<?>>) dataset).canReplace(view)); } }
@Test(expected = DatasetException.class)
public void testJobFailsWithEmptyButReadyOutput() throws Exception {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  populateInputDataset();
  // don't populate the output, but signal it as ready
  ((Signalable) outputDataset).signalReady();
  Job job = createJob();
  job.waitForCompletion(true);
}
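// A hedged reconstruction of the createJob() helper used above, pieced
// together from the job wiring in the tests that follow; the exact helper
// body is an assumption.
@SuppressWarnings("deprecation")
private Job createJob() throws Exception {
  Job job = new Job();
  DatasetKeyInputFormat.configure(job)
      .readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  DatasetKeyOutputFormat.configure(job)
      .appendTo(outputDataset)
      .withType(GenericData.Record.class);
  return job;
}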
@Test @SuppressWarnings("deprecation") public void testJobOutputDatasetSignaledReady() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertTrue("Output dataset should be signaled ready", ((Signalable)outputDataset).isReady()); }
@Test @SuppressWarnings("deprecation") public void testSignalReadyOutputView() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot"); DatasetKeyOutputFormat.configure(job).appendTo(outputView).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertFalse("Output dataset should not be signaled ready", ((Signalable)outputDataset).isReady()); Assert.assertTrue("Output view should be signaled ready", ((Signalable)outputView).isReady()); }
@Test
public void testAbsoluteWebHdfs() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  String webhdfsAuth = getConfiguration().get(
      DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY);
  DatasetRepository repo = DatasetRepositories.repositoryFor(
      "repo:webhdfs://" + webhdfsAuth + "/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);
  Dataset<Object> ds = Datasets.<Object, Dataset<Object>>load(
      "dataset:webhdfs://" + webhdfsAuth + "/tmp/data/ns/test", Object.class);
  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("webhdfs://" + webhdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace", "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name", "test", ds.getName());
  repo.delete("ns", "test");
}