/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * dataset or view identified by the given URI string.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". See {@link Datasets} for details.
 *
 * @param uri a dataset or view URI string
 * @return this for method chaining
 */
public ConfigBuilder readFrom(String uri) {
  // parse first, then delegate to the URI overload
  URI parsed = URI.create(uri);
  return readFrom(parsed);
}
/**
 * Runs a MapReduce job configured with {@code appendTo} against a
 * pre-populated output dataset, then verifies the combined output.
 */
@Test
@SuppressWarnings("deprecation")
public void testJobAppend() throws Exception {
  populateInputDataset();
  // NOTE(review): appendTo is used below, so this pre-existing output should
  // be kept, not overwritten — confirm against checkOutput(true) semantics
  populateOutputDataset();
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  // append job results to the existing dataset contents
  DatasetKeyOutputFormat.configure(job).appendTo(outputDataset)
      .withType(GenericData.Record.class);
  Assert.assertTrue(job.waitForCompletion(true));
  checkOutput(true);
}
/**
 * Configures the given {@code Job} to use {@code DatasetKeyInputFormat}
 * and returns a builder for adding further input configuration.
 *
 * @param job the {@code Job} to configure
 * @return a {@link ConfigBuilder} bound to the job's configuration
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Job job) {
  job.setInputFormatClass(DatasetKeyInputFormat.class);
  // fetch the configuration through the Hadoop compatibility shim
  Configuration jobConf = Hadoop.JobContext.getConfiguration.invoke(job);
  return new ConfigBuilder(jobConf);
}
// Configure the job's input format to read from the input dataset
// (no explicit type: the dataset's own schema/type is used).
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
/** * Adds configuration for {@code DatasetKeyInputFormat} to read from the * given {@link Dataset} or {@link View} instance. * * @param view a dataset or view * @return this for method chaining */ public ConfigBuilder readFrom(View<?> view) { DatasetDescriptor descriptor = view.getDataset().getDescriptor(); // if this is a partitioned dataset, add the partition location if (view instanceof FileSystemDataset) { conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation())); } // add descriptor properties to the config for (String property : descriptor.listProperties()) { conf.set(property, descriptor.getProperty(property)); } if (DataModelUtil.isGeneric(view.getType())) { Schema datasetSchema = view.getDataset().getDescriptor().getSchema(); // only set the read schema if the view is a projection if (!datasetSchema.equals(view.getSchema())) { withSchema(view.getSchema()); } } else { withType(view.getType()); } conf.set(KITE_INPUT_URI, view.getUri().toString()); return this; }
/** * Adds configuration for {@code DatasetKeyInputFormat} to read from the * given {@link Dataset} or {@link View} instance. * * @param view a dataset or view * @return this for method chaining */ public ConfigBuilder readFrom(View<?> view) { DatasetDescriptor descriptor = view.getDataset().getDescriptor(); // if this is a partitioned dataset, add the partition location if (view instanceof FileSystemDataset) { conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation())); } // add descriptor properties to the config for (String property : descriptor.listProperties()) { conf.set(property, descriptor.getProperty(property)); } if (DataModelUtil.isGeneric(view.getType())) { Schema datasetSchema = view.getDataset().getDescriptor().getSchema(); // only set the read schema if the view is a projection if (!datasetSchema.equals(view.getSchema())) { withSchema(view.getSchema()); } } else { withType(view.getType()); } conf.set(KITE_INPUT_URI, view.getUri().toString()); return this; }
// Create the output dataset in the "default" namespace, then configure the
// job's input format to read from the existing input dataset.
Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
/**
 * Verifies that completing a job writing to an output <em>view</em> signals
 * only that view ready — not the enclosing dataset.
 */
@Test
@SuppressWarnings("deprecation")
public void testSignalReadyOutputView() throws Exception {
  // signaling is not supported on Hadoop 1
  Assume.assumeTrue(!Hadoop.isHadoop1());
  populateInputDataset();
  // NOTE(review): appendTo is used below, so this pre-existing output is
  // appended to, not overwritten — confirm
  populateOutputDataset();
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  // restrict output to a view so only that view should be signaled
  View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot");
  DatasetKeyOutputFormat.configure(job).appendTo(outputView)
      .withType(GenericData.Record.class);
  Assert.assertTrue(job.waitForCompletion(true));
  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable)outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable)outputView).isReady());
}
/**
 * Runs a MapReduce job configured with {@code overwrite} and verifies the
 * previous dataset contents are replaced by the job's output.
 */
@Test
@SuppressWarnings("deprecation")
public void testJobOverwrite() throws Exception {
  populateInputDataset();
  populateOutputDataset(); // existing output will be overwritten
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  DatasetKeyOutputFormat.configure(job).overwrite(outputDataset)
      .withType(GenericData.Record.class);
  Assert.assertTrue(job.waitForCompletion(true));
  checkOutput(false); // false: pre-existing records must be gone
}
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * dataset or view identified by the given URI.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". See {@link Datasets} for details.
 *
 * @param uri a dataset or view URI
 * @return this for method chaining
 */
public ConfigBuilder readFrom(URI uri) {
  // resolve the URI to a concrete view, then delegate
  View<?> target = Datasets.load(uri);
  return readFrom(target);
}
/**
 * Verifies that a successful job writing to a whole dataset signals that
 * dataset ready.
 */
@Test
@SuppressWarnings("deprecation")
public void testJobOutputDatasetSignaledReady() throws Exception {
  // signaling is not supported on Hadoop 1
  Assume.assumeTrue(!Hadoop.isHadoop1());
  populateInputDataset();
  populateOutputDataset(); // existing output will be overwritten
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  DatasetKeyOutputFormat.configure(job).overwrite(outputDataset)
      .withType(GenericData.Record.class);
  Assert.assertTrue(job.waitForCompletion(true));
  Assert.assertTrue("Output dataset should be signaled ready",
      ((Signalable)outputDataset).isReady());
}
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * dataset or view identified by the given URI string.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". See {@link Datasets} for details.
 *
 * @param uri a dataset or view URI string
 * @return this for method chaining
 */
public ConfigBuilder readFrom(String uri) {
  // parse first, then delegate to the URI overload
  URI parsed = URI.create(uri);
  return readFrom(parsed);
}
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * dataset or view identified by the given URI.
 * <p>
 * URI formats are defined by {@link Dataset} implementations, but must
 * begin with "dataset:" or "view:". See {@link Datasets} for details.
 *
 * @param uri a dataset or view URI
 * @return this for method chaining
 */
public ConfigBuilder readFrom(URI uri) {
  // resolve the URI to a concrete view, then delegate
  View<?> target = Datasets.load(uri);
  return readFrom(target);
}
/**
 * Configures the given {@code Job} to use {@code DatasetKeyInputFormat}
 * and returns a builder for adding further input configuration.
 *
 * @param job the {@code Job} to configure
 * @return a {@link ConfigBuilder} bound to the job's configuration
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Job job) {
  job.setInputFormatClass(DatasetKeyInputFormat.class);
  // fetch the configuration through the Hadoop compatibility shim
  Configuration jobConf = Hadoop.JobContext.getConfiguration.invoke(job);
  return new ConfigBuilder(jobConf);
}
/**
 * Adds the settings required to use {@code DatasetKeyInputFormat} to the
 * supplied {@code Configuration} and returns a builder for further setup.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} wrapping {@code conf}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  ConfigBuilder builder = new ConfigBuilder(conf);
  return builder;
}
/**
 * Adds the settings required to use {@code DatasetKeyInputFormat} to the
 * supplied {@code Configuration} and returns a builder for further setup.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} wrapping {@code conf}
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  ConfigBuilder builder = new ConfigBuilder(conf);
  return builder;
}
/**
 * Creates a source-target over the given view.
 *
 * @param view the view to read entities from
 * @param avroType the Avro type describing the view's entities
 */
public DatasetSourceTarget(View<E> view, AvroType<E> avroType) {
  super(view);
  this.view = view;
  this.avroType = avroType;
  // start from an empty Configuration so the bundle carries only the
  // settings added by configure(...).readFrom(view)
  Configuration temp = new Configuration(false /* use an empty conf */ );
  DatasetKeyInputFormat.configure(temp).readFrom(view);
  this.formatBundle = inputBundle(temp);
}
/**
 * Builds a Job wired to read from the input dataset and write to the
 * output dataset via the Kite key input/output formats.
 *
 * @return the configured (not yet submitted) Job
 */
@SuppressWarnings("deprecation")
private Job createJob() throws Exception {
  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset)
      .withType(GenericData.Record.class);
  return job;
}