@SuppressWarnings({"unchecked", "deprecation"}) public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException { String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null); String fileName = getUniqueName(job, baseFileName); Schema schema = null; String schemastr = job.get(MO_PREFIX+nameOutput+".schema",null); if (schemastr!=null) schema = Schema.parse(schemastr); JobConf outputConf = new JobConf(job); outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput)); boolean isMapOnly = job.getNumReduceTasks() == 0; if (schema != null) { if (isMapOnly) AvroJob.setMapOutputSchema(outputConf, schema); else AvroJob.setOutputSchema(outputConf, schema); } OutputFormat outputFormat = outputConf.getOutputFormat(); return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3); } }
@Test public void testNonAvroMapper() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, file().toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroMapper.class); // reducer is default, identity // configure output for avro FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
@Test public void testSequenceFileInputFormat() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for Avro from sequence file AvroJob.setInputSequenceFile(job); FileInputFormat.setInputPaths(job, file().toURI().toString()); AvroJob.setInputSchema(job, SCHEMA); // mapper is default, identity // reducer is default, identity // configure output for avro AvroJob.setOutputSchema(job, SCHEMA); FileOutputFormat.setOutputPath(job, outputPath); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
@Test public void testNonAvroMapOnly() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, file().toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroOnlyMapper.class); // configure output for avro job.setNumReduceTasks(0); // map-only FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
/** Uses default mapper with no reduces for a map-only identity job. */ @Test @SuppressWarnings("deprecation") public void testMapOnly() throws Exception { JobConf job = new JobConf(); String inDir = System.getProperty("share.dir","../../../share")+"/test/data"; Path input = new Path(inDir+"/weather.avro"); Path output = new Path("target/test/weather-ident"); output.getFileSystem(job).delete(output); job.setJobName("identity map weather"); AvroJob.setInputSchema(job, Weather.SCHEMA$); AvroJob.setOutputSchema(job, Weather.SCHEMA$); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); job.setNumReduceTasks(0); // map-only JobClient.runJob(job); // check output is correct DatumReader<Weather> reader = new SpecificDatumReader<>(); DataFileReader<Weather> check = new DataFileReader<> (new File(inDir + "/weather.avro"), reader); DataFileReader<Weather> sorted = new DataFileReader<> (new File(output.toString() + "/part-00000.avro"), reader); for (Weather w : sorted) assertEquals(check.next(), w); check.close(); sorted.close(); }
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(DIR.getRoot().getPath() + "/out"); outputPath.getFileSystem(job).delete(outputPath); job.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in"); job.setMapperClass(AvroTestConverter.class); job.setNumReduceTasks(0); FileOutputFormat.setOutputPath(job, outputPath); System.out.println(createSchema()); AvroJob.setOutputSchema(job, Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema())); job.setOutputFormat(AvroOutputFormat.class); JobClient.runJob(job); } }
@SuppressWarnings("deprecation") public void testJobNoreducer() throws Exception { JobConf job = new JobConf(); job.setNumReduceTasks(0); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(new File(INPUT_DIR.getRoot(),"lines.avro")); job.setJobName("AvroMultipleOutputs_noreducer"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); FileInputFormat.setInputPaths(job, new Path(INPUT_DIR.getRoot().toString())); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); JobClient.runJob(job); }
public void testOutputFormat() throws Exception { JobConf job = new JobConf(); WordCountUtil wordCountUtil = new WordCountUtil("trevniMapredTest"); wordCountUtil.writeLinesFile(); AvroJob.setInputSchema(job, STRING); AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING,LONG)); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in")); FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out")); FileOutputFormat.setCompressOutput(job, true); job.setOutputFormat(AvroTrevniOutputFormat.class); JobClient.runJob(job); wordCountUtil.validateCountsFile(); }
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); WordCountUtil.writeLinesFile(pathIn + "/lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(pathIn)); FileOutputFormat.setOutputPath(job, new Path(pathOut)); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(pathOut, "part-00000.avro")); }
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path inputPath1 = new Path(INPUT_DIR_1.getRoot().getPath()); Path inputPath2 = new Path(INPUT_DIR_2.getRoot().getPath()); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); writeNamesFiles(new File(inputPath1.toUri().getPath())); writeBalancesFiles(new File(inputPath2.toUri().getPath())); job.setJobName("multiple-inputs-join"); AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class, ReflectData.get().getSchema(NamesRecord.class)); AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class, ReflectData.get().getSchema(BalancesRecord.class)); Schema keySchema = ReflectData.get().getSchema(KeyRecord.class); Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class); AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema)); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class)); AvroJob.setReducerClass(job, ReduceImpl.class); job.setNumReduceTasks(1); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); JobClient.runJob(job); validateCompleteFile(new File(OUTPUT_DIR.getRoot(), "part-00000.avro")); }
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = "target/testReflectJob"; Path inputPath = new Path(dir + "/in"); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); inputPath.getFileSystem(job).delete(inputPath); writeLinesFile(new File(dir+"/in")); job.setJobName("reflect"); AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class)); AvroJob.setMapOutputSchema (job, new Pair(new Text(""), new Count(0L)).getSchema()); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class)); AvroJob.setMapperClass(job, MapImpl.class); //AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); // use reflection JobClient.runJob(job); validateCountsFile(new File(new File(dir, "out"), "part-00000.avro")); }
AvroJob.setMapOutputSchema (job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL))); AvroJob.setOutputSchema(job, Weather.SCHEMA$);
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); File fileIn = new File(pathIn, "lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(fileIn); job.setJobName("AvroMultipleOutputs"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, pathIn); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroOutputFormat.class, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(outputPath.toString(), "/part-00000.avro")); }
@SuppressWarnings({"unchecked", "deprecation"}) public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException { String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null); String fileName = getUniqueName(job, baseFileName); Schema schema = null; String schemastr = job.get(MO_PREFIX+nameOutput+".schema",null); if (schemastr!=null) schema = Schema.parse(schemastr); JobConf outputConf = new JobConf(job); outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput)); boolean isMapOnly = job.getNumReduceTasks() == 0; if (schema != null) { if (isMapOnly) AvroJob.setMapOutputSchema(outputConf, schema); else AvroJob.setOutputSchema(outputConf, schema); } OutputFormat outputFormat = outputConf.getOutputFormat(); return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3); } }
/** * Creates a JobConf for a map-only job. Automatically loads the schema from each input file. * * @param mapperClass AvroMapper subclass implementing the map phase * @param outputSchema Schema of the mapper output * @return A configured JobConf. * @throws IOException * @throws URISyntaxException */ public JobConf createJobConf(Class<? extends AvroMapper> mapperClass, Schema outputSchema) throws IOException, URISyntaxException { JobConf conf = createJobConf(); AvroJob.setMapperClass(conf, mapperClass); AvroJob.setReducerClass(conf, AvroReducer.class); AvroJob.setOutputSchema(conf, outputSchema); conf.setNumReduceTasks(0); return conf; }
/** * Creates a JobConf for a map-reduce job. Loads the input schema from the input files. * * @param mapperClass AvroMapper subclass for the mapper. * @param reducerClass AvroReducer subclass for the reducer. * @param mapperOutputSchema Mapper output schema. Must be an instance of org.apache.avro.mapred.Pair * @param outputSchema Reducer output schema * @return A configured JobConf. * @throws IOException * @throws URISyntaxException */ public JobConf createJobConf(Class<? extends AvroMapper> mapperClass, Class<? extends AvroReducer> reducerClass, Schema mapperOutputSchema, Schema outputSchema) throws IOException, URISyntaxException { JobConf conf = createJobConf(); AvroJob.setMapperClass(conf, mapperClass); AvroJob.setReducerClass(conf, reducerClass); AvroJob.setMapOutputSchema(conf, mapperOutputSchema); AvroJob.setOutputSchema(conf, outputSchema); return conf; }
public static void main(String... args) throws Exception { JobConf job = new JobConf(); job.setJarByClass(SmallFilesMapReduce.class); Path input = new Path(args[0]); Path output = new Path(args[1]); output.getFileSystem(job).delete(output, true); AvroJob.setInputSchema(job, Stock.SCHEMA$); //<co id="ch03_avro_mr_comment1"/> AvroJob.setMapOutputSchema(job, Pair.getPairSchema(Stock.SCHEMA$, Schema.create(Schema.Type.NULL))); AvroJob.setOutputSchema(job, Stock.SCHEMA$); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); AvroJob.setMapperClass(job, Mapper.class); //<co id="ch03_smallfilemr_comment2"/> AvroJob.setReducerClass(job, Reducer.class); FileOutputFormat.setCompressOutput(job, true); AvroJob.setOutputCodec(job, SNAPPY_CODEC); JobClient.runJob(job); }
/** * Creates a JobConf for a map-only job with an explicitly set input Schema. * * @param mapperClass AvroMapper subclass implementing the map phase * @param inputSchema Schema of the input data. * @param outputSchema Schema of the mapper output * @return A configured JobConf. * @throws IOException * @throws URISyntaxException */ public JobConf createJobConf(Class<? extends AvroMapper> mapperClass, Schema inputSchema, Schema outputSchema) throws IOException, URISyntaxException { JobConf conf = createJobConf(); AvroJob.setMapperClass(conf, mapperClass); AvroJob.setReducerClass(conf, AvroReducer.class); AvroJob.setInputSchema(conf, inputSchema); AvroJob.setOutputSchema(conf, outputSchema); conf.setNumReduceTasks(0); return conf; }
public void testOutputFormat() throws Exception { JobConf job = new JobConf(); WordCountUtil.writeLinesFile(); AvroJob.setInputSchema(job, STRING); AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING,LONG)); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(DIR + "/in")); FileOutputFormat.setOutputPath(job, new Path(DIR + "/out")); FileOutputFormat.setCompressOutput(job, true); job.setOutputFormat(AvroTrevniOutputFormat.class); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }