public int run(String[] args) throws Exception { if(args.length != 3) Utils.croak("USAGE: GenerateData input-file output-dir value-size"); JobConf conf = new JobConf(getConf(), GenerateData.class); conf.setJobName("generate-data"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(GenerateDataMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setNumReduceTasks(0); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); Path inputPath = new Path(args[0]); FileInputFormat.setInputPaths(conf, inputPath); Path outputPath = new Path(args[1]); // delete output path if it already exists FileSystem fs = outputPath.getFileSystem(conf); if(fs.exists(outputPath)) fs.delete(outputPath, true); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInt("value.size", Integer.parseInt(args[2])); JobClient.runJob(conf); return 0; }
/** * Gets fully configured JobConf instance. * * @param input input file name. * @param output output directory name. * @return Job configuration */ public static JobConf getJob(String input, String output) { JobConf conf = new JobConf(HadoopWordCount1.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); setTasksClasses(conf, true, true, true); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); return conf; }
if(!isAvro) { conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class); conf.setMapperClass(mapperClass); conf.setMapOutputKeyClass(BytesWritable.class); conf.setMapOutputValueClass(BytesWritable.class); conf.setReducerClass(HadoopStoreBuilderReducer.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); conf.setJarByClass(getClass()); conf.setReduceSpeculativeExecution(false); FileInputFormat.setInputPaths(conf, inputPath); conf.set("final.output.dir", outputDir.toString()); conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType)); conf.set("dfs.umaskmode", "002"); FileOutputFormat.setOutputPath(conf, tempDir); conf.setOutputKeyClass(ByteBuffer.class); conf.setOutputValueClass(ByteBuffer.class); conf.setReducerClass(AvroStoreBuilderReducer.class); Path directoryPath = new Path(outputDir.toString(), directoryName);
@Test /** * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat * and AvroTextOutputFormat to produce a sorted "bytes" Avro file. */ public void testSort() throws Exception { JobConf job = new JobConf(); String inputPath = INPUT_DIR.getRoot().getPath(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesBytesFile(inputPath); job.setInputFormat(AvroAsTextInputFormat.class); job.setOutputFormat(AvroTextOutputFormat.class); job.setOutputKeyClass(Text.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, outputPath); JobClient.runJob(job); WordCountUtil.validateSortedFile(outputPath.toString() + "/part-00000.avro"); }
@Test public void testNonAvroMapper() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, file().toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroMapper.class); // reducer is default, identity // configure output for avro FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
@Test public void testNonAvroMapOnly() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, file().toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroOnlyMapper.class); // configure output for avro job.setNumReduceTasks(0); // map-only FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
@Test public void testNonAvroReducer() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for Avro from sequence file AvroJob.setInputSequenceFile(job); AvroJob.setInputSchema(job, SCHEMA); FileInputFormat.setInputPaths(job, file().toURI().toString()); // mapper is default, identity // use a hadoop reducer that consumes Avro input AvroJob.setMapOutputSchema(job, SCHEMA); job.setReducerClass(NonAvroReducer.class); // configure outputPath for non-Avro SequenceFile job.setOutputFormat(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); // output key/value classes are default, LongWritable/Text JobClient.runJob(job); checkFile(new SequenceFileReader<> (new File(outputPath.toString() + "/part-00000"))); }
@Test public void testSequenceFileInputFormat() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); // configure input for Avro from sequence file AvroJob.setInputSequenceFile(job); FileInputFormat.setInputPaths(job, file().toURI().toString()); AvroJob.setInputSchema(job, SCHEMA); // mapper is default, identity // reducer is default, identity // configure output for avro AvroJob.setOutputSchema(job, SCHEMA); FileOutputFormat.setOutputPath(job, outputPath); JobClient.runJob(job); checkFile(new DataFileReader<> (new File(outputPath.toString() + "/part-00000.avro"), new SpecificDatumReader<>())); }
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(DIR.getRoot().getPath() + "/out"); outputPath.getFileSystem(job).delete(outputPath); job.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in"); job.setMapperClass(AvroTestConverter.class); job.setNumReduceTasks(0); FileOutputFormat.setOutputPath(job, outputPath); System.out.println(createSchema()); AvroJob.setOutputSchema(job, Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema())); job.setOutputFormat(AvroOutputFormat.class); JobClient.runJob(job); } }
/** Uses default mapper with no reduces for a map-only identity job. */ @Test @SuppressWarnings("deprecation") public void testMapOnly() throws Exception { JobConf job = new JobConf(); String inDir = System.getProperty("share.dir","../../../share")+"/test/data"; Path input = new Path(inDir+"/weather.avro"); Path output = new Path("target/test/weather-ident"); output.getFileSystem(job).delete(output); job.setJobName("identity map weather"); AvroJob.setInputSchema(job, Weather.SCHEMA$); AvroJob.setOutputSchema(job, Weather.SCHEMA$); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); job.setNumReduceTasks(0); // map-only JobClient.runJob(job); // check output is correct DatumReader<Weather> reader = new SpecificDatumReader<>(); DataFileReader<Weather> check = new DataFileReader<> (new File(inDir + "/weather.avro"), reader); DataFileReader<Weather> sorted = new DataFileReader<> (new File(output.toString() + "/part-00000.avro"), reader); for (Weather w : sorted) assertEquals(check.next(), w); check.close(); sorted.close(); }
@SuppressWarnings("deprecation") public void testJobNoreducer() throws Exception { JobConf job = new JobConf(); job.setNumReduceTasks(0); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(new File(INPUT_DIR.getRoot(),"lines.avro")); job.setJobName("AvroMultipleOutputs_noreducer"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); FileInputFormat.setInputPaths(job, new Path(INPUT_DIR.getRoot().toString())); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); JobClient.runJob(job); }
public void testOutputFormat() throws Exception { JobConf job = new JobConf(); WordCountUtil wordCountUtil = new WordCountUtil("trevniMapredTest"); wordCountUtil.writeLinesFile(); AvroJob.setInputSchema(job, STRING); AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING,LONG)); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in")); FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out")); FileOutputFormat.setCompressOutput(job, true); job.setOutputFormat(AvroTrevniOutputFormat.class); JobClient.runJob(job); wordCountUtil.validateCountsFile(); }
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); WordCountUtil.writeLinesFile(pathIn + "/lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(pathIn)); FileOutputFormat.setOutputPath(job, new Path(pathOut)); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(pathOut, "part-00000.avro")); }
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = "target/testReflectJob"; Path inputPath = new Path(dir + "/in"); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); inputPath.getFileSystem(job).delete(inputPath); writeLinesFile(new File(dir+"/in")); job.setJobName("reflect"); AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class)); AvroJob.setMapOutputSchema (job, new Pair(new Text(""), new Count(0L)).getSchema()); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class)); AvroJob.setMapperClass(job, MapImpl.class); //AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); // use reflection JobClient.runJob(job); validateCountsFile(new File(new File(dir, "out"), "part-00000.avro")); }
@Test @SuppressWarnings("deprecation") public void testSort() throws Exception { JobConf job = new JobConf(); String inDir = "../../../share/test/data"; Path input = new Path(inDir+"/weather.avro"); Path output = new Path("target/test/weather-sort"); AvroJob.setReducerClass(job, SortReducer.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); AvroJob.setOutputCodec(job, SNAPPY_CODEC);
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); File fileIn = new File(pathIn, "lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(fileIn); job.setJobName("AvroMultipleOutputs"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, pathIn); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroOutputFormat.class, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(outputPath.toString(), "/part-00000.avro")); }
protected JobConf configStage1() throws Exception { final JobConf conf = new JobConf(getConf(), ConCmptBlock.class); conf.set("block_width", "" + block_width); conf.set("recursive_diagmult", "" + recursive_diagmult); conf.setJobName("ConCmptBlock_pass1"); conf.setMapperClass(MapStage1.class); conf.setReducerClass(RedStage1.class); FileInputFormat.setInputPaths(conf, edge_path, curbm_path); FileOutputFormat.setOutputPath(conf, tempbm_path); conf.setNumReduceTasks( nreducers ); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); return conf; }
protected JobConf configStage2 () throws Exception { final JobConf conf = new JobConf(getConf(), ConCmptBlock.class); conf.set("block_width", "" + block_width); conf.setJobName("ConCmptBlock_pass2"); conf.setMapperClass(MapStage2.class); conf.setReducerClass(RedStage2.class); FileInputFormat.setInputPaths(conf, tempbm_path); FileOutputFormat.setOutputPath(conf, nextbm_path); conf.setNumReduceTasks( nreducers ); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); return conf; }
protected JobConf configStage1() throws Exception { final JobConf conf = new JobConf(getConf(), PagerankPrep.class); conf.set("make_symmetric", "" + make_symmetric); conf.setJobName("PagerankPrep_Stage1"); conf.setMapperClass(MapStage1.class); conf.setReducerClass(RedStage1.class); FileInputFormat.setInputPaths(conf, edge_path); FileOutputFormat.setOutputPath(conf, output_path); conf.setNumReduceTasks( nreducers ); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); return conf; } }
protected JobConf configStage1 () throws Exception { final JobConf conf = new JobConf(getConf(), RWRBlock.class); conf.set("number_nodes", "" + number_nodes); conf.set("mixing_c", "" + mixing_c); conf.set("block_width", "" + block_width); conf.setJobName("RWRBlock_Stage1"); conf.setMapperClass(MapStage1.class); conf.setReducerClass(RedStage1.class); fs.delete(tempmv_path, true); FileInputFormat.setInputPaths(conf, edge_path, vector_path); FileOutputFormat.setOutputPath(conf, tempmv_path); conf.setNumReduceTasks( nreducers ); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); return conf; }