@Test
public void testNonAvroMapOnly() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroOnlyMapper.class);

  // configure output for avro
  job.setNumReduceTasks(0); // map-only
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>(new File(outputPath.toString() + "/part-00000.avro"),
                                 new SpecificDatumReader<>()));
}
this.checkSumDigestValue[chunkId] = CheckSum.getInstance(checkSumType);
this.position[chunkId] = 0;
this.taskIndexFileName[chunkId] = new Path(FileOutputFormat.getOutputPath(conf),
                                           getStoreName() + "." + Integer.toString(chunkId)
                                           + "_" + this.taskId + INDEX_FILE_EXTENSION + fileExtension);
this.taskValueFileName[chunkId] = new Path(FileOutputFormat.getOutputPath(conf),
                                           getStoreName() + "." + Integer.toString(chunkId)
                                           + "_" + fileExtension);
if(this.fs == null)
    this.fs = this.taskIndexFileName[chunkId].getFileSystem(conf);
if(isValidCompressionEnabled) {
    this.indexFileStream[chunkId] = new DataOutputStream(new BufferedOutputStream(
            new GZIPOutputStream(fs.create(this.taskIndexFileName[chunkId]), DEFAULT_BUFFER_SIZE)));
    this.valueFileStream[chunkId] = new DataOutputStream(new BufferedOutputStream(
            new GZIPOutputStream(fs.create(this.taskValueFileName[chunkId]), DEFAULT_BUFFER_SIZE)));
} else {
    this.indexFileStream[chunkId] = fs.create(this.taskIndexFileName[chunkId]);
    this.valueFileStream[chunkId] = fs.create(this.taskValueFileName[chunkId]);
}
@SuppressWarnings("unchecked") public RecordWriter<TetherData, NullWritable> getRecordWriter(FileSystem ignore, JobConf job, String name, Progressable prog) throws IOException { Schema schema = AvroJob.getOutputSchema(job); final DataFileWriter writer = new DataFileWriter(new GenericDatumWriter()); if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, CodecFactory.DEFAULT_DEFLATE_LEVEL); writer.setCodec(CodecFactory.deflateCodec(level)); } Path path = FileOutputFormat.getTaskOutputPath(job, name+AvroOutputFormat.EXT); writer.create(schema, path.getFileSystem(job).create(path)); return new RecordWriter<TetherData, NullWritable>() { public void write(TetherData datum, NullWritable ignore) throws IOException { writer.appendEncoded(datum.buffer()); } public void close(Reporter reporter) throws IOException { writer.close(); } }; }
conf.setJobName(getId());
conf.setNumReduceTasks(0);
conf.set("hadoop.job.ugi", hadoop_ugi);

FileSystem fs = FileSystem.get(conf);
FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter);
path = statuses[statuses.length - 1].getPath().toString();
System.out.println("Using latest folder: " + path);
HadoopUtils.addAllSubPaths(conf, new Path(path));

FileOutputFormat.setOutputPath(conf, new Path(location));
// a second FileSystem variable is needed here; re-declaring "fs" would not compile
FileSystem outputFs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
outputFs.delete(FileOutputFormat.getOutputPath(conf), true);
public void testOutputFormat() throws Exception {
  JobConf job = new JobConf();
  WordCountUtil wordCountUtil = new WordCountUtil("trevniMapredTest");
  wordCountUtil.writeLinesFile();

  AvroJob.setInputSchema(job, STRING);
  AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING, LONG));
  AvroJob.setMapperClass(job, MapImpl.class);
  AvroJob.setCombinerClass(job, ReduceImpl.class);
  AvroJob.setReducerClass(job, ReduceImpl.class);

  FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
  FileOutputFormat.setCompressOutput(job, true);
  job.setOutputFormat(AvroTrevniOutputFormat.class);

  JobClient.runJob(job);
  wordCountUtil.validateCountsFile();
}
JobConf conf = prepareJobConf(baseJobConf);
FileSystem fs = outputDir.getFileSystem(conf);
if(fs.exists(outputDir)) {
    info("Deleting previous output in " + outputDir + " for building store "
         + this.storeDef.getName());
    fs.delete(outputDir, true);
}

conf.setJarByClass(getClass());
conf.setReduceSpeculativeExecution(false);
FileInputFormat.setInputPaths(conf, inputPath);
conf.set("final.output.dir", outputDir.toString());
conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
conf.set("dfs.umaskmode", "002");
FileOutputFormat.setOutputPath(conf, tempDir);

FileSystem outputFs = outputDir.getFileSystem(conf);
if(outputFs.exists(outputDir)) {
    throw new IOException("Final output directory already exists.");
}

JobClient jc = new JobClient(conf);
RunningJob runningJob = jc.submitJob(conf);
Counters counters;
if (!jc.monitorAndPrintJob(conf, runningJob)) {
    // ... (failure handling elided in this excerpt)
}
counters = runningJob.getCounters();
        // ... (enclosing record-writer construction elided in this excerpt)
        (org.apache.hadoop.mapred.RecordWriter) null, context);
} else {
    Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir"));
    Path childPath = new Path(parentDir,
            FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()),
                    context.getConfiguration().get("mapreduce.output.basename", "part")));
    // ... (these are the arguments to the elided getRecordWriter call)
            parentDir.getFileSystem(context.getConfiguration()),
            new JobConf(context.getConfiguration()),
            childPath.toString(),
            InternalUtil.createReporter(context)),
    // ...
}
public int run(String[] args) throws Exception {
    if(args.length != 3)
        Utils.croak("USAGE: GenerateData input-file output-dir value-size");

    JobConf conf = new JobConf(getConf(), GenerateData.class);
    conf.setJobName("generate-data");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(GenerateDataMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setNumReduceTasks(0);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conf, inputPath);
    Path outputPath = new Path(args[1]);

    // delete output path if it already exists
    FileSystem fs = outputPath.getFileSystem(conf);
    if(fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInt("value.size", Integer.parseInt(args[2]));

    JobClient.runJob(conf);
    return 0;
}
public void setupJob(JobConf conf) throws IOException {
  Path outputPath = FileOutputFormat.getOutputPath(conf);
  if (outputPath != null) {
    Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
    FileSystem fileSys = tmpDir.getFileSystem(conf);
    if (!fileSys.mkdirs(tmpDir)) {
      LOG.error("Mkdirs failed to create " + tmpDir.toString());
    }
  }
}
JobConf job = new JobConf(PagerankData.class);
String jobname = "Create pagerank links";
Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

job.setJobName(jobname);
setPageRankLinksOptions(job);

job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(0);

FileInputFormat.setInputPaths(job, dummy.getPath());
job.setInputFormat(NLineInputFormat.class);

// set both the deprecated and the current property name for compatibility
job.set("mapred.output.compression.type", "BLOCK");
job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
FileOutputFormat.setOutputPath(job, fout);

JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
/**
 * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat
 * and AvroTextOutputFormat to produce a sorted "bytes" Avro file.
 */
@Test
public void testSort() throws Exception {
  JobConf job = new JobConf();
  String inputPath = INPUT_DIR.getRoot().getPath();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  WordCountUtil.writeLinesBytesFile(inputPath);

  job.setInputFormat(AvroAsTextInputFormat.class);
  job.setOutputFormat(AvroTextOutputFormat.class);
  job.setOutputKeyClass(Text.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, outputPath);

  JobClient.runJob(job);

  WordCountUtil.validateSortedFile(outputPath.toString() + "/part-00000.avro");
}
public static void makeTempPath( Configuration conf ) throws IOException {
  // create job specific temporary directory in output path
  Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

  if( outputPath != null ) {
    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
    FileSystem fileSys = tmpDir.getFileSystem( conf );

    if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
      LOG.error( "mkdirs failed to create {}", tmpDir );
  }
}
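A quick, hedged sketch of the call pattern for a utility like makeTempPath: the output path below is a placeholder, and the configuration must point at an output directory first, since the method derives the temporary directory from FileOutputFormat.getOutputPath.

// Hypothetical usage: the temporary dir is created under the job output path.
JobConf conf = new JobConf();
FileOutputFormat.setOutputPath(conf, new Path("/tmp/example-out")); // placeholder path
makeTempPath(conf); // creates <output>/<TEMPORARY_PATH> if it does not already exist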
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path inputPath1 = new Path(INPUT_DIR_1.getRoot().getPath());
  Path inputPath2 = new Path(INPUT_DIR_2.getRoot().getPath());
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  writeNamesFiles(new File(inputPath1.toUri().getPath()));
  writeBalancesFiles(new File(inputPath2.toUri().getPath()));

  job.setJobName("multiple-inputs-join");
  AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class,
                                  ReflectData.get().getSchema(NamesRecord.class));
  AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class,
                                  ReflectData.get().getSchema(BalancesRecord.class));

  Schema keySchema = ReflectData.get().getSchema(KeyRecord.class);
  Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class);
  AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema));
  AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class));
  AvroJob.setReducerClass(job, ReduceImpl.class);
  job.setNumReduceTasks(1);

  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setReflect(job);

  JobClient.runJob(job);
  validateCompleteFile(new File(OUTPUT_DIR.getRoot(), "part-00000.avro"));
}
/**
 * Helper function to generate a {@link Path} for a file that is unique for
 * the task within the job output directory.
 *
 * <p>The path can be used to create custom files from within the map and
 * reduce tasks. The path name will be unique for each task. The path parent
 * will be the job output directory.</p>
 *
 * <p>This method uses the {@link #getUniqueName} method to make the file name
 * unique for the task.</p>
 *
 * @param conf the configuration for the job.
 * @param name the name for the file.
 * @return a unique path across all tasks of the job.
 */
public static Path getPathForCustomFile(JobConf conf, String name) {
  return new Path(getWorkOutputPath(conf), getUniqueName(conf, name));
}
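Since the Javadoc above describes how per-task custom files work, here is a minimal sketch of using getPathForCustomFile from a mapper. The SideFileMapper class, the "debug-log" file name, and the record format are all invented for illustration; only the FileOutputFormat call itself comes from the API above.

import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

// Hypothetical mapper that writes a per-task side file alongside its normal output.
public class SideFileMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private FSDataOutputStream sideFile;

  @Override
  public void configure(JobConf conf) {
    try {
      // Resolves to something like <output dir>/debug-log-m-00000, unique per task.
      Path sidePath = FileOutputFormat.getPathForCustomFile(conf, "debug-log");
      sideFile = sidePath.getFileSystem(conf).create(sidePath);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void map(LongWritable key, Text value,
                  OutputCollector<Text, IntWritable> out, Reporter reporter)
      throws IOException {
    sideFile.writeBytes("record at offset " + key.get() + "\n");
    out.collect(value, new IntWritable(1));
  }

  @Override
  public void close() throws IOException {
    sideFile.close();
  }
}

Because the parent is the job output directory rather than the task work directory, no rename on commit is needed for the side file, but a failed-and-retried task may leave partial files behind.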
File inputPath = new File(INPUT_DIR.getRoot(), "lines.avro");
JobConf job = new JobConf();
Path outputPath = new Path(outputPathStr);
outputPath.getFileSystem(job).delete(outputPath);

execargs.add("org.apache.avro.mapred.tether.WordCountTask");
FileInputFormat.addInputPaths(job, inputPath.toString());
FileOutputFormat.setOutputPath(job, outputPath);
TetherJob.setExecutable(job, exec, execargs, false);
job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString());
Path rankings = new Path(options.getResultPath(), RANKINGS);
Path fout = new Path(options.getResultPath(), USERVISITS);

JobConf job = new JobConf(HiveData.class);
String jobname = "Create uservisits";
job.setJobName(jobname);
setVisitsOptions(job);

Path uagentPath = new Path(options.getWorkPath(), uagentf);
DistributedCache.addCacheFile(uagentPath.toUri(), job);
DistributedCache.addCacheFile(searchkeyPath.toUri(), job);

job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);

job.set("mapred.output.compression.type", "BLOCK");
job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
FileOutputFormat.setOutputPath(job, fout);

log.info("Rankings file " + rankings + " as input");
log.info("Output file " + fout);
JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
public long produceSamples(Path samplePath, boolean textOutput) throws Exception {
  Path input = new Path(samplePath.toString() + "-seeds");
  this.numSamples = writeSeeds(input);
  LOG.info("Generating " + this.numSamples + " samples");

  JobConf jobConf = getJobConf();
  jobConf.set("genkmeansdataset.dimensions", Integer.toString(dimension));

  FileInputFormat.setInputPaths(jobConf, input);
  FileOutputFormat.setOutputPath(jobConf, samplePath);
  jobConf.setMapperClass(MapClass.class);

  // the branches differ only in the output format class
  if (textOutput) {
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(VectorWritable.class);
  } else {
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(VectorWritable.class);
  }
  jobConf.setNumReduceTasks(0);

  JobClient.runJob(jobConf);
  return this.numSamples;
}
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
    fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
    throw new IOException("Segment already parsed!");
}
/**
 * Gets a fully configured JobConf instance.
 *
 * @param input input file name.
 * @param output output directory name.
 * @return Job configuration.
 */
public static JobConf getJob(String input, String output) {
  JobConf conf = new JobConf(HadoopWordCount1.class);
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  setTasksClasses(conf, true, true, true);

  FileInputFormat.setInputPaths(conf, new Path(input));
  FileOutputFormat.setOutputPath(conf, new Path(output));

  return conf;
}
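A hedged usage sketch for getJob: the input and output paths are placeholders, and the driver simply hands the returned JobConf to the classic JobClient.runJob call used by the other snippets on this page.

// Hypothetical driver; paths are placeholders.
public static void main(String[] args) throws IOException {
  JobConf conf = HadoopWordCount1.getJob("/data/books", "/data/wc-out");
  RunningJob job = JobClient.runJob(conf); // blocks until the job finishes
  System.out.println("Succeeded: " + job.isSuccessful());
}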
/**
 * @param args command-line arguments: the output directory, the table name,
 *             and then the space-delimited column names.
 * @return the JobConf
 * @throws IOException
 */
public JobConf createSubmittableJob(String[] args) throws IOException {
  JobConf c = new JobConf(getConf(), getClass());
  c.setJobName(NAME);

  // Columns are space delimited
  StringBuilder sb = new StringBuilder();
  final int columnoffset = 2;
  for (int i = columnoffset; i < args.length; i++) {
    if (i > columnoffset) {
      sb.append(" ");
    }
    sb.append(args[i]);
  }

  // Second argument is the table name.
  TableMapReduceUtil.initTableMapJob(args[1], sb.toString(), RowCounterMapper.class,
      ImmutableBytesWritable.class, Result.class, c);
  c.setNumReduceTasks(0);

  // First arg is the output directory.
  FileOutputFormat.setOutputPath(c, new Path(args[0]));
  return c;
}
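To make the argument layout concrete, a hypothetical invocation (the table and column names are invented, and RowCounter is assumed to be the enclosing Tool implementation): args[0] is the output directory, args[1] the table, and everything after that the columns to scan.

// Hypothetical: count rows of "mytable" over columns cf:a and cf:b,
// running through ToolRunner so the tool's Configuration is populated.
ToolRunner.run(HBaseConfiguration.create(), new RowCounter(),
    new String[] { "/tmp/rowcounter-out", "mytable", "cf:a", "cf:b" });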