org.apache.hadoop.mapred.FileInputFormat.setInputPaths java code examples

Refine search

/**
 * Configures and runs the "generate-data" job.
 *
 * <p>Reads text input from the first argument and writes a sequence file of
 * {@code BytesWritable} keys and values to the directory named by the second
 * argument. An existing output directory is deleted first. The third argument
 * is stored in the job configuration under {@code value.size}.
 *
 * @param args {@code input-file output-dir value-size}
 * @return 0 once {@code JobClient.runJob} has returned
 * @throws Exception if the value size is not an integer, or if setting up or
 *         running the job fails
 */
public int run(String[] args) throws Exception {
  if(args.length != 3) {
    Utils.croak("USAGE: GenerateData input-file output-dir value-size");
  }

  // Parse the numeric argument before touching the file system, so that a
  // malformed value-size cannot cause an existing output directory to be
  // deleted by a run that then fails.
  int valueSize = Integer.parseInt(args[2]);

  JobConf conf = new JobConf(getConf(), GenerateData.class);
  conf.setJobName("generate-data");
  conf.setMapperClass(GenerateDataMapper.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumReduceTasks(0);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  // The output key/value classes are set exactly once; the earlier
  // Text/IntWritable settings were overwritten by these and had no effect.
  conf.setOutputKeyClass(BytesWritable.class);
  conf.setOutputValueClass(BytesWritable.class);
  conf.setInt("value.size", valueSize);

  Path inputPath = new Path(args[0]);
  FileInputFormat.setInputPaths(conf, inputPath);

  Path outputPath = new Path(args[1]);
  // delete output path if it already exists
  FileSystem fs = outputPath.getFileSystem(conf);
  if(fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(conf, outputPath);

  JobClient.runJob(conf);
  return 0;
}

/**
 * Builds the job configuration for the "wordcount" job.
 *
 * @param input input file name.
 * @param output output directory name.
 * @return Job configuration with key/value classes, task classes and the
 *         input and output paths set
 */
public static JobConf getJob(String input, String output) {
  final JobConf job = new JobConf(HadoopWordCount1.class);

  job.setJobName("wordcount");

  // Output record types.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  setTasksClasses(job, true, true, true);

  // Where the job reads from and writes to.
  final Path source = new Path(input);
  FileInputFormat.setInputPaths(job, source);

  final Path target = new Path(output);
  FileOutputFormat.setOutputPath(job, target);

  return job;
}

if(!isAvro) {
  conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
  conf.setMapperClass(mapperClass);
  conf.setMapOutputKeyClass(BytesWritable.class);
  conf.setMapOutputValueClass(BytesWritable.class);
  conf.setReducerClass(HadoopStoreBuilderReducer.class);
conf.setOutputKeyClass(BytesWritable.class);
conf.setOutputValueClass(BytesWritable.class);
conf.setJarByClass(getClass());
conf.setReduceSpeculativeExecution(false);
FileInputFormat.setInputPaths(conf, inputPath);
conf.set("final.output.dir", outputDir.toString());
conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
conf.set("dfs.umaskmode", "002");
FileOutputFormat.setOutputPath(conf, tempDir);
  conf.setOutputKeyClass(ByteBuffer.class);
  conf.setOutputValueClass(ByteBuffer.class);
  conf.setReducerClass(AvroStoreBuilderReducer.class);
  Path directoryPath = new Path(outputDir.toString(), directoryName);

/**
 * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat
 * and AvroTextOutputFormat to produce a sorted "bytes" Avro file.
 *
 * @throws Exception if preparing the input, running the job or validating
 *         the output fails
 */
@Test
public void testSort() throws Exception {
  JobConf job = new JobConf();
  String inputPath = INPUT_DIR.getRoot().getPath();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);
  WordCountUtil.writeLinesBytesFile(inputPath);

  job.setInputFormat(AvroAsTextInputFormat.class);
  job.setOutputFormat(AvroTextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, outputPath);

  JobClient.runJob(job);

  WordCountUtil.validateSortedFile(outputPath.toString() + "/part-00000.avro");
}

/**
 * Runs a job that reads a non-Avro sequence file, maps it with a Hadoop
 * mapper that emits Avro output, uses the default reducer, and then checks
 * the resulting {@code part-00000.avro} file.
 *
 * @throws Exception if running the job or checking the output fails
 */
@Test
public void testNonAvroMapper() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);

  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroMapper.class);

  // reducer is default, identity

  // configure output for avro
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>
      (new File(outputPath.toString() + "/part-00000.avro"),
       new SpecificDatumReader<>()));
}

/**
 * Runs a map-only job that reads a non-Avro sequence file with a Hadoop
 * mapper emitting Avro output, then checks the resulting
 * {@code part-00000.avro} file.
 *
 * @throws Exception if running the job or checking the output fails
 */
@Test
public void testNonAvroMapOnly() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);

  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroOnlyMapper.class);

  // configure output for avro
  job.setNumReduceTasks(0);                     // map-only
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>
      (new File(outputPath.toString() + "/part-00000.avro"),
       new SpecificDatumReader<>()));
}

/**
 * Runs a job that reads Avro data from a sequence file with the default
 * mapper, reduces it with a Hadoop reducer that consumes Avro input, writes
 * a non-Avro sequence file, and then checks the resulting {@code part-00000}
 * file.
 *
 * @throws Exception if running the job or checking the output fails
 */
@Test
public void testNonAvroReducer() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);

  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  AvroJob.setInputSchema(job, SCHEMA);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // mapper is default, identity

  // use a hadoop reducer that consumes Avro input
  AvroJob.setMapOutputSchema(job, SCHEMA);
  job.setReducerClass(NonAvroReducer.class);

  // configure outputPath for non-Avro SequenceFile
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  // output key/value classes are default, LongWritable/Text

  JobClient.runJob(job);

  checkFile(new SequenceFileReader<>
      (new File(outputPath.toString() + "/part-00000")));
}

/**
 * Runs a job that reads Avro data from a sequence file with the default
 * mapper and reducer, writes Avro output, and then checks the resulting
 * {@code part-00000.avro} file.
 *
 * @throws Exception if running the job or checking the output fails
 */
@Test
public void testSequenceFileInputFormat() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);

  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  AvroJob.setInputSchema(job, SCHEMA);

  // mapper is default, identity
  // reducer is default, identity

  // configure output for avro
  AvroJob.setOutputSchema(job, SCHEMA);
  FileOutputFormat.setOutputPath(job, outputPath);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>
      (new File(outputPath.toString() + "/part-00000.avro"),
       new SpecificDatumReader<>()));
}

 /**
  * Runs a map-only job that reads text from the {@code in} directory under
  * DIR, maps it with AvroTestConverter and writes Avro output to the
  * {@code out} directory under DIR.
  *
  * @throws Exception if setting up or running the job fails
  */
 @Test
 public void testJob() throws Exception {
   JobConf job = new JobConf();
   Path outputPath = new Path(DIR.getRoot().getPath() + "/out");
   // Remove output left over from a previous run (explicitly recursive; the
   // single-argument delete is deprecated).
   outputPath.getFileSystem(job).delete(outputPath, true);

   job.setInputFormat(TextInputFormat.class);
   FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in");

   job.setMapperClass(AvroTestConverter.class);
   job.setNumReduceTasks(0);

   FileOutputFormat.setOutputPath(job, outputPath);
   System.out.println(createSchema());
   // Output records are pairs of a long key and the schema from createSchema().
   AvroJob.setOutputSchema(job,
               Pair.getPairSchema(Schema.create(Schema.Type.LONG),
                         createSchema()));
   job.setOutputFormat(AvroOutputFormat.class);

   JobClient.runJob(job);
 }
}

/**
 * Uses default mapper with no reduces for a map-only identity job.
 *
 * <p>After the job has run, the records of the output file are compared, in
 * order, with the records of the input file.
 *
 * @throws Exception if running the job or reading either file fails
 */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
  JobConf job = new JobConf();
  String inDir = System.getProperty("share.dir","../../../share")+"/test/data";
  Path input = new Path(inDir+"/weather.avro");
  Path output = new Path("target/test/weather-ident");

  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  output.getFileSystem(job).delete(output, true);

  job.setJobName("identity map weather");
  AvroJob.setInputSchema(job, Weather.SCHEMA$);
  AvroJob.setOutputSchema(job, Weather.SCHEMA$);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setCompressOutput(job, true);
  job.setNumReduceTasks(0);                     // map-only

  JobClient.runJob(job);

  // check output is correct
  DatumReader<Weather> reader = new SpecificDatumReader<>();
  // try-with-resources closes both readers even when an assertion fails.
  try (DataFileReader<Weather> sorted = new DataFileReader<>
           (new File(output.toString() + "/part-00000.avro"), reader);
       DataFileReader<Weather> check = new DataFileReader<>
           (new File(inDir + "/weather.avro"), reader)) {
    for (Weather w : sorted) {
      assertEquals(check.next(), w);
    }
  }
}

/**
 * Runs the "AvroMultipleOutputs_noreducer" job: a map-only job over a lines
 * file written into INPUT_DIR, with one extra named output, {@code myavro2}.
 *
 * @throws Exception if preparing the input or running the job fails
 */
@SuppressWarnings("deprecation")
public void testJobNoreducer() throws Exception {
  JobConf job = new JobConf();
  job.setNumReduceTasks(0);

  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);
  WordCountUtil.writeLinesFile(new File(INPUT_DIR.getRoot(),"lines.avro"));

  job.setJobName("AvroMultipleOutputs_noreducer");
  AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
  AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());
  AvroJob.setMapperClass(job, MapImpl.class);

  FileInputFormat.setInputPaths(job, new Path(INPUT_DIR.getRoot().toString()));
  FileOutputFormat.setOutputPath(job, outputPath);
  FileOutputFormat.setCompressOutput(job, false);
  AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));

  JobClient.runJob(job);
}

/**
 * Runs a word-count job whose output format is AvroTrevniOutputFormat and
 * validates the resulting counts file through WordCountUtil.
 *
 * @throws Exception if preparing the input, running the job or validating
 *         the output fails
 */
public void testOutputFormat() throws Exception {
  final JobConf conf = new JobConf();

  // Write the input lines before configuring the job.
  final WordCountUtil util = new WordCountUtil("trevniMapredTest");
  util.writeLinesFile();

  // Schemas and task classes.
  AvroJob.setInputSchema(conf, STRING);
  AvroJob.setOutputSchema(conf, Pair.getPairSchema(STRING,LONG));
  AvroJob.setMapperClass(conf, MapImpl.class);
  AvroJob.setCombinerClass(conf, ReduceImpl.class);
  AvroJob.setReducerClass(conf, ReduceImpl.class);

  // Input and output live under the utility's working directory.
  final Path source = new Path(util.getDir().toString() + "/in");
  FileInputFormat.setInputPaths(conf, source);
  final Path target = new Path(util.getDir().toString() + "/out");
  FileOutputFormat.setOutputPath(conf, target);
  FileOutputFormat.setCompressOutput(conf, true);

  conf.setOutputFormat(AvroTrevniOutputFormat.class);

  JobClient.runJob(conf);

  util.validateCountsFile();
}

/**
 * Runs the "wordcount" job over a lines file written into INPUT_DIR and
 * validates the counts in {@code part-00000.avro} under the output path.
 *
 * @param pathOut output directory for the job; deleted before the job runs
 * @throws Exception if preparing the input, running the job or validating
 *         the output fails
 */
@SuppressWarnings("deprecation")
public void testJob(String pathOut) throws Exception {
  JobConf job = new JobConf();
  String pathIn = INPUT_DIR.getRoot().getPath();
  WordCountUtil.writeLinesFile(pathIn + "/lines.avro");

  Path outputPath = new Path(pathOut);
  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);

  job.setJobName("wordcount");
  AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
  AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());
  AvroJob.setMapperClass(job, MapImpl.class);
  AvroJob.setCombinerClass(job, ReduceImpl.class);
  AvroJob.setReducerClass(job, ReduceImpl.class);

  FileInputFormat.setInputPaths(job, new Path(pathIn));
  // Reuse the Path built above instead of constructing a second one.
  FileOutputFormat.setOutputPath(job, outputPath);
  FileOutputFormat.setCompressOutput(job, true);
  WordCountUtil.setMeta(job);

  JobClient.runJob(job);

  WordCountUtil.validateCountsFile(new File(pathOut, "part-00000.avro"));
}

/**
 * Runs the "reflect" job with reflection-based schemas over a lines file
 * written to {@code target/testReflectJob/in}, then validates the counts in
 * {@code target/testReflectJob/out/part-00000.avro}.
 *
 * @throws Exception if preparing the input, running the job or validating
 *         the output fails
 */
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
  JobConf job = new JobConf();
  String dir = "target/testReflectJob";
  Path inputPath = new Path(dir + "/in");
  Path outputPath = new Path(dir + "/out");

  // Clear both directories from any previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);
  inputPath.getFileSystem(job).delete(inputPath, true);

  writeLinesFile(new File(dir+"/in"));

  job.setJobName("reflect");
  AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class));
  AvroJob.setMapOutputSchema
    (job, new Pair(new Text(""), new Count(0L)).getSchema());
  AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class));
  AvroJob.setMapperClass(job, MapImpl.class);
  //AvroJob.setCombinerClass(job, ReduceImpl.class);
  AvroJob.setReducerClass(job, ReduceImpl.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setReflect(job); // use reflection

  JobClient.runJob(job);

  validateCountsFile(new File(new File(dir, "out"), "part-00000.avro"));
}

@Test
@SuppressWarnings("deprecation")
public void testSort() throws Exception {
 JobConf job = new JobConf();
 String inDir = "../../../share/test/data";
 Path input = new Path(inDir+"/weather.avro");
 Path output = new Path("target/test/weather-sort");
 AvroJob.setReducerClass(job, SortReducer.class);
 FileInputFormat.setInputPaths(job, input);
 FileOutputFormat.setOutputPath(job, output);
 FileOutputFormat.setCompressOutput(job, true);
 AvroJob.setOutputCodec(job, SNAPPY_CODEC);

/**
 * Runs the "AvroMultipleOutputs" job over a lines file written into
 * INPUT_DIR, with three named outputs ({@code myavro}, {@code myavro1},
 * {@code myavro2}), and validates the counts in {@code part-00000.avro}
 * under the output path.
 *
 * @param pathOut output directory for the job; deleted before the job runs
 * @throws Exception if preparing the input, running the job or validating
 *         the output fails
 */
@SuppressWarnings("deprecation")
public void testJob(String pathOut) throws Exception {
  JobConf job = new JobConf();
  String pathIn = INPUT_DIR.getRoot().getPath();
  File fileIn = new File(pathIn, "lines.avro");

  Path outputPath = new Path(pathOut);
  // Remove output left over from a previous run (explicitly recursive; the
  // single-argument delete is deprecated).
  outputPath.getFileSystem(job).delete(outputPath, true);
  WordCountUtil.writeLinesFile(fileIn);

  // The same pair schema serves as the job output schema and as the schema
  // of the "myavro" named output, so it is built once.
  Schema pairSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();

  job.setJobName("AvroMultipleOutputs");
  AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
  AvroJob.setOutputSchema(job, pairSchema);
  AvroJob.setMapperClass(job, MapImpl.class);
  AvroJob.setReducerClass(job, ReduceImpl.class);

  FileInputFormat.setInputPaths(job, pathIn);
  FileOutputFormat.setOutputPath(job, outputPath);
  FileOutputFormat.setCompressOutput(job, false);
  AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroOutputFormat.class, pairSchema);
  AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));
  AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));
  WordCountUtil.setMeta(job);

  JobClient.runJob(job);

  WordCountUtil.validateCountsFile(new File(outputPath.toString(), "/part-00000.avro"));
}

/**
 * Builds the job configuration for "ConCmptBlock_pass1".
 *
 * <p>Reads from {@code edge_path} and {@code curbm_path} and writes to
 * {@code tempbm_path}.
 *
 * @return the configured job
 * @throws Exception declared for callers; see the calls in the body
 */
protected JobConf configStage1() throws Exception {
  final JobConf job = new JobConf(getConf(), ConCmptBlock.class);
  job.setJobName("ConCmptBlock_pass1");

  // Parameters stored as strings in the job configuration.
  job.set("block_width", "" + block_width);
  job.set("recursive_diagmult", "" + recursive_diagmult);

  // Task classes and parallelism.
  job.setMapperClass(MapStage1.class);
  job.setReducerClass(RedStage1.class);
  job.setNumReduceTasks(nreducers);

  // Output record types.
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  // Input and output locations.
  FileInputFormat.setInputPaths(job, edge_path, curbm_path);
  FileOutputFormat.setOutputPath(job, tempbm_path);

  return job;
}

/**
 * Builds the job configuration for "ConCmptBlock_pass2".
 *
 * <p>Reads from {@code tempbm_path} and writes to {@code nextbm_path}.
 *
 * @return the configured job
 * @throws Exception declared for callers; see the calls in the body
 */
protected JobConf configStage2 () throws Exception {
  final JobConf job = new JobConf(getConf(), ConCmptBlock.class);
  job.setJobName("ConCmptBlock_pass2");

  // Parameter stored as a string in the job configuration.
  job.set("block_width", "" + block_width);

  // Task classes and parallelism.
  job.setMapperClass(MapStage2.class);
  job.setReducerClass(RedStage2.class);
  job.setNumReduceTasks(nreducers);

  // Output record types.
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  // Input and output locations.
  FileInputFormat.setInputPaths(job, tempbm_path);
  FileOutputFormat.setOutputPath(job, nextbm_path);

  return job;
}

  /**
   * Builds the job configuration for "PagerankPrep_Stage1".
   *
   * <p>Reads from {@code edge_path} and writes to {@code output_path}.
   *
   * @return the configured job
   * @throws Exception declared for callers; see the calls in the body
   */
  protected JobConf configStage1() throws Exception {
    final JobConf job = new JobConf(getConf(), PagerankPrep.class);
    job.setJobName("PagerankPrep_Stage1");

    // Parameter stored as a string in the job configuration.
    job.set("make_symmetric", "" + make_symmetric);

    // Task classes and parallelism.
    job.setMapperClass(MapStage1.class);
    job.setReducerClass(RedStage1.class);
    job.setNumReduceTasks(nreducers);

    // Output record types.
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    // Input and output locations.
    FileInputFormat.setInputPaths(job, edge_path);
    FileOutputFormat.setOutputPath(job, output_path);

    return job;
  }
}

/**
 * Builds the job configuration for "RWRBlock_Stage1".
 *
 * <p>Reads from {@code edge_path} and {@code vector_path} and writes to
 * {@code tempmv_path}, which is deleted recursively as part of this call.
 *
 * @return the configured job
 * @throws Exception if deleting {@code tempmv_path} or configuring the job fails
 */
protected JobConf configStage1 () throws Exception {
  final JobConf job = new JobConf(getConf(), RWRBlock.class);

  // Parameters stored as strings in the job configuration.
  job.set("number_nodes", "" + number_nodes);
  job.set("mixing_c", "" + mixing_c);
  job.set("block_width", "" + block_width);

  job.setJobName("RWRBlock_Stage1");

  job.setMapperClass(MapStage1.class);
  job.setReducerClass(RedStage1.class);

  // Clear the output location before registering it with the job.
  fs.delete(tempmv_path, true);

  FileInputFormat.setInputPaths(job, edge_path, vector_path);
  FileOutputFormat.setOutputPath(job, tempmv_path);

  job.setNumReduceTasks(nreducers);

  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  return job;
}

Javadoc

Sets the given comma separated paths as the list of inputs for the map-reduce job.

Popular methods of FileInputFormat

getInputPaths
Get the list of input Paths for the map-reduce job.
addInputPath
Add a Path to the list of inputs for the map-reduce job.
addInputPaths
Add the given comma separated paths to the list of inputs for the map-reduce job.
listStatus
List input directories. Subclasses may override to, e.g., select only files matching a regular expression.
getSplits
Splits files returned by #listStatus(JobConf) when they're too big.
getRecordReader
isSplitable
Is the given filename splitable? Usually, true, but if the file is stream compressed, it will not be
computeSplitSize
getBlockIndex
getInputPathFilter
Get a PathFilter instance of the filter set for the input paths.
getPathStrings
addInputPathRecursively
Add files in the input path recursively into the results.

Popular in Java

Making http post requests using okhttp
setContentView (Activity)
findViewById (Activity)
onRequestPermissionsResult (Fragment)
FileReader (java.io)
A specialized Reader that reads from a file in the file system. All read requests made by calling me
ByteBuffer (java.nio)
A buffer for bytes. A byte buffer can be created in either one of the following ways: * #allocate
Callable (java.util.concurrent)
A task that returns a result and may throw an exception. Implementors define a single method with no
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
Table (org.hibernate.mapping)
A relational table
Option (scala)
Top 12 Jupyter Notebook extensions

How to use the setInputPaths method in org.apache.hadoop.mapred.FileInputFormat

Best Java code snippets using org.apache.hadoop.mapred.FileInputFormat.setInputPaths (Showing top 20 results out of 657)

Refine search

How to use
setInputPaths
method
in
org.apache.hadoop.mapred.FileInputFormat