org.apache.avro.mapred.AvroJob.setOutputSchema java code examples

 /**
  * Builds the record writer for the named output selected in the job configuration.
  *
  * <p>The named output is read from {@code CONFIG_NAMED_OUTPUT}. If a schema string is stored
  * under {@code MO_PREFIX + nameOutput + ".schema"}, it is parsed and applied to a copy of the
  * job configuration: as the map output schema when the job has zero reduce tasks, otherwise
  * as the output schema. The writer is then obtained from the named output's own output format.
  *
  * @param fs file system handed through to the delegate output format
  * @param job job configuration; it is copied, not modified
  * @param baseFileName base name from which the unique output file name is derived
  * @param arg3 progress reporter handed through to the delegate output format
  * @return the record writer created by the named output's output format
  * @throws IOException if the delegate output format fails to create the writer
  */
 @SuppressWarnings({"unchecked", "deprecation"})
 public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException {
  String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null);
  String fileName = getUniqueName(job, baseFileName);
  String schemastr = job.get(MO_PREFIX + nameOutput + ".schema", null);
  // Schema.Parser is the supported replacement for the deprecated static Schema.parse(String).
  Schema schema = (schemastr == null) ? null : new Schema.Parser().parse(schemastr);
  // Work on a copy so the per-output settings never leak into the caller's configuration.
  JobConf outputConf = new JobConf(job);
  outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput));
  if (schema != null) {
   boolean isMapOnly = job.getNumReduceTasks() == 0;
   if (isMapOnly) {
    AvroJob.setMapOutputSchema(outputConf, schema);
   } else {
    AvroJob.setOutputSchema(outputConf, schema);
   }
  }
  OutputFormat outputFormat = outputConf.getOutputFormat();
  return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3);
 }
}

/** Runs a job whose mapper is a plain Hadoop mapper and checks the Avro part file it produces. */
@Test
public void testNonAvroMapper() throws Exception {
 final JobConf conf = new JobConf();
 final Path outDir = new Path(OUTPUT_DIR.getRoot().getPath());
 outDir.getFileSystem(conf).delete(outDir);

 // input side: a non-Avro sequence file
 conf.setInputFormat(SequenceFileInputFormat.class);
 FileInputFormat.setInputPaths(conf, file().toURI().toString());

 // a hadoop mapper that emits Avro output; the reducer is left as the default (identity)
 conf.setMapperClass(NonAvroMapper.class);

 // output side: Avro
 FileOutputFormat.setOutputPath(conf, outDir);
 AvroJob.setOutputSchema(conf, SCHEMA);

 JobClient.runJob(conf);

 final File partFile = new File(outDir.toString() + "/part-00000.avro");
 checkFile(new DataFileReader<>(partFile, new SpecificDatumReader<>()));
}

/** Runs a job that reads Avro data from a sequence file and checks the Avro part file it produces. */
@Test
public void testSequenceFileInputFormat() throws Exception {
 final JobConf conf = new JobConf();
 final Path target = new Path(OUTPUT_DIR.getRoot().getPath());
 target.getFileSystem(conf).delete(target);

 // input side: Avro read from a sequence file
 AvroJob.setInputSequenceFile(conf);
 FileInputFormat.setInputPaths(conf, file().toURI().toString());
 AvroJob.setInputSchema(conf, SCHEMA);

 // mapper and reducer are both left as the defaults (identity)

 // output side: Avro
 AvroJob.setOutputSchema(conf, SCHEMA);
 FileOutputFormat.setOutputPath(conf, target);

 JobClient.runJob(conf);

 final String partPath = target.toString() + "/part-00000.avro";
 checkFile(new DataFileReader<>(new File(partPath), new SpecificDatumReader<>()));
}

/** Runs a map-only job with a plain Hadoop mapper and checks the Avro part file it produces. */
@Test
public void testNonAvroMapOnly() throws Exception {
 final JobConf conf = new JobConf();
 final Path resultDir = new Path(OUTPUT_DIR.getRoot().getPath());
 resultDir.getFileSystem(conf).delete(resultDir);

 // input side: a non-Avro sequence file
 conf.setInputFormat(SequenceFileInputFormat.class);
 FileInputFormat.setInputPaths(conf, file().toURI().toString());

 // a hadoop mapper that emits Avro output
 conf.setMapperClass(NonAvroOnlyMapper.class);

 // output side: Avro, with zero reduce tasks (map-only)
 conf.setNumReduceTasks(0);
 FileOutputFormat.setOutputPath(conf, resultDir);
 AvroJob.setOutputSchema(conf, SCHEMA);

 JobClient.runJob(conf);

 final File firstPart = new File(resultDir.toString() + "/part-00000.avro");
 checkFile(new DataFileReader<>(firstPart, new SpecificDatumReader<>()));
}

// Declare the job output as a Pair schema whose key and value are both Avro BYTES.
AvroJob.setOutputSchema(conf,
            Pair.getPairSchema(Schema.create(Schema.Type.BYTES),
                      Schema.create(Schema.Type.BYTES)));

/**
 * Uses default mapper with no reduces for a map-only identity job.
 *
 * <p>After the job runs, the first output part file is compared record by record against the
 * input file, and the input must have no records left once the output is exhausted.
 */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
 JobConf job = new JobConf();
 String inDir = System.getProperty("share.dir","../../../share")+"/test/data";
 Path input = new Path(inDir+"/weather.avro");
 Path output = new Path("target/test/weather-ident");
 output.getFileSystem(job).delete(output);
 job.setJobName("identity map weather");
 AvroJob.setInputSchema(job, Weather.SCHEMA$);
 AvroJob.setOutputSchema(job, Weather.SCHEMA$);
 FileInputFormat.setInputPaths(job, input);
 FileOutputFormat.setOutputPath(job, output);
 FileOutputFormat.setCompressOutput(job, true);
 job.setNumReduceTasks(0);                     // map-only
 JobClient.runJob(job);
 // check output is correct
 DatumReader<Weather> reader = new SpecificDatumReader<>();
 // try-with-resources closes both readers even when an assertion fails mid-comparison
 try (DataFileReader<Weather> check = new DataFileReader<>
       (new File(inDir + "/weather.avro"), reader);
      DataFileReader<Weather> sorted = new DataFileReader<>
       (new File(output.toString() + "/part-00000.avro"), reader)) {
  for (Weather w : sorted) {
   assertEquals(check.next(), w);
  }
  // without this check a truncated (or empty) output file would pass unnoticed
  if (check.hasNext()) {
   throw new AssertionError("job output contains fewer records than the input file");
  }
 }
}

 /**
  * Runs a map-only job that reads text input through AvroTestConverter and writes
  * Pair(LONG, createSchema()) records with AvroOutputFormat.
  */
 @Test
 public void testJob() throws Exception {
  final JobConf conf = new JobConf();
  final Path outDir = new Path(DIR.getRoot().getPath() + "/out");
  outDir.getFileSystem(conf).delete(outDir);

  // input side: plain text
  conf.setInputFormat(TextInputFormat.class);
  final String inLocation = DIR.getRoot().getPath() + "/in";
  FileInputFormat.setInputPaths(conf, inLocation);

  // mapper only, no reduce phase
  conf.setMapperClass(AvroTestConverter.class);
  conf.setNumReduceTasks(0);

  FileOutputFormat.setOutputPath(conf, outDir);
  System.out.println(createSchema());
  final Schema pairSchema =
      Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema());
  AvroJob.setOutputSchema(conf, pairSchema);
  conf.setOutputFormat(AvroOutputFormat.class);

  JobClient.runJob(conf);
 }
}

/** Runs the AvroMultipleOutputs job with zero reduce tasks and a single named output. */
@SuppressWarnings("deprecation")
public void testJobNoreducer() throws Exception {
 final JobConf conf = new JobConf();
 conf.setNumReduceTasks(0);

 final Path outDir = new Path(OUTPUT_DIR.getRoot().getPath());
 outDir.getFileSystem(conf).delete(outDir);

 final File linesFile = new File(INPUT_DIR.getRoot(), "lines.avro");
 WordCountUtil.writeLinesFile(linesFile);

 conf.setJobName("AvroMultipleOutputs_noreducer");

 AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
 final Schema countPairSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
 AvroJob.setOutputSchema(conf, countPairSchema);
 AvroJob.setMapperClass(conf, MapImpl.class);

 final Path inDir = new Path(INPUT_DIR.getRoot().toString());
 FileInputFormat.setInputPaths(conf, inDir);
 FileOutputFormat.setOutputPath(conf, outDir);
 FileOutputFormat.setCompressOutput(conf, false);

 // extra named output carrying plain strings
 AvroMultipleOutputs.addNamedOutput(
     conf, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));

 JobClient.runJob(conf);
}

/**
 * Runs a job with MapImpl/ReduceImpl that writes through AvroTrevniOutputFormat, then asks
 * the WordCountUtil instance to validate the counts file.
 */
public void testOutputFormat() throws Exception {
 final JobConf conf = new JobConf();

 final WordCountUtil util = new WordCountUtil("trevniMapredTest");
 util.writeLinesFile();

 AvroJob.setInputSchema(conf, STRING);
 AvroJob.setOutputSchema(conf, Pair.getPairSchema(STRING,LONG));

 AvroJob.setMapperClass(conf, MapImpl.class);
 AvroJob.setCombinerClass(conf, ReduceImpl.class);
 AvroJob.setReducerClass(conf, ReduceImpl.class);

 final Path inDir = new Path(util.getDir().toString() + "/in");
 FileInputFormat.setInputPaths(conf, inDir);
 final Path outDir = new Path(util.getDir().toString() + "/out");
 FileOutputFormat.setOutputPath(conf, outDir);
 FileOutputFormat.setCompressOutput(conf, true);

 conf.setOutputFormat(AvroTrevniOutputFormat.class);

 JobClient.runJob(conf);

 util.validateCountsFile();
}

/**
 * Runs the "wordcount" job, writing to the given output location, and validates the first
 * part file via WordCountUtil.
 *
 * @param pathOut output directory for the job
 */
@SuppressWarnings("deprecation")
public void testJob(String pathOut) throws Exception {
 final JobConf conf = new JobConf();

 final String pathIn = INPUT_DIR.getRoot().getPath();
 WordCountUtil.writeLinesFile(pathIn + "/lines.avro");

 final Path staleOutput = new Path(pathOut);
 staleOutput.getFileSystem(conf).delete(staleOutput);

 conf.setJobName("wordcount");

 AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
 final Schema countPairSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
 AvroJob.setOutputSchema(conf, countPairSchema);

 AvroJob.setMapperClass(conf, MapImpl.class);
 AvroJob.setCombinerClass(conf, ReduceImpl.class);
 AvroJob.setReducerClass(conf, ReduceImpl.class);

 FileInputFormat.setInputPaths(conf, new Path(pathIn));
 FileOutputFormat.setOutputPath(conf, new Path(pathOut));
 FileOutputFormat.setCompressOutput(conf, true);

 WordCountUtil.setMeta(conf);

 JobClient.runJob(conf);

 final File countsFile = new File(pathOut, "part-00000.avro");
 WordCountUtil.validateCountsFile(countsFile);
}

/**
 * Runs the "multiple-inputs-join" job: two input paths, each with its own mapper and
 * reflect-derived schema, one reduce task, and a validation of the first output part file.
 */
@Test
public void testJob() throws Exception {
 final JobConf conf = new JobConf();
 final Path namesDir = new Path(INPUT_DIR_1.getRoot().getPath());
 final Path balancesDir = new Path(INPUT_DIR_2.getRoot().getPath());
 final Path outDir = new Path(OUTPUT_DIR.getRoot().getPath());
 outDir.getFileSystem(conf).delete(outDir);

 writeNamesFiles(new File(namesDir.toUri().getPath()));
 writeBalancesFiles(new File(balancesDir.toUri().getPath()));

 conf.setJobName("multiple-inputs-join");

 // each input path gets its own mapper and input schema
 final Schema namesSchema = ReflectData.get().getSchema(NamesRecord.class);
 AvroMultipleInputs.addInputPath(conf, namesDir, NamesMapImpl.class, namesSchema);
 final Schema balancesSchema = ReflectData.get().getSchema(BalancesRecord.class);
 AvroMultipleInputs.addInputPath(conf, balancesDir, BalancesMapImpl.class, balancesSchema);

 final Schema keySchema = ReflectData.get().getSchema(KeyRecord.class);
 final Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class);
 final Schema mapOutputSchema = Pair.getPairSchema(keySchema, valueSchema);
 AvroJob.setMapOutputSchema(conf, mapOutputSchema);

 final Schema completeSchema = ReflectData.get().getSchema(CompleteRecord.class);
 AvroJob.setOutputSchema(conf, completeSchema);

 AvroJob.setReducerClass(conf, ReduceImpl.class);
 conf.setNumReduceTasks(1);

 FileOutputFormat.setOutputPath(conf, outDir);
 AvroJob.setReflect(conf);

 JobClient.runJob(conf);

 final File completeFile = new File(OUTPUT_DIR.getRoot(), "part-00000.avro");
 validateCompleteFile(completeFile);
}

/**
 * Runs the "reflect" job under target/testReflectJob with reflection enabled and validates
 * the first output part file.
 */
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
 final JobConf conf = new JobConf();
 final String dir = "target/testReflectJob";
 final Path inDir = new Path(dir + "/in");
 final Path outDir = new Path(dir + "/out");

 // clear both directories before writing fresh input
 outDir.getFileSystem(conf).delete(outDir);
 inDir.getFileSystem(conf).delete(inDir);
 writeLinesFile(new File(dir+"/in"));

 conf.setJobName("reflect");

 AvroJob.setInputSchema(conf, ReflectData.get().getSchema(Text.class));
 AvroJob.setMapOutputSchema(conf, new Pair(new Text(""), new Count(0L)).getSchema());
 AvroJob.setOutputSchema(conf, ReflectData.get().getSchema(WordCount.class));

 AvroJob.setMapperClass(conf, MapImpl.class);
 //AvroJob.setCombinerClass(job, ReduceImpl.class);
 AvroJob.setReducerClass(conf, ReduceImpl.class);

 FileInputFormat.setInputPaths(conf, inDir);
 FileOutputFormat.setOutputPath(conf, outDir);

 AvroJob.setReflect(conf); // use reflection

 JobClient.runJob(conf);

 final File outFolder = new File(dir, "out");
 validateCountsFile(new File(outFolder, "part-00000.avro"));
}

// Map output is a Pair of (Weather record, NULL); the final job output is the bare Weather schema.
AvroJob.setMapOutputSchema
 (job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL)));
AvroJob.setOutputSchema(job, Weather.SCHEMA$);

/**
 * Runs the "AvroMultipleOutputs" job with three named outputs and validates the first
 * regular part file via WordCountUtil.
 *
 * @param pathOut output directory for the job
 */
@SuppressWarnings("deprecation")
public void testJob(String pathOut) throws Exception {
 final JobConf conf = new JobConf();
 final String pathIn = INPUT_DIR.getRoot().getPath();
 final File linesFile = new File(pathIn, "lines.avro");
 final Path outDir = new Path(pathOut);
 outDir.getFileSystem(conf).delete(outDir);
 WordCountUtil.writeLinesFile(linesFile);

 conf.setJobName("AvroMultipleOutputs");

 AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
 final Schema countPairSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
 AvroJob.setOutputSchema(conf, countPairSchema);

 AvroJob.setMapperClass(conf, MapImpl.class);
 AvroJob.setReducerClass(conf, ReduceImpl.class);

 FileInputFormat.setInputPaths(conf, pathIn);
 FileOutputFormat.setOutputPath(conf, outDir);
 FileOutputFormat.setCompressOutput(conf, false);

 // named outputs: one (Utf8, Long) pair output and two plain-string outputs
 AvroMultipleOutputs.addNamedOutput(
     conf, "myavro", AvroOutputFormat.class,
     new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());
 AvroMultipleOutputs.addNamedOutput(
     conf, "myavro1", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));
 AvroMultipleOutputs.addNamedOutput(
     conf, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING));

 WordCountUtil.setMeta(conf);

 JobClient.runJob(conf);

 final File countsFile = new File(outDir.toString(), "/part-00000.avro");
 WordCountUtil.validateCountsFile(countsFile);
}

 /**
  * Builds the record writer for the named output selected in the job configuration.
  *
  * <p>The named output is read from {@code CONFIG_NAMED_OUTPUT}. If a schema string is stored
  * under {@code MO_PREFIX + nameOutput + ".schema"}, it is parsed and applied to a copy of the
  * job configuration: as the map output schema when the job has zero reduce tasks, otherwise
  * as the output schema. The writer is then obtained from the named output's own output format.
  *
  * @param fs file system handed through to the delegate output format
  * @param job job configuration; it is copied, not modified
  * @param baseFileName base name from which the unique output file name is derived
  * @param arg3 progress reporter handed through to the delegate output format
  * @return the record writer created by the named output's output format
  * @throws IOException if the delegate output format fails to create the writer
  */
 @SuppressWarnings({"unchecked", "deprecation"})
 public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException {
  String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null);
  String fileName = getUniqueName(job, baseFileName);
  String schemastr = job.get(MO_PREFIX + nameOutput + ".schema", null);
  // Schema.Parser is the supported replacement for the deprecated static Schema.parse(String).
  Schema schema = (schemastr == null) ? null : new Schema.Parser().parse(schemastr);
  // Work on a copy so the per-output settings never leak into the caller's configuration.
  JobConf outputConf = new JobConf(job);
  outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput));
  if (schema != null) {
   boolean isMapOnly = job.getNumReduceTasks() == 0;
   if (isMapOnly) {
    AvroJob.setMapOutputSchema(outputConf, schema);
   } else {
    AvroJob.setOutputSchema(outputConf, schema);
   }
  }
  OutputFormat outputFormat = outputConf.getOutputFormat();
  return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3);
 }
}

/**
 * Builds a JobConf for a map-only job. Automatically loads the schema from each input file.
 *
 * <p>Starts from the no-argument {@code createJobConf()}, then sets the mapper, registers
 * {@code AvroReducer} as the reducer class, sets the output schema and finally sets the
 * number of reduce tasks to zero.
 *
 * @param mapperClass AvroMapper subclass implementing the map phase
 * @param outputSchema Schema of the mapper output
 * @return A configured JobConf.
 * @throws IOException declared by this method; not raised by any statement visible here other than the calls it delegates to
 * @throws URISyntaxException declared by this method; not raised by any statement visible here other than the calls it delegates to
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
               Schema outputSchema) throws IOException, URISyntaxException {
 final JobConf mapOnlyConf = createJobConf();

 AvroJob.setMapperClass(mapOnlyConf, mapperClass);
 AvroJob.setReducerClass(mapOnlyConf, AvroReducer.class);
 AvroJob.setOutputSchema(mapOnlyConf, outputSchema);

 // zero reduce tasks makes this a map-only job
 mapOnlyConf.setNumReduceTasks(0);

 return mapOnlyConf;
}

/**
 * Builds a JobConf for a map-reduce job. Loads the input schema from the input files.
 *
 * <p>Starts from the no-argument {@code createJobConf()}, then sets the mapper and reducer
 * classes followed by the map output schema and the output schema.
 *
 * @param mapperClass AvroMapper subclass for the mapper.
 * @param reducerClass AvroReducer subclass for the reducer.
 * @param mapperOutputSchema Mapper output schema. Must be an instance of org.apache.avro.mapred.Pair
 * @param outputSchema Reducer output schema
 * @return A configured JobConf.
 * @throws IOException declared by this method; not raised by any statement visible here other than the calls it delegates to
 * @throws URISyntaxException declared by this method; not raised by any statement visible here other than the calls it delegates to
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
               Class<? extends AvroReducer> reducerClass,
               Schema mapperOutputSchema,
               Schema outputSchema) throws IOException, URISyntaxException {
 final JobConf mapReduceConf = createJobConf();

 // processing classes
 AvroJob.setMapperClass(mapReduceConf, mapperClass);
 AvroJob.setReducerClass(mapReduceConf, reducerClass);

 // schemas: intermediate (map output) first, then final output
 AvroJob.setMapOutputSchema(mapReduceConf, mapperOutputSchema);
 AvroJob.setOutputSchema(mapReduceConf, outputSchema);

 return mapReduceConf;
}

/**
 * Configures and runs the job. {@code args[0]} is the input path and {@code args[1]} the
 * output path; the output path is deleted recursively before the job is configured.
 */
public static void main(String... args) throws Exception {
 final JobConf conf = new JobConf();
 conf.setJarByClass(SmallFilesMapReduce.class);

 final Path inputPath = new Path(args[0]);
 final Path outputPath = new Path(args[1]);
 outputPath.getFileSystem(conf).delete(outputPath, true);

 AvroJob.setInputSchema(conf,
   Stock.SCHEMA$);  //<co id="ch03_avro_mr_comment1"/>

 final Schema mapOutputSchema =
   Pair.getPairSchema(Stock.SCHEMA$, Schema.create(Schema.Type.NULL));
 AvroJob.setMapOutputSchema(conf, mapOutputSchema);

 AvroJob.setOutputSchema(conf, Stock.SCHEMA$);

 FileInputFormat.setInputPaths(conf, inputPath);
 FileOutputFormat.setOutputPath(conf, outputPath);

 AvroJob.setMapperClass(conf,
   Mapper.class);    //<co id="ch03_smallfilemr_comment2"/>
 AvroJob.setReducerClass(conf, Reducer.class);

 // compressed output using the SNAPPY_CODEC codec
 FileOutputFormat.setCompressOutput(conf, true);
 AvroJob.setOutputCodec(conf, SNAPPY_CODEC);

 JobClient.runJob(conf);
}

/**
 * Builds a JobConf for a map-only job with an explicitly set input Schema.
 *
 * <p>Starts from the no-argument {@code createJobConf()}, then sets the mapper, registers
 * {@code AvroReducer} as the reducer class, sets the input and output schemas and finally
 * sets the number of reduce tasks to zero.
 *
 * @param mapperClass AvroMapper subclass implementing the map phase
 * @param inputSchema Schema of the input data.
 * @param outputSchema Schema of the mapper output
 * @return A configured JobConf.
 * @throws IOException declared by this method; not raised by any statement visible here other than the calls it delegates to
 * @throws URISyntaxException declared by this method; not raised by any statement visible here other than the calls it delegates to
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
               Schema inputSchema,
               Schema outputSchema) throws IOException, URISyntaxException {
 final JobConf mapOnlyConf = createJobConf();

 // processing classes
 AvroJob.setMapperClass(mapOnlyConf, mapperClass);
 AvroJob.setReducerClass(mapOnlyConf, AvroReducer.class);

 // schemas
 AvroJob.setInputSchema(mapOnlyConf, inputSchema);
 AvroJob.setOutputSchema(mapOnlyConf, outputSchema);

 // zero reduce tasks makes this a map-only job
 mapOnlyConf.setNumReduceTasks(0);

 return mapOnlyConf;
}

/**
 * Runs a job with MapImpl/ReduceImpl that writes through AvroTrevniOutputFormat, then asks
 * WordCountUtil to validate the counts file.
 */
public void testOutputFormat() throws Exception {
 final JobConf conf = new JobConf();

 WordCountUtil.writeLinesFile();

 // schemas
 AvroJob.setInputSchema(conf, STRING);
 AvroJob.setOutputSchema(conf, Pair.getPairSchema(STRING,LONG));

 // processing classes; the reducer implementation doubles as the combiner
 AvroJob.setMapperClass(conf, MapImpl.class);
 AvroJob.setCombinerClass(conf, ReduceImpl.class);
 AvroJob.setReducerClass(conf, ReduceImpl.class);

 // locations and compression
 final Path inDir = new Path(DIR + "/in");
 FileInputFormat.setInputPaths(conf, inDir);
 final Path outDir = new Path(DIR + "/out");
 FileOutputFormat.setOutputPath(conf, outDir);
 FileOutputFormat.setCompressOutput(conf, true);

 conf.setOutputFormat(AvroTrevniOutputFormat.class);

 JobClient.runJob(conf);

 WordCountUtil.validateCountsFile();
}

Javadoc

Configure a job's output schema. Unless this is a map-only job, this must be a Pair schema.

Popular methods of AvroJob

getOutputSchema
Return a job's output key schema.
getMapOutputSchema
Return a job's map output key schema.
setInputSchema
Configure a job's map input schema.
setMapperClass
Configure a job's mapper implementation.
setReducerClass
Configure a job's reducer implementation.
getInputSchema
Return a job's map input schema.
setCombinerClass
Configure a job's combiner implementation.
setMapOutputSchema
Configure a job's map output schema. The map output schema defaults to the output schema and need only be specified when it differs.
setOutputCodec
Configure a job's output compression codec.
configureAvroInput
configureAvroJob
configureAvroOutput

Popular in Java

Updating database using SQL prepared statement
getContentResolver (Context)
compareTo (BigDecimal)
onCreateOptionsMenu (Activity)
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
IOException (java.io)
Signals a general, I/O-related error. Error details may be specified when calling the constructor, a
ArrayList (java.util)
ArrayList is an implementation of List, backed by an array. All optional operations including adding
Set (java.util)
A Set is a data structure which does not allow duplicate elements.
GridLayout (java.awt)
The GridLayout class is a layout manager that lays out a container's components in a rectangular gri
Kernel (java.awt.image)
Top PhpStorm plugins

How to use the setOutputSchema method in org.apache.avro.mapred.AvroJob

Best Java code snippets using org.apache.avro.mapred.AvroJob.setOutputSchema (Showing top 20 results out of 315)

How to use the setOutputSchema method in org.apache.avro.mapred.AvroJob