protected int getMapInputSplitCount()
    throws ClassNotFoundException, JobException, IOException, InterruptedException {
  if (job == null) {
    throw new JobException("Job is null");
  }
  InputFormat<?, ?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
  return input.getSplits(job).size();
}
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
  final Configuration configuration = taskAttemptContext.getConfiguration();
  final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(
      configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class),
      configuration);
  if (!(inputFormat instanceof GraphFilterAware)
      && configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
    this.graphFilter = VertexProgramHelper.deserialize(
        ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
  this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
  this.recordReader.initialize(inputSplit, taskAttemptContext);
}
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  // ... (lines elided in this snippet; the arguments below are the tail of a call whose
  // opening line is missing, presumably a SequenceFile.Writer being created over the test file)
      job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      DataInputBuffer buf = new DataInputBuffer();
      FileInputFormat.setInputPaths(job, file);
      for (InputSplit split : bformat.getSplits(job)) {
        RecordReader<BytesWritable, BytesWritable> reader =
            bformat.createRecordReader(split, context);
        MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext =
            new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(
                job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
          while (reader.nextKeyValue()) {
            bkey = reader.getCurrentKey();
            bval = reader.getCurrentValue();
            tkey.set(Integer.toString(r.nextInt(), 36));
            // ... (snippet truncated)
JobConf job = new JobConf();
fs = FileSystem.getLocal(job);
Path rootDir = new Path(TEST_ROOT_DIR);
createInputFile(rootDir);
// The snippet drops the left-hand side of this call; the result is evidently the
// "input" InputFormat used on the next line, so the assignment is restored here.
InputFormat<?, ?> input =
    ReflectionUtils.newInstance(jContext.getInputFormatClass(), job);
List<InputSplit> splits = input.getSplits(jContext);
JobSplitWriter.createSplitFiles(new Path(TEST_ROOT_DIR), job,
    new Path(TEST_ROOT_DIR).getFileSystem(job), splits);
TaskSplitMetaInfo[] splitMetaInfo = // ... (snippet truncated)
/**
 * Create the needed objects for reading the splits of the filepath given as argument.
 * This method should run before the scheduleSplits method.
 *
 * @param filepath
 */
@SuppressWarnings({ "deprecation", "unchecked" })
public void setJob(String filepath, String tag) {
  try {
    conf.set("start_tag", "<" + tag + ">");
    conf.set("end_tag", "</" + tag + ">");
    job = new Job(conf, "Read from HDFS");
    Path input = new Path(filepath);
    FileInputFormat.addInputPath(job, input);
    job.setInputFormatClass(XmlCollectionWithTagInputFormat.class);
    inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    splits = inputFormat.getSplits(job);
  } catch (IOException | ClassNotFoundException | InterruptedException e) {
    if (LOGGER.isLoggable(Level.SEVERE)) {
      LOGGER.severe(e.getMessage());
    }
  }
}
Path[] files = { new Path("file1"), new Path("file2") };
long[] lengths = { 1, 1 };
RecordReader rr = inputFormat.createRecordReader(split, context);
assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);
rr.initialize(split, context);
assertTrue(rr.nextKeyValue());
assertEquals("file1", rr.getCurrentValue().toString());
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path[] files = { new Path("file1") };
  long[] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
      rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
      rr.getCurrentKey().toString());
}
if (!UriUtil.isHDFSFile(location))
  continue;
Path path = new Path(location);
FileSystem fs = path.getFileSystem(conf);
if (fs.exists(path)) {
  LoadFunc loader = (LoadFunc) PigContext
      .instantiateFuncFromSpec(ld.getLFile().getFuncSpec());
  Job job = new Job(conf);
  loader.setUDFContextSignature(ld.getSignature());
  loader.setLocation(location, job);
  InputFormat inf = loader.getInputFormat();
  List<InputSplit> splits = inf.getSplits(HadoopShims.cloneJobContext(job));
  List<List<InputSplit>> results = MapRedUtil
      .getCombinePigSplits(splits, // ... (snippet truncated)
@SuppressWarnings("unchecked") private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = job.getConfiguration(); InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = input.getSplits(job); T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]); // sort the splits into order based on size, so that the biggest // go first Arrays.sort(array, new SplitComparator()); JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array); return array.length; }
public void testHDFSReadWriteOperators() throws Exception {
  FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
  FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
  conf.setInputFormatClass(TextInputFormat.class);
  InputFormat inputFormat =
      ReflectionUtils.newInstance(conf.getInputFormatClass(), getConfiguration());
  List<InputSplit> splits = inputFormat.getSplits(conf);
  // ... (snippet truncated)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
    splitIndex.getStartOffset());
// ... (snippet elided; the next two lines are the tails of statements whose
// opening lines are missing from this excerpt)
    (inputFormat.createRecordReader(split, taskContext), reporter);
    mapContext);
input.initialize(split, mapperContext);
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
output.close(mapperContext);
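The excerpt above is heavily truncated. As a rough, self-contained sketch of the new-API pattern it illustrates (instantiate the configured InputFormat reflectively, enumerate its splits, then create, initialize, and drain a RecordReader per split), something along these lines should work; SplitRecordCounter and countRecords are illustrative names, not part of any snippet on this page:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

public class SplitRecordCounter {

  /**
   * Instantiates the job's InputFormat via reflection, asks it for splits,
   * and reads every record of every split through the mapreduce (new) API.
   * The Job is assumed to be fully configured (input paths, input format class).
   */
  public static long countRecords(Job job)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> format =
        ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    long records = 0;
    for (InputSplit split : format.getSplits(job)) {
      // A throwaway attempt context is enough for reading outside a real task.
      TaskAttemptContext context =
          new TaskAttemptContextImpl(conf, new TaskAttemptID());
      RecordReader<?, ?> reader = format.createRecordReader(split, context);
      reader.initialize(split, context);
      try {
        while (reader.nextKeyValue()) {
          records++;
        }
      } finally {
        reader.close();
      }
    }
    return records;
  }
}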
@SuppressWarnings("unchecked") public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); InputFormat<K, V> indirIF = (InputFormat)ReflectionUtils.newInstance( conf.getClass(INDIRECT_INPUT_FORMAT, SequenceFileInputFormat.class), conf); IndirectSplit is = ((IndirectSplit)split); return indirIF.createRecordReader(new FileSplit(is.getPath(), 0, is.getLength(), (String[])null), context); } }
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
  long counter = 0;
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  // ... (snippet elided; the arguments below are the tail of the statement that
  // evidently builds the samplingContext used on the next two lines)
      job.getConfiguration(), new TaskAttemptID());
  RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
  reader.initialize(splits.get(i), samplingContext);
  while (reader.nextKeyValue()) {
    if (r.nextDouble() <= freq) {
      if (samples.size() < numSamples) {
        LOG.info(String.format("Fill: Collected %d samples from %d splits", counter, i));
        counter++;
        samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
      } else {
        samples.set(ind, ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
        if (counter % 1000 == 0)
          // ... (snippet truncated)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.File;

Configuration conf = new Configuration(false);
conf.set("fs.default.name", "file:///");

File testFile = new File("path/to/file");
Path path = new Path(testFile.getAbsoluteFile().toURI());
FileSplit split = new FileSplit(path, 0, testFile.length(), null);

InputFormat inputFormat = ReflectionUtils.newInstance(MyInputFormat.class, conf);
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
RecordReader reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
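Once the reader is initialized, the split's records can be pulled with the standard cursor idiom; this continuation is a sketch (not part of the original example) and simply prints whatever key/value types MyInputFormat happens to produce:

// Drain the split through the new-API cursor methods.
while (reader.nextKeyValue()) {
  System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
}
reader.close();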
InputFormat<?, ?> format =
    ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());
List<InputSplit> splits = format.getSplits(ctx);
// ... (snippet elided)
FileSplit s = (FileSplit) nativeSplit;
res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = Lists.newArrayList();
  Configuration base = job.getConfiguration();
  Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);

  // First, build a map of InputFormats to Paths
  for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
    FormatBundle inputBundle = entry.getKey();
    Configuration conf = new Configuration(base);
    inputBundle.configure(conf);
    Job jobCopy = new Job(conf);
    InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(
        inputBundle.getFormatClass(), jobCopy.getConfiguration());
    for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
      Integer nodeIndex = nodeEntry.getKey();
      List<Path> paths = nodeEntry.getValue();
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new CrunchInputSplit(pathSplit, inputBundle.getFormatClass(), nodeIndex,
            jobCopy.getConfiguration()));
      }
    }
  }
  return splits;
}
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, List<String>> formatMap = PangoolMultipleInputs.getInputFormatMap(job); Map<Path, List<String>> mapperMap = PangoolMultipleInputs.getInputProcessorFileMap(job); for (Map.Entry<Path, List<String>> entry : formatMap.entrySet()) { for (int inputId = 0; inputId < entry.getValue().size(); inputId++) { FileInputFormat.setInputPaths(jobCopy, entry.getKey()); InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, entry.getValue().get( inputId), true); PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), entry.getValue().get(inputId), inputId); List<InputSplit> pathSplits = inputFormat.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, entry.getValue().get(inputId), mapperMap.get(entry.getKey()) .get(inputId), inputId)); } } } return splits; }
@Override
public RecordReader<NullWritable, Variant> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  if (inputFormat == null) {
    init(context.getConfiguration());
  }
  RecordReader<NullWritable, VariantDBWritable> recordReader =
      inputFormat.createRecordReader(split, context);
  return new RecordReaderTransform<>(recordReader, VariantDBWritable::getVariant);
}
private static List<Text> readSplit(InputFormat<LongWritable, Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable, Text, LongWritable, Text> mcontext =
      new MapContextImpl<LongWritable, Text, LongWritable, Text>(conf,
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
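A typical call site for a helper like readSplit, sketched under the assumption that the job's input paths are already configured and that a plain TextInputFormat is in play (neither appears in the snippet itself):

// Collect every line of every split of the job's input.
InputFormat<LongWritable, Text> format = new TextInputFormat();
List<Text> lines = new ArrayList<Text>();
for (InputSplit split : format.getSplits(job)) {
  lines.addAll(readSplit(format, split, job));
}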
public CrunchRecordReader(InputSplit inputSplit, final TaskAttemptContext context)
    throws IOException, InterruptedException {
  CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(crunchSplit.getInputFormatClass(), crunchSplit.getConf());
  this.delegate = inputFormat.createRecordReader(
      crunchSplit.getInputSplit(),
      TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
}