return ugi.doAs((PrivilegedExceptionAction<List<LogicalInputSplit>>) () -> {
  final List<LogicalInputSplit> splits = new ArrayList<>();
  final JobConf job = new JobConf(hiveConf);
  HiveUtilities.addConfToJob(job, properties);
  HiveUtilities.verifyAndAddTransactionalProperties(job, sd);
  job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, hiveReadEntry.getTable()));
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);
  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    final InputFormat<?, ?> format = job.getInputFormat();
    InputSplit[] inputSplits = format.getSplits(job, 1);
    throws Exception {
  JobConf configuration = new JobConf(new Configuration(false));
  configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
  configuration.setBoolean(READ_ALL_COLUMNS, false);
  RecordReader<K, V> recordReader = inputFormat.getRecordReader(
      new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0, tempFile.getFile().length(), (String[]) null),
      configuration,
      NULL);
  K key = recordReader.createKey();
  V value = recordReader.createValue();
  while (recordReader.next(key, value)) {
    Object expectedValue = iterator.next();
localJc.set(FileInputFormat.INPUT_DIR,
    org.apache.hadoop.util.StringUtils.escapeString(parentDir.getAbsolutePath()));
inputSplits = inputFormat.getSplits(localJc, 1);
actualSplitNum = inputSplits.length;
rr = inputFormat.getRecordReader(inputSplits[currentSplitPointer], localJc, reporter);
currentSplitPointer++;
System.out.println("Files found: "); for (AcidUtils.ParsedDelta pd : current) { System.out.println(pd.getPath().toString()); JobConf job = new JobConf(); job.set("mapred.input.dir", partitionPath.toString()); job.set(BUCKET_COUNT, Integer.toString(buckets)); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg"); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string"); job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()); job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY)); InputSplit[] splits = inf.getSplits(job, buckets); Assert.assertEquals(numExpectedFiles, splits.length); org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL); NullWritable key = rr.createKey(); OrcStruct value = rr.createValue(); for (String record : records) { Assert.assertEquals(true, rr.next(key, value)); Assert.assertEquals(record, value.toString());
Path dir = new Path(tPart.getSd().getLocation());
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
FileSystem fs = dir.getFileSystem(conf);
FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
// ... (the statement constructing dummySplit was truncated; only its trailing argument survives)
//     new String[] { partn.getLocation() });
org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
    inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
StatsProvidingRecordReader statsRR;
if (recordReader instanceof StatsProvidingRecordReader) {
  statsAvailable = true;
  recordReader.close();
@SuppressWarnings("unchecked") // InputFormat instantiation static long readBench(JobConf conf) throws IOException { InputFormat inf = conf.getInputFormat(); final String fn = conf.get("test.filebench.name", ""); Path pin = new Path(FileInputFormat.getInputPaths(conf)[0], fn); FileStatus in = pin.getFileSystem(conf).getFileStatus(pin); RecordReader rr = inf.getRecordReader(new FileSplit(pin, 0, in.getLen(), (String[])null), conf, Reporter.NULL); try { Object key = rr.createKey(); Object val = rr.createValue(); Date start = new Date(); while (rr.next(key, val)); Date end = new Date(); return end.getTime() - start.getTime(); } finally { rr.close(); } }
OutputFormat<?, ?> outFormat = new OrcOutputFormat();
RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
writer.write(NullWritable.get(),
    /* ... value argument and the lines between writing and reading were truncated ... */

inspector = (StructObjectInspector) serde.getObjectInspector();
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);
ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
conf.set("columns", "z,r");
conf.set("columns.types", "int:struct<x:int,y:int>");
org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
Object key = reader.createKey();
Object value = reader.createValue();
int rowNum = 0;
List<? extends StructField> fields = inspector.getAllStructFieldRefs();
IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
while (reader.next(key, value)) {
  assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
  Object sub = inspector.getStructFieldData(value, fields.get(1));
TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());

targetJob.setInputFormat(TextInputFormat.class);
targetInputFormat.configure(targetJob);
FileInputFormat.setInputPaths(targetJob, targetPath);
InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

FileInputFormat.setInputPaths(jobConf, path);
InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
String sargStr;
createTestSarg(inspector, udf, childExpr);
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(5, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(0, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(2, splits.length);

splits = in.getSplits(conf, 1);
assertEquals(3, splits.length);

splits = in.getSplits(conf, 1);
assertEquals(4, splits.length);
JobConf jobConf = taskCtx0.jobConf();
InputFormat inFormat = jobConf.getInputFormat();
HadoopFileBlock block = (HadoopFileBlock) split;
nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), EMPTY_HOSTS);
// ... (the preceding statement was truncated; only its trailing arguments remain)
//     fileName(), taskCtx0.attemptId());
RecordReader reader = inFormat.getRecordReader(nativeSplit, jobConf, reporter);
Mapper mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf);
Object key = reader.createKey();
Object val = reader.createValue();
while (reader.next(key, val)) {
  if (isCancelled())
    throw new HadoopTaskCancelledException("Map task cancelled.");
// ... (assignment target elided)
    (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
MapWork mapWork = Utilities.getMapWork(jobConf);
List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
FileSystem fs = paths.get(0).getFileSystem(jobConf);
FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
if (fileStatuses.length == 0) {
  splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
  tezCounters = new TezCounters();
  groupName = HiveInputCounters.class.getName();
  vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
  counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
  tezCounters.findCounter(groupName, counterName).increment(splits.length);
  final String fileStr = path.toString();
  if (!files.contains(fileStr)) {
    files.add(fileStr);
FileInputFormat.setInputPaths(
    conf,
    new Path(JobHelper.getURIFromSegment(segment.getSegment()))
);
try {
  return Arrays.stream(fio.getSplits(conf, 1)).flatMap(
      (final org.apache.hadoop.mapred.InputSplit split) -> {
        try {
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");
  FileInputFormat.setInputPaths(job, dir);
  int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  InputSplit[] splits = format.getSplits(job, numSplits);
  for (int j = 0; j < splits.length; j++) {
    RecordReader<RecInt, RecBuffer> reader = format.getRecordReader(splits[j], job, Reporter.NULL);
    try {
      int count = 0;
      while (reader.next(key, value)) {
        assertFalse("Key in multiple partitions.", bits.get(key.getData()));
        bits.set(key.getData());
      reader.close();
JobConf conf = new JobConf();
FileInputFormat.addInputPath(conf, new Path(path));
InputSplit[] splits = informat.getSplits(conf, 10000);
assertTrue(splits.length > 3); // want to test that splitting is working, because the test files are really big
for (InputSplit split : splits) {
  RecordReader<Text, BytesWritable> rr = informat.getRecordReader(split, conf, Reporter.NULL);
  Text t = new Text();
  BytesWritable b = new BytesWritable();
  while (rr.next(t, b)) {
    results.put(t.toString(), new String(Utils.getBytes(b)));
  }
  rr.close();
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();
  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }
  return paths;
}
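A hedged usage sketch for getPaths above; the table location and the choice of TextInputFormat are illustrative assumptions, and getHadoopConfiguration() is the same helper the method itself relies on.

// Illustrative only: list the leaf data files under a hypothetical table location.
JobConf jobConf = new JobConf(getHadoopConfiguration());
InputFormat<?, ?> format = ReflectionUtils.newInstance(TextInputFormat.class, jobConf); // also configures the format
Set<Path> dataFiles = getPaths(format, new Path("/warehouse/db/table/part=1"));
for (Path dataFile : dataFiles) {
  System.out.println(dataFile);
}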
    ) throws IOException, InterruptedException, ClassNotFoundException {
  InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  // ... (assignment target elided; the reader returned here is presumably the rawIn used below)
  job.getInputFormat().getRecordReader(inputSplit, job, reporter);
  RecordReader<INKEY, INVALUE> in = isSkipping()
      ? new SkippingRecordReader<INKEY, INVALUE>(rawIn, umbilical, reporter)
      : new TrackedRecordReader<INKEY, INVALUE>(rawIn, reporter);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector collector = null;
  // ... (assignment target elided)
  ReflectionUtils.newInstance(job.getMapRunnerClass(), job);
} finally {
  in.close();       // close input
  collector.close();
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  JobConf confCopy = new JobConf(conf);
  List<InputSplit> splits = new ArrayList<>();
  Schema schema = schemaEntry.getKey();
  System.out.println(schema);
  InputFormat format = (InputFormat) ReflectionUtils.newInstance(AvroInputFormat.class, conf);
  List<Path> paths = schemaEntry.getValue();
  mapperClass = (Class<? extends AvroMapper>) conf.getMapperClass();
  FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
  InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
  for (InputSplit pathSplit : pathSplits) {
    splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(),
JobConf job = new JobConf(); job.set("mapred.input.dir", partitionLocation.toString()); job.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(table.getSd().getNumBuckets())); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg"); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string"); job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()); job.set(ValidTxnList.VALID_TXNS_KEY, validTxnList.writeToString()); InputSplit[] splits = inputFormat.getSplits(job, 1); assertEquals(numSplitsExpected, splits.length); for(InputSplit is : splits) { final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat .getRecordReader(is, job, Reporter.NULL);
throw new IOException("Acid table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY)); if (finalDirs.isEmpty() && dirsWithFileOriginals.isEmpty()) { if (!conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) { LOG.warn("No valid inputs found in " + dirs); } else if (validMmWriteIdList != null) { dirs.get(0).toString()), ZeroRowsInputFormat.class.getName())); conf.setInputFormat(inputFormat.getClass()); int headerCount = 0; int footerCount = 0; FileInputFormat.setInputPaths(conf, finalDirs.toArray(new Path[finalDirs.size()])); InputSplit[] iss = inputFormat.getSplits(conf, splits); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); InputSplit[] iss = inputFormat.getSplits(nonRecConf, splits); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); finalDirs.get(0).toString()), ZeroRowsInputFormat.class.getName()));
job.set("mapred.input.dir", org.apache.hadoop.util.StringUtils.escapeString(currPath .toString())); inputSplits = inputFormat.getSplits(job, 1); splitNum = 0; serde = tmp.getDeserializerClass().newInstance(); currRecReader.close(); currRecReader = null; currRecReader = inputFormat.getRecordReader(inputSplits[splitNum++], job, Reporter.NULL); key = currRecReader.createKey(); value = currRecReader.createValue(); return currRecReader;