@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  return sourceInputFormat.getSplits(job, numSplits);
}
@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
  org.apache.hadoop.mapred.InputSplit[] splitArray = mapredInputFormat.getSplits(jobConf, minNumSplits);
  HadoopInputSplit[] hiSplit = new HadoopInputSplit[splitArray.length];
  for (int i = 0; i < splitArray.length; i++) {
    hiSplit[i] = new HadoopInputSplit(i, splitArray[i], jobConf);
  }
  return hiSplit;
}
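For context, a minimal standalone sketch of the mapred contract these wrappers rely on: configure a JobConf, point it at an input directory, and ask a TextInputFormat for splits. The input path is a placeholder, not taken from any of the snippets here.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsSketch {
  public static void main(String[] args) throws Exception {
    JobConf jobConf = new JobConf();
    // Placeholder input directory; any readable path works.
    FileInputFormat.setInputPaths(jobConf, new Path("/tmp/example-input"));

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf); // mapred input formats are configured explicitly

    // Request at least two splits; the format may return more or fewer.
    InputSplit[] splits = inputFormat.getSplits(jobConf, 2);
    for (InputSplit split : splits) {
      System.out.println(split + " length=" + split.getLength());
    }
  }
}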
private void generateWrappedSplits(InputFormat inputFormat, List<FetchInputFormatSplit> inputSplits,
    JobConf job) throws IOException {
  InputSplit[] splits = inputFormat.getSplits(job, 1);
  for (int i = 0; i < splits.length; i++) {
    inputSplits.add(new FetchInputFormatSplit(splits[i], inputFormat));
  }
}
private int addBHISplit(FileStatus status, InputFormat inputFormat, Class inputFormatClass,
    int numOrigSplits, JobConf newjob, ArrayList<InputSplit> result) throws IOException {
  LOG.info("block size: " + status.getBlockSize());
  LOG.info("file length: " + status.getLen());
  FileInputFormat.setInputPaths(newjob, status.getPath());
  InputSplit[] iss = inputFormat.getSplits(newjob, 0);
  if (iss != null && iss.length > 0) {
    numOrigSplits += iss.length;
    result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
  }
  return numOrigSplits;
}
public void buildInputStreamFileStatus() throws IOException {
  String xlearningInputs = envs.get(XLearningConstants.Environment.XLEARNING_INPUTS.toString());
  if (StringUtils.isBlank(xlearningInputs)) {
    LOG.info("Application has no inputs");
    return;
  }
  String[] inputPathTuple = StringUtils.split(xlearningInputs, "#");
  if (inputPathTuple.length < 2) {
    throw new RuntimeException("Error input path format " + xlearningInputs);
  }
  String inputPathRemote = inputPathTuple[0];
  if (!StringUtils.isBlank(inputPathRemote)) {
    JobConf jobConf = new JobConf(conf);
    jobConf.set(XLearningConstants.STREAM_INPUT_DIR, inputPathRemote);
    InputFormat inputFormat = ReflectionUtils.newInstance(
        conf.getClass(XLearningConfiguration.XLEARNING_INPUTF0RMAT_CLASS,
            XLearningConfiguration.DEFAULT_XLEARNING_INPUTF0RMAT_CLASS, InputFormat.class),
        jobConf);
    inputFileSplits = inputFormat.getSplits(jobConf, 1);
  } else {
    throw new RuntimeException("Error input path format " + xlearningInputs);
  }
}
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();

  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }

  return paths;
}
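A hypothetical call site for the helper above. The table location and the choice of TextInputFormat are illustrative; it assumes getHadoopConfiguration() can resolve the path.

// Hypothetical usage: list the files a TextInputFormat would read under a location.
TextInputFormat format = new TextInputFormat();
format.configure(new JobConf(getHadoopConfiguration()));
Set<Path> files = getPaths(format, new Path("/warehouse/example_db/example_table"));
for (Path file : files) {
  System.out.println(file);
}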
);
try {
  return Arrays.stream(fio.getSplits(conf, 1)).flatMap(
      (final org.apache.hadoop.mapred.InputSplit split) -> {
        try {
LOG.info("file length: " + status.getLen()); FileInputFormat.setInputPaths(newjob, status.getPath()); InputSplit[] iss = inputFormat.getSplits(newjob, 0); if (iss != null && iss.length > 0) { numOrigSplits += iss.length;
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
    InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
    TableDesc table, List<InputSplit> result) throws IOException {
  Utilities.copyTablePropertiesToConf(table, conf);

  if (tableScan != null) {
    pushFilters(conf, tableScan);
  }

  FileInputFormat.setInputPaths(conf, dirs.toArray(new Path[dirs.size()]));
  conf.setInputFormat(inputFormat.getClass());

  int headerCount = 0;
  int footerCount = 0;
  if (table != null) {
    headerCount = Utilities.getHeaderCount(table);
    footerCount = Utilities.getFooterCount(table, conf);
    if (headerCount != 0 || footerCount != 0) {
      // Input file has a header or footer, so it cannot be split.
      HiveConf.setLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, Long.MAX_VALUE);
    }
  }

  InputSplit[] iss = inputFormat.getSplits(conf, splits);
  for (InputSplit is : iss) {
    result.add(new HiveInputSplit(is, inputFormatClass.getName()));
  }
}
/**
 * @param jobConf Job configuration.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<HadoopInputSplit> splitJob(JobConf jobConf) throws IgniteCheckedException {
  try {
    InputFormat<?, ?> format = jobConf.getInputFormat();

    assert format != null;

    InputSplit[] splits = format.getSplits(jobConf, 0);

    Collection<HadoopInputSplit> res = new ArrayList<>(splits.length);

    for (int i = 0; i < splits.length; i++) {
      InputSplit nativeSplit = splits[i];

      if (nativeSplit instanceof FileSplit) {
        FileSplit s = (FileSplit) nativeSplit;

        res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
      }
      else {
        res.add(HadoopUtils.wrapSplit(i, nativeSplit, nativeSplit.getLocations()));
      }
    }

    return res;
  }
  catch (IOException e) {
    throw new IgniteCheckedException(e);
  }
}
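A sketch of how splitJob() might be invoked; the input path is a placeholder and TextInputFormat stands in for whatever source format the job actually uses.

// Hypothetical caller: configure a JobConf, then map native splits to Ignite wrappers.
JobConf jobConf = new JobConf();
jobConf.setInputFormat(TextInputFormat.class);
FileInputFormat.setInputPaths(jobConf, new Path("/data/example-input"));

Collection<HadoopInputSplit> mapped = splitJob(jobConf);
System.out.println("Mapped " + mapped.size() + " splits");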
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
  super.init(job);

  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();

  // for each dir, get the InputFormat, and do getSplits.
  PartitionDesc part;
  for (Path dir : dirs) {
    part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true);
    // create a new InputFormat instance if this is the first time to see this class
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

    FileInputFormat.setInputPaths(newjob, dir);
    newjob.setInputFormat(inputFormat.getClass());
    InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
    for (InputSplit is : iss) {
      result.add(new HiveInputSplit(is, inputFormatClass.getName()));
    }
  }
  return result.toArray(new HiveInputSplit[result.size()]);
}
InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
protected FetchInputFormatSplit[] getNextSplits() throws Exception {
  while (getNextPath()) {
    // not using FileInputFormat.setInputPaths() here because it forces a connection to the
    // default file system - which may or may not be online during pure metadata operations
    job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));

    // Fetch operator is not vectorized and as such turn vectorization flag off so that
    // non-vectorized record reader is created below.
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);

    Class<? extends InputFormat> formatter = currDesc.getInputFileFormatClass();
    Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
    InputFormat inputFormat = getInputFormatFromCache(formatter, job);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
    }
    if (work.getSplitSample() != null) {
      inputSplits = splitSampling(work.getSplitSample(), inputSplits);
    }
    if (inputSplits.length > 0) {
      return inputSplits;
    }
  }
  return null;
}
inputSplits = inputFormat.getSplits(localJc, 1);
actualSplitNum = inputSplits.length;
@Test
public void testEmptyFile() throws Exception {
  Properties properties = new Properties();
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.close(true);
  AbstractSerDe serde = new OrcSerde();
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  InputFormat<?,?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertTrue(0 == splits.length);
  assertEquals(null, serde.getSerDeStats());
}
job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
job.set(ValidTxnList.VALID_TXNS_KEY, validTxnList.writeToString());
InputSplit[] splits = inputFormat.getSplits(job, 1);
assertEquals(numSplitsExpected, splits.length);
@Test(expected = RuntimeException.class)
public void testSplitGenFailure() throws IOException {
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.write(new OrcSerde().serialize(null, null));
  writer.close(true);
  InputFormat<?,?> in = new OrcInputFormat();
  fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333));
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  try {
    in.getSplits(conf, 1);
  } catch (RuntimeException e) {
    assertEquals(true, e.getMessage().contains("Permission denied"));
    throw e;
  }
}
job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY));
InputSplit[] splits = inf.getSplits(job, buckets);
Assert.assertEquals(numExpectedFiles, splits.length);
org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
InputFormat<?,?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(0, splits.length);