/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapReduce job.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();

    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(HadoopPopularWords.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input format
    // replaces that with a value calculated from the input split size options.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split job into tasks using 32MB split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}
@Override
public List<BoundedDataSource<Pair<K, V>>> split(long desiredSplitSizeBytes) {
    final Job job = newJob();

    // Never go below the configured minimum split size.
    final long splitSize = Math.max(MIN_SPLIT_SIZE, desiredSplitSizeBytes);

    LOG.info(String.format("%s's min and max input split size will be set to %,d.",
        FileInputFormat.class.getSimpleName(), splitSize));

    FileInputFormat.setMinInputSplitSize(job, splitSize);
    FileInputFormat.setMaxInputSplitSize(job, splitSize);

    return doSplit(job);
}
private int runSyncJob(Path source, Path destination, Path tmpDir, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(getConf(), "HDFS Sync job");
    job.setJarByClass(getClass());

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(HdfsSyncMapper.class);
    job.setReducerClass(HdfsSyncReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.getConfiguration().set(SRC_PATH_CONF, source.toString());
    job.getConfiguration().set(DST_PATH_CONF, destination.toString());
    job.getConfiguration().set(TMP_PATH_CONF, tmpDir.toString());

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    FileOutputFormat.setOutputPath(job, new Path(output.toString()));
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
private int runMetastoreCompareJobWithTextInput(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(this.getConf(),
        "Stage1: Metastore Compare Job with Input List");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage1ProcessTableMapperWithTextInput.class);
    job.setReducerClass(Stage1PartitionCompareReducer.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_METASTORE_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
private int runHdfsCopyJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
    LOG.info("Starting job for step 2...");

    Job job = Job.getInstance(this.getConf(), "Stage 2: HDFS Copy Job");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage2DirectoryCopyMapper.class);
    job.setReducerClass(Stage2DirectoryCopyReducer.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_COPY_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    if (success) {
        LOG.info("Job for step 2 finished successfully! To view logging data, run the following "
            + "commands in Hive: \n\n"
            + VelocityUtils.renderTemplate(STEP2_HQL_TEMPLATE, velocityContext) + "\n");
    }

    return success ? 0 : 1;
}
long inputSize = createInputDirectory(fileSys, input, pent, depth);

FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps));
private int runCommitChangeJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
    LOG.info("Starting job for step 3...");

    Job job = Job.getInstance(this.getConf(), "Stage3: Commit Change Job");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage3CommitChangeMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_METASTORE_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    if (success) {
        LOG.info("Job for step 3 finished successfully! To view logging data, run the following "
            + "commands in Hive: \n\n"
            + VelocityUtils.renderTemplate(STEP3_HQL_TEMPLATE, velocityContext));
    }

    return success ? 0 : 1;
}
protected List<InputSplit> computeSplits(long desiredBundleSizeBytes)
    throws IOException, IllegalAccessException, InstantiationException {
    Job job = jobInstance();

    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);

    return createFormat(job).getSplits(job);
}
private List<InputSplit> computeSplits(long desiredBundleSizeBytes,
    SerializableConfiguration serializableConfiguration)
    throws IOException, IllegalAccessException, InstantiationException {
    Job job = SerializableConfiguration.newJob(serializableConfiguration);

    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);

    return createFormat(job).getSplits(job);
}
@Override
public void initialize() {
    Preconditions.checkState(ReaderWriterState.NEW.equals(state),
        "A reader may not be opened more than once - current state:%s", state);

    try {
        FileInputFormat format = InputFormatUtil.newInputFormatInstance(descriptor);
        Job job = Hadoop.Job.newInstance.invoke(conf);

        FileInputFormat.addInputPath(job, path);

        // attempt to minimize the number of InputSplits
        FileStatus stat = fs.getFileStatus(path);
        FileInputFormat.setMaxInputSplitSize(job, stat.getLen());

        this.splits = format.getSplits(job).iterator();
        this.shouldAdvance = true;
        this.state = ReaderWriterState.OPEN;
    } catch (RuntimeException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetOperationException("Cannot calculate splits", e);
    } catch (IOException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetIOException("Cannot calculate splits", e);
    }
}
    random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;

FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(inFile).getLen() / numSplits);

TaskAttemptContext context = MapReduceTestUtil.
FileInputFormat.setMaxInputSplitSize(job, config.getMaxSplitSize());

inputFormatClass = inputFormatProvider.getInputFormatClassName();
Configuration hConf = job.getConfiguration();
int numSplits =
    random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;

FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(file).getLen() / numSplits);

for (InputSplit split : format.getSplits(job)) {
FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(path).getLen() / numSplits);

MultipleInputs.addInputPath(job, path, TextInputFormat.class,
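// A minimal, self-contained sketch (hypothetical class and method names, not taken from
// any of the projects above) of the pattern the snippets share: bound the input split
// size on a Job before a FileInputFormat computes its splits.
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitSizeSketch {
    /** Returns the splits for {@code input}, each capped at roughly {@code maxSplitBytes}. */
    public static List<InputSplit> splitsFor(Path input, long maxSplitBytes) throws IOException {
        Job job = Job.getInstance(new Configuration(), "split-size-sketch");
        FileInputFormat.setInputPaths(job, input);
        // FileInputFormat picks the effective split size as max(minSize, min(maxSize, blockSize)).
        FileInputFormat.setMinInputSplitSize(job, 1L);
        FileInputFormat.setMaxInputSplitSize(job, maxSplitBytes);
        return new TextInputFormat().getSplits(job);
    }
}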