@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    return inputFormat.getSplits(context);
}
protected int getMapInputSplitCount()
        throws ClassNotFoundException, JobException, IOException, InterruptedException {
    if (job == null) {
        throw new JobException("Job is null");
    }
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    return input.getSplits(job).size();
}
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }
    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    // 0 input bytes is possible when the segment range hits no partition on a partitioned Hive table (KYLIN-2470).
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes; something may be wrong");
    }
    return (double) mapInputBytes / 1024 / 1024;
}
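// Usage sketch (not from the source): the total map-input size is typically fed into
// capacity planning, e.g. picking a reducer count. estimateReducerCount and
// perReducerInputMB are hypothetical names, not Kylin API.
int estimateReducerCount(Job job, double perReducerInputMB)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    double totalMB = getTotalMapInputMB(job);
    // At least one reducer, then roughly one reducer per perReducerInputMB of input.
    return Math.max(1, (int) Math.ceil(totalMB / perReducerInputMB));
}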
@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    configuration.setInt("mapreduce.input.fileinputformat.split.minsize", minNumSplits);

    JobContext jobContext = new JobContextImpl(configuration, new JobID());
    // Propagate both the wrapper's credentials and the current user's credentials so
    // the underlying InputFormat can reach secured storage.
    jobContext.getCredentials().addAll(this.credentials);
    Credentials currentUserCreds = getCredentialsFromUGI(UserGroupInformation.getCurrentUser());
    if (currentUserCreds != null) {
        jobContext.getCredentials().addAll(currentUserCreds);
    }

    List<org.apache.hadoop.mapreduce.InputSplit> splits;
    try {
        splits = this.mapreduceInputFormat.getSplits(jobContext);
    } catch (InterruptedException e) {
        throw new IOException("Could not get Splits.", e);
    }

    // Wrap each mapreduce split so the surrounding framework can serialize and ship it.
    HadoopInputSplit[] hadoopInputSplits = new HadoopInputSplit[splits.size()];
    for (int i = 0; i < hadoopInputSplits.length; i++) {
        hadoopInputSplits[i] = new HadoopInputSplit(i, splits.get(i), jobContext);
    }
    return hadoopInputSplits;
}
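// Usage sketch (assumptions: a `wrapper` instance is already constructed with a Hadoop
// InputFormat and Configuration, and HadoopInputSplit exposes getSplitNumber(); both
// are assumptions, not confirmed API). Ask for at least 4 splits and inspect the result:
HadoopInputSplit[] splits = wrapper.createInputSplits(4);
System.out.println("created " + splits.length + " splits");
for (HadoopInputSplit split : splits) {
    System.out.println("split #" + split.getSplitNumber());
}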
List<InputSplit> splits = format.getSplits(ctx);
@Override
public List<InputSplit> getSplits(final JobContext jobContext) throws IOException, InterruptedException {
    final Configuration configuration = jobContext.getConfiguration();
    return ReflectionUtils
            .newInstance(configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class), configuration)
            .getSplits(jobContext);
}
protected double getTotalMapInputMB()
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }
    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    // Unlike the static variant above, which only logs a warning, this overload treats
    // zero input bytes as a hard error.
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    return (double) mapInputBytes / 1024 / 1024;
}
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
    long counter = 0;
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
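    // The excerpt above breaks off mid-method. What follows is a hedged completion
    // sketch, not the verbatim source: it assumes K/V type parameters, a numSamples
    // field, and Hadoop's TaskAttemptContextImpl, and mirrors the usual
    // split-sampling pattern of reading keys until numSamples are collected.
    for (int i = 0; i < splitsToSample && counter < numSamples; ++i) {
        TaskAttemptContext ctx = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), ctx);
        reader.initialize(splits.get(i), ctx);
        while (counter < numSamples && reader.nextKeyValue()) {
            // Copy the key out of the reader's reusable buffer before storing it.
            samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
            ++counter;
        }
        reader.close();
    }
    return samples.toArray();
}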
public HadoopElementIterator(final HadoopGraph graph) {
    try {
        this.graph = graph;
        final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
        final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
        if (inputFormat instanceof FileInputFormat) {
            final Storage storage = FileSystemStorage.open(configuration);
            if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
                return; // there is no input location and thus, no data (empty graph)
            if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
                return; // there is no data at the input location (empty graph)
            configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                    Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
        }
        final List<InputSplit> splits = inputFormat.getSplits(
                new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
        for (final InputSplit split : splits) {
            this.readers.add(inputFormat.createRecordReader(split,
                    new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
        justification = "Delegate set by setConf")
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return delegate.getSplits(jobContext);
}
@SuppressWarnings("unchecked") @Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { instantiateWhenNeeded(); return instance.getSplits(context); }
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    List<InputSplit> originalSplits = wrappedInputFormat.getSplits(context);
    TezMapReduceSplitsGrouper grouper = new TezMapReduceSplitsGrouper();
    String wrappedInputFormatName = wrappedInputFormat.getClass().getName();
    // Coalesce the underlying format's splits into roughly desiredNumSplits groups,
    // using the estimator and location provider to keep groups size- and locality-aware.
    return grouper.getGroupedSplits(conf, originalSplits, desiredNumSplits,
            wrappedInputFormatName, estimator, locationProvider);
}
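// Grouping can be tuned through Tez configuration; the grouping-size keys below exist
// in Tez, but the values here are illustrative only, not recommendations:
Configuration conf = new Configuration();
conf.setLong("tez.grouping.min-size", 16L * 1024 * 1024);   // lower bound per grouped split
conf.setLong("tez.grouping.max-size", 1024L * 1024 * 1024); // upper bound per grouped split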
protected List<BoundedDataSource<Pair<K, V>>> doSplit(Job job) {
    return ExceptionUtils.unchecked(() -> newInputFormatClass()
            .getSplits(job)
            .stream()
            .map(split -> new HadoopSplit<>(this, split))
            .collect(Collectors.toList()));
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    return getInputFormat(HadoopCompat.getConfiguration(context)).getSplits(context);
}
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    return inf.getSplits(
            new JobContextImpl(getConf(context.getConfiguration()), context.getJobID()));
}
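// getConf(Configuration) is not shown in the excerpt. A plausible sketch: copy the
// job-wide Configuration and apply per-format overrides before delegating. The
// inputDir field and the choice of override key are assumptions, not the source:
private Configuration getConf(Configuration base) {
    Configuration copy = new Configuration(base); // leave the caller's conf untouched
    copy.set("mapreduce.input.fileinputformat.inputdir", inputDir);
    return copy;
}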