@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // only gather base statistics for FileInputFormats
    if (!(mapreduceInputFormat instanceof FileInputFormat)) {
        return null;
    }

    JobContext jobContext = new JobContextImpl(configuration, null);

    final FileBaseStatistics cachedFileStats =
            (cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;

    try {
        final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(jobContext);
        return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
        }
    }

    // no statistics available
    return null;
}
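For context, getInputPaths only returns whatever was previously registered on the job configuration. A minimal sketch of that registration step, assuming the org.apache.hadoop.mapreduce flavor of FileInputFormat; the class name and paths here are hypothetical:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class InputPathSetup {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());

        // Register input directories; getInputPaths(...) reads them back
        // from the job configuration later.
        FileInputFormat.addInputPath(job, new Path("/data/input-a")); // hypothetical path
        FileInputFormat.addInputPath(job, new Path("/data/input-b")); // hypothetical path

        for (Path p : FileInputFormat.getInputPaths(job)) {
            System.out.println(p); // prints both registered paths
        }
    }
}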
public static Schema getNewestSchemaFromSource(Job job, FileSystem fs) throws IOException {
    Path[] sourceDirs = FileInputFormat.getInputPaths(job);
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (Path sourceDir : sourceDirs) {
        files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
    }
    Collections.sort(files, new LastModifiedDescComparator());
    for (FileStatus file : files) {
        Schema schema = getNewestSchemaFromSource(file.getPath(), fs);
        if (schema != null) {
            return schema;
        }
    }
    return null;
}
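The LastModifiedDescComparator used above is not defined in this snippet. A minimal sketch of such a comparator, assuming it simply orders FileStatus entries newest-first by modification time so the most recently modified file's schema is tried first:

import java.util.Comparator;
import org.apache.hadoop.fs.FileStatus;

// Sketch only; the real class may differ. Newest files sort first, so the
// schema lookup above starts with the most recently modified file.
public class LastModifiedDescComparator implements Comparator<FileStatus> {
    @Override
    public int compare(FileStatus a, FileStatus b) {
        return Long.compare(b.getModificationTime(), a.getModificationTime());
    }
}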
throws IOException, InterruptedException {
    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
        throw new IOException("No input found!");
Path[] paths = FileInputFormat.getInputPaths(jobContext);
Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
private Schema getNewestSchemaFromSource(Job job) throws IOException {
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path[] sourceDirs = FileInputFormat.getInputPaths(job);
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (Path sourceDir : sourceDirs) {
        files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
    }
    Collections.sort(files, new ReverseLastModifiedComparitor());
    for (FileStatus f : files) {
        Schema schema = getNewestSchemaFromSource(f.getPath(), fs);
        if (schema != null) {
            return schema;
        }
    }
    return null;
}
final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
Path[] tablePaths = FileInputFormat.getInputPaths(
        ShimLoader.getHadoopShims().newJobContext(new Job(jobConf)));
boolean splitByStats = jobConf.getBoolean(PhoenixStorageHandlerConstants.SPLIT_BY_STATS,
Map<CamusRequest, EtlKey> offsetKeys =
        getPreviousOffsets(FileInputFormat.getInputPaths(context), context);
Set<String> moveLatest = getMoveToLatestTopicsSet(context);
String camusRequestEmailMessage = "";
/**
 * Returns the input paths.
 * @param context the current job
 * @return the input paths, or an empty list if they are not set
 * @throws IOException if the paths could not be resolved
 * @throws IllegalArgumentException if some parameters were {@code null}
 * @since 0.7.0
 */
public static List<Path> getInputPaths(JobContext context) throws IOException {
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    Path[] paths = FileInputFormat.getInputPaths(context);
    if (paths == null || paths.length == 0) {
        return Collections.emptyList();
    }
    return Arrays.asList(paths);
}
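A hedged usage sketch of the helper above; JobInputs stands in for whatever utility class actually hosts the method:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class Example {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());
        // Nothing registered yet, so the helper returns an empty list rather
        // than null, sparing callers the defensive null/length check.
        List<Path> inputs = JobInputs.getInputPaths(job); // JobInputs is hypothetical
        System.out.println("configured inputs: " + inputs.size());
    }
}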
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    return FluentIterable.from(BaseInputFormat.getSplits(context.getConfiguration(), paths))
            .transform(_fromSplit)
            .toList();
}
@Override
public void run(MRJob job) throws IOException {
    Configuration conf = job.getJob().getConfiguration();
    if (conf.getBoolean(RuntimeParameters.CREATE_DIR, false)) {
        Path[] inputPaths = FileInputFormat.getInputPaths(job.getJob());
        for (Path inputPath : inputPaths) {
            FileSystem fs = inputPath.getFileSystem(conf);
            if (!fs.exists(inputPath)) {
                try {
                    fs.mkdirs(inputPath);
                } catch (IOException e) {
                    // ignored: an input directory that cannot be created is tolerated here
                }
            }
        }
    }
}
public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[]{
                    job.getInputFormatClass().getSimpleName(),
                    job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(),
                    job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(),
                    job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(),
                    job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName()
            });

    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}
protected Map<String, Tap> fileInputToTaps(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths == null || paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<>();

    if (paths == null) {
        return taps;
    }

    for (Path path : paths) {
        toSourceTap(jobConf, taps, path);
    }

    return taps;
}
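The fallback above exists because Hadoop ships two unrelated FileInputFormat classes: the legacy org.apache.hadoop.mapred one, which is configured and queried via a JobConf, and the newer org.apache.hadoop.mapreduce.lib.input one, which is queried through a Job/JobContext. A minimal sketch of the two call shapes, with hypothetical paths:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

public class TwoApis {
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();

        // Legacy "mapred" API: configured and queried via the JobConf itself.
        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path("/data/in")); // hypothetical
        Path[] oldApiPaths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf);

        // Newer "mapreduce" API: queried via a Job/JobContext, which is why
        // fileInputToTaps wraps the JobConf in a new Job(...) for its fallback.
        Job job = Job.getInstance(jobConf);
        Path[] newApiPaths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(job);

        System.out.println(oldApiPaths.length + " / " + newApiPaths.length);
    }
}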
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }

        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);
    Preconditions.checkNotNull(context);

    StreamConfig streamConfig = context.getStreamConfig(getStreamId(conf));
    // make sure we get the current generation, so we don't read events that occurred before a truncate
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
                                                               StreamUtils.getGeneration(streamConfig));

    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

    // Get the Hive table path for the InputSplit created. It exists only to satisfy Hive; the InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime,
                                      long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime,
                                        start, length, locations);
        }
    });
}