@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // only gather base statistics for FileInputFormats
    if (!(mapreduceInputFormat instanceof FileInputFormat)) {
        return null;
    }

    JobContext jobContext = new JobContextImpl(configuration, null);

    final FileBaseStatistics cachedFileStats =
            (cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;

    try {
        final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(jobContext);
        return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
        }
    }

    // no statistics available
    return null;
}
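For context, getInputPaths only returns whatever was previously registered on the job configuration. A minimal sketch of that registration step, assuming the org.apache.hadoop.mapreduce flavor of FileInputFormat; the class name and paths here are hypothetical:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class InputPathSetup {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());

        // Register input directories; getInputPaths(...) reads them back
        // from the job configuration later.
        FileInputFormat.addInputPath(job, new Path("/data/input-a")); // hypothetical path
        FileInputFormat.addInputPath(job, new Path("/data/input-b")); // hypothetical path

        for (Path p : FileInputFormat.getInputPaths(job)) {
            System.out.println(p); // prints both registered paths
        }
    }
}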
public static Schema getNewestSchemaFromSource(Job job, FileSystem fs) throws IOException {
    Path[] sourceDirs = FileInputFormat.getInputPaths(job);
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (Path sourceDir : sourceDirs) {
        files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
    }
    Collections.sort(files, new LastModifiedDescComparator());
    for (FileStatus file : files) {
        Schema schema = getNewestSchemaFromSource(file.getPath(), fs);
        if (schema != null) {
            return schema;
        }
    }
    return null;
}
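The LastModifiedDescComparator used above is not defined in this snippet. A minimal sketch of such a comparator, assuming it simply orders FileStatus entries newest-first by modification time so the most recently modified file's schema is tried first:

import java.util.Comparator;
import org.apache.hadoop.fs.FileStatus;

// Sketch only; the real class may differ. Newest files sort first, so the
// schema lookup above starts with the most recently modified file.
public class LastModifiedDescComparator implements Comparator<FileStatus> {
    @Override
    public int compare(FileStatus a, FileStatus b) {
        return Long.compare(b.getModificationTime(), a.getModificationTime());
    }
}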
throws IOException, InterruptedException {
    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
        throw new IOException("No input found!");
Path[] paths = FileInputFormat.getInputPaths(jobContext);
Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
private Schema getNewestSchemaFromSource(Job job) throws IOException {
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path[] sourceDirs = FileInputFormat.getInputPaths(job);
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (Path sourceDir : sourceDirs) {
        files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
    }
    Collections.sort(files, new ReverseLastModifiedComparitor());
    for (FileStatus f : files) {
        Schema schema = getNewestSchemaFromSource(f.getPath(), fs);
        if (schema != null) {
            return schema;
        }
    }
    return null;
}
final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
Path[] tablePaths = FileInputFormat.getInputPaths(
        ShimLoader.getHadoopShims().newJobContext(new Job(jobConf)));
boolean splitByStats = jobConf.getBoolean(PhoenixStorageHandlerConstants.SPLIT_BY_STATS,
Map<CamusRequest, EtlKey> offsetKeys =
        getPreviousOffsets(FileInputFormat.getInputPaths(context), context);
Set<String> moveLatest = getMoveToLatestTopicsSet(context);
String camusRequestEmailMessage = "";
/**
 * Returns the input paths.
 * @param context the current job
 * @return the input paths, or an empty list if they are not set
 * @throws IOException if the paths could not be resolved
 * @throws IllegalArgumentException if some parameters were {@code null}
 * @since 0.7.0
 */
public static List<Path> getInputPaths(JobContext context) throws IOException {
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    Path[] paths = FileInputFormat.getInputPaths(context);
    if (paths == null || paths.length == 0) {
        return Collections.emptyList();
    }
    return Arrays.asList(paths);
}
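A hedged usage sketch of the helper above; JobInputs stands in for whatever utility class actually hosts the method:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class Example {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());
        // Nothing registered yet, so the helper returns an empty list rather
        // than null, sparing callers the defensive null/length check.
        List<Path> inputs = JobInputs.getInputPaths(job); // JobInputs is hypothetical
        System.out.println("configured inputs: " + inputs.size());
    }
}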
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    return FluentIterable.from(BaseInputFormat.getSplits(context.getConfiguration(), paths))
            .transform(_fromSplit)
            .toList();
}
@Override
public void run(MRJob job) throws IOException {
    Configuration conf = job.getJob().getConfiguration();
    if (conf.getBoolean(RuntimeParameters.CREATE_DIR, false)) {
        Path[] inputPaths = FileInputFormat.getInputPaths(job.getJob());
        for (Path inputPath : inputPaths) {
            FileSystem fs = inputPath.getFileSystem(conf);
            if (!fs.exists(inputPath)) {
                try {
                    fs.mkdirs(inputPath);
                } catch (IOException e) {
                    // ignored: an input directory that cannot be created is tolerated here
                }
            }
        }
    }
}
public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[]{
                    job.getInputFormatClass().getSimpleName(),
                    job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(),
                    job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(),
                    job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(),
                    job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName()
            });

    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}
protected Map<String, Tap> fileInputToTaps(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths == null || paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<>();

    if (paths == null) {
        return taps;
    }

    for (Path path : paths) {
        toSourceTap(jobConf, taps, path);
    }

    return taps;
}
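The fallback above exists because Hadoop ships two unrelated FileInputFormat classes: the legacy org.apache.hadoop.mapred one, which is configured and queried via a JobConf, and the newer org.apache.hadoop.mapreduce.lib.input one, which is queried through a Job/JobContext. A minimal sketch of the two call shapes, with hypothetical paths:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

public class TwoApis {
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();

        // Legacy "mapred" API: configured and queried via the JobConf itself.
        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path("/data/in")); // hypothetical
        Path[] oldApiPaths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf);

        // Newer "mapreduce" API: queried via a Job/JobContext, which is why
        // fileInputToTaps wraps the JobConf in a new Job(...) for its fallback.
        Job job = Job.getInstance(jobConf);
        Path[] newApiPaths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(job);

        System.out.println(oldApiPaths.length + " / " + newApiPaths.length);
    }
}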
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }

        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);
    Preconditions.checkNotNull(context);

    StreamConfig streamConfig = context.getStreamConfig(getStreamId(conf));
    // make sure we get the current generation, so we don't read events that occurred before a truncate
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
                                                               StreamUtils.getGeneration(streamConfig));

    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

    // Get the Hive table path for the InputSplit created. It exists only to satisfy Hive; the InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime,
                                      long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime,
                                        start, length, locations);
        }
    });
}