@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  return sourceInputFormat.getSplits(job, numSplits);
}
@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
  org.apache.hadoop.mapred.InputSplit[] splitArray = mapredInputFormat.getSplits(jobConf, minNumSplits);
  HadoopInputSplit[] hiSplit = new HadoopInputSplit[splitArray.length];
  for (int i = 0; i < splitArray.length; i++) {
    hiSplit[i] = new HadoopInputSplit(i, splitArray[i], jobConf);
  }
  return hiSplit;
}
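For context, a minimal standalone sketch of the mapred contract these wrappers rely on: configure a JobConf, point it at an input directory, and ask a TextInputFormat for splits. The input path is a placeholder, not taken from any of the snippets here.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsSketch {
  public static void main(String[] args) throws Exception {
    JobConf jobConf = new JobConf();
    // Placeholder input directory; any readable path works.
    FileInputFormat.setInputPaths(jobConf, new Path("/tmp/example-input"));

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf); // mapred input formats are configured explicitly

    // Request at least two splits; the format may return more or fewer.
    InputSplit[] splits = inputFormat.getSplits(jobConf, 2);
    for (InputSplit split : splits) {
      System.out.println(split + " length=" + split.getLength());
    }
  }
}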
private void generateWrappedSplits(InputFormat inputFormat, List<FetchInputFormatSplit> inputSplits,
    JobConf job) throws IOException {
  InputSplit[] splits = inputFormat.getSplits(job, 1);
  for (int i = 0; i < splits.length; i++) {
    inputSplits.add(new FetchInputFormatSplit(splits[i], inputFormat));
  }
}
private int addBHISplit(FileStatus status, InputFormat inputFormat, Class inputFormatClass,
    int numOrigSplits, JobConf newjob, ArrayList<InputSplit> result) throws IOException {
  LOG.info("block size: " + status.getBlockSize());
  LOG.info("file length: " + status.getLen());
  FileInputFormat.setInputPaths(newjob, status.getPath());
  InputSplit[] iss = inputFormat.getSplits(newjob, 0);
  if (iss != null && iss.length > 0) {
    numOrigSplits += iss.length;
    result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
  }
  return numOrigSplits;
}
public void buildInputStreamFileStatus() throws IOException {
  String xlearningInputs = envs.get(XLearningConstants.Environment.XLEARNING_INPUTS.toString());
  if (StringUtils.isBlank(xlearningInputs)) {
    LOG.info("Application has no inputs");
    return;
  }
  String[] inputPathTuple = StringUtils.split(xlearningInputs, "#");
  if (inputPathTuple.length < 2) {
    throw new RuntimeException("Error input path format " + xlearningInputs);
  }
  String inputPathRemote = inputPathTuple[0];
  if (!StringUtils.isBlank(inputPathRemote)) {
    JobConf jobConf = new JobConf(conf);
    jobConf.set(XLearningConstants.STREAM_INPUT_DIR, inputPathRemote);
    InputFormat inputFormat = ReflectionUtils.newInstance(
        conf.getClass(XLearningConfiguration.XLEARNING_INPUTF0RMAT_CLASS,
            XLearningConfiguration.DEFAULT_XLEARNING_INPUTF0RMAT_CLASS, InputFormat.class),
        jobConf);
    inputFileSplits = inputFormat.getSplits(jobConf, 1);
  } else {
    throw new RuntimeException("Error input path format " + xlearningInputs);
  }
}
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();

  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }

  return paths;
}
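A hypothetical call site for the helper above. The table location and the choice of TextInputFormat are illustrative; it assumes getHadoopConfiguration() can resolve the path.

// Hypothetical usage: list the files a TextInputFormat would read under a location.
TextInputFormat format = new TextInputFormat();
format.configure(new JobConf(getHadoopConfiguration()));
Set<Path> files = getPaths(format, new Path("/warehouse/example_db/example_table"));
for (Path file : files) {
  System.out.println(file);
}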
);
try {
  return Arrays.stream(fio.getSplits(conf, 1)).flatMap(
      (final org.apache.hadoop.mapred.InputSplit split) -> {
        try {
LOG.info("file length: " + status.getLen()); FileInputFormat.setInputPaths(newjob, status.getPath()); InputSplit[] iss = inputFormat.getSplits(newjob, 0); if (iss != null && iss.length > 0) { numOrigSplits += iss.length;
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
    InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
    TableDesc table, List<InputSplit> result) throws IOException {
  Utilities.copyTablePropertiesToConf(table, conf);

  if (tableScan != null) {
    pushFilters(conf, tableScan);
  }

  FileInputFormat.setInputPaths(conf, dirs.toArray(new Path[dirs.size()]));
  conf.setInputFormat(inputFormat.getClass());

  int headerCount = 0;
  int footerCount = 0;
  if (table != null) {
    headerCount = Utilities.getHeaderCount(table);
    footerCount = Utilities.getFooterCount(table, conf);
    if (headerCount != 0 || footerCount != 0) {
      // Input file has a header or footer, so it cannot be split.
      HiveConf.setLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, Long.MAX_VALUE);
    }
  }

  InputSplit[] iss = inputFormat.getSplits(conf, splits);
  for (InputSplit is : iss) {
    result.add(new HiveInputSplit(is, inputFormatClass.getName()));
  }
}
/**
 * @param jobConf Job configuration.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<HadoopInputSplit> splitJob(JobConf jobConf) throws IgniteCheckedException {
  try {
    InputFormat<?, ?> format = jobConf.getInputFormat();

    assert format != null;

    InputSplit[] splits = format.getSplits(jobConf, 0);

    Collection<HadoopInputSplit> res = new ArrayList<>(splits.length);

    for (int i = 0; i < splits.length; i++) {
      InputSplit nativeSplit = splits[i];

      if (nativeSplit instanceof FileSplit) {
        FileSplit s = (FileSplit) nativeSplit;

        res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
      }
      else {
        res.add(HadoopUtils.wrapSplit(i, nativeSplit, nativeSplit.getLocations()));
      }
    }

    return res;
  }
  catch (IOException e) {
    throw new IgniteCheckedException(e);
  }
}
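A sketch of how splitJob() might be invoked; the input path is a placeholder and TextInputFormat stands in for whatever source format the job actually uses.

// Hypothetical caller: configure a JobConf, then map native splits to Ignite wrappers.
JobConf jobConf = new JobConf();
jobConf.setInputFormat(TextInputFormat.class);
FileInputFormat.setInputPaths(jobConf, new Path("/data/example-input"));

Collection<HadoopInputSplit> mapped = splitJob(jobConf);
System.out.println("Mapped " + mapped.size() + " splits");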
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
  super.init(job);

  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();

  // for each dir, get the InputFormat, and do getSplits.
  PartitionDesc part;
  for (Path dir : dirs) {
    part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true);
    // create a new InputFormat instance if this is the first time to see this class
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

    FileInputFormat.setInputPaths(newjob, dir);
    newjob.setInputFormat(inputFormat.getClass());
    InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
    for (InputSplit is : iss) {
      result.add(new HiveInputSplit(is, inputFormatClass.getName()));
    }
  }
  return result.toArray(new HiveInputSplit[result.size()]);
}
InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
protected FetchInputFormatSplit[] getNextSplits() throws Exception {
  while (getNextPath()) {
    // not using FileInputFormat.setInputPaths() here because it forces a connection to the
    // default file system - which may or may not be online during pure metadata operations
    job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));

    // Fetch operator is not vectorized and as such turn vectorization flag off so that
    // non-vectorized record reader is created below.
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);

    Class<? extends InputFormat> formatter = currDesc.getInputFileFormatClass();
    Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
    InputFormat inputFormat = getInputFormatFromCache(formatter, job);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
    }
    if (work.getSplitSample() != null) {
      inputSplits = splitSampling(work.getSplitSample(), inputSplits);
    }
    if (inputSplits.length > 0) {
      return inputSplits;
    }
  }
  return null;
}
inputSplits = inputFormat.getSplits(localJc, 1);
actualSplitNum = inputSplits.length;
@Test
public void testEmptyFile() throws Exception {
  Properties properties = new Properties();
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.close(true);
  AbstractSerDe serde = new OrcSerde();
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  InputFormat<?,?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertTrue(0 == splits.length);
  assertEquals(null, serde.getSerDeStats());
}
job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
job.set(ValidTxnList.VALID_TXNS_KEY, validTxnList.writeToString());
InputSplit[] splits = inputFormat.getSplits(job, 1);
assertEquals(numSplitsExpected, splits.length);
@Test(expected = RuntimeException.class)
public void testSplitGenFailure() throws IOException {
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.write(new OrcSerde().serialize(null, null));
  writer.close(true);
  InputFormat<?,?> in = new OrcInputFormat();
  fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333));
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  try {
    in.getSplits(conf, 1);
  } catch (RuntimeException e) {
    assertEquals(true, e.getMessage().contains("Permission denied"));
    throw e;
  }
}
job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY));
InputSplit[] splits = inf.getSplits(job, buckets);
Assert.assertEquals(numExpectedFiles, splits.length);
org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
InputFormat<?,?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(0, splits.length);