return ugi.doAs((PrivilegedExceptionAction<List<LogicalInputSplit>>) () -> {
  final List<LogicalInputSplit> splits = new ArrayList<>();
  final JobConf job = new JobConf(hiveConf);
  HiveUtilities.addConfToJob(job, properties);
  HiveUtilities.verifyAndAddTransactionalProperties(job, sd);
  job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, hiveReadEntry.getTable()));
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);
  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    final InputFormat<?, ?> format = job.getInputFormat();
    InputSplit[] inputSplits = format.getSplits(job, 1);
    throws Exception {
  JobConf configuration = new JobConf(new Configuration(false));
  configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
  configuration.setBoolean(READ_ALL_COLUMNS, false);
  RecordReader<K, V> recordReader = inputFormat.getRecordReader(
      new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0, tempFile.getFile().length(), (String[]) null),
      configuration,
      NULL);
  K key = recordReader.createKey();
  V value = recordReader.createValue();
  while (recordReader.next(key, value)) {
    Object expectedValue = iterator.next();
localJc.set(FileInputFormat.INPUT_DIR,
    org.apache.hadoop.util.StringUtils.escapeString(parentDir.getAbsolutePath()));
inputSplits = inputFormat.getSplits(localJc, 1);
actualSplitNum = inputSplits.length;
rr = inputFormat.getRecordReader(inputSplits[currentSplitPointer], localJc, reporter);
currentSplitPointer++;
System.out.println("Files found: "); for (AcidUtils.ParsedDelta pd : current) { System.out.println(pd.getPath().toString()); JobConf job = new JobConf(); job.set("mapred.input.dir", partitionPath.toString()); job.set(BUCKET_COUNT, Integer.toString(buckets)); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg"); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string"); job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()); job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY)); InputSplit[] splits = inf.getSplits(job, buckets); Assert.assertEquals(numExpectedFiles, splits.length); org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL); NullWritable key = rr.createKey(); OrcStruct value = rr.createValue(); for (String record : records) { Assert.assertEquals(true, rr.next(key, value)); Assert.assertEquals(record, value.toString());
Path dir = new Path(tPart.getSd().getLocation());
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
FileSystem fs = dir.getFileSystem(conf);
FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
// ... (the statement constructing dummySplit was truncated; only its trailing argument survives)
//     new String[] { partn.getLocation() });
org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
    inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
StatsProvidingRecordReader statsRR;
if (recordReader instanceof StatsProvidingRecordReader) {
  statsAvailable = true;
  recordReader.close();
@SuppressWarnings("unchecked") // InputFormat instantiation static long readBench(JobConf conf) throws IOException { InputFormat inf = conf.getInputFormat(); final String fn = conf.get("test.filebench.name", ""); Path pin = new Path(FileInputFormat.getInputPaths(conf)[0], fn); FileStatus in = pin.getFileSystem(conf).getFileStatus(pin); RecordReader rr = inf.getRecordReader(new FileSplit(pin, 0, in.getLen(), (String[])null), conf, Reporter.NULL); try { Object key = rr.createKey(); Object val = rr.createValue(); Date start = new Date(); while (rr.next(key, val)); Date end = new Date(); return end.getTime() - start.getTime(); } finally { rr.close(); } }
OutputFormat<?, ?> outFormat = new OrcOutputFormat();
RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
writer.write(NullWritable.get(),
    /* ... value argument and the lines between writing and reading were truncated ... */

inspector = (StructObjectInspector) serde.getObjectInspector();
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);
ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
conf.set("columns", "z,r");
conf.set("columns.types", "int:struct<x:int,y:int>");
org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
Object key = reader.createKey();
Object value = reader.createValue();
int rowNum = 0;
List<? extends StructField> fields = inspector.getAllStructFieldRefs();
IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
while (reader.next(key, value)) {
  assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
  Object sub = inspector.getStructFieldData(value, fields.get(1));
TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());

targetJob.setInputFormat(TextInputFormat.class);
targetInputFormat.configure(targetJob);
FileInputFormat.setInputPaths(targetJob, targetPath);
InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

FileInputFormat.setInputPaths(jobConf, path);
InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
String sargStr;
createTestSarg(inspector, udf, childExpr);
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(5, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(0, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);

en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(2, splits.length);

splits = in.getSplits(conf, 1);
assertEquals(3, splits.length);

splits = in.getSplits(conf, 1);
assertEquals(4, splits.length);
JobConf jobConf = taskCtx0.jobConf();
InputFormat inFormat = jobConf.getInputFormat();
HadoopFileBlock block = (HadoopFileBlock) split;
nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), EMPTY_HOSTS);
// ... (the preceding statement was truncated; only its trailing arguments remain)
//     fileName(), taskCtx0.attemptId());
RecordReader reader = inFormat.getRecordReader(nativeSplit, jobConf, reporter);
Mapper mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf);
Object key = reader.createKey();
Object val = reader.createValue();
while (reader.next(key, val)) {
  if (isCancelled())
    throw new HadoopTaskCancelledException("Map task cancelled.");
// ... (assignment target elided)
    (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
MapWork mapWork = Utilities.getMapWork(jobConf);
List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
FileSystem fs = paths.get(0).getFileSystem(jobConf);
FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
if (fileStatuses.length == 0) {
  splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
  tezCounters = new TezCounters();
  groupName = HiveInputCounters.class.getName();
  vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
  counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
  tezCounters.findCounter(groupName, counterName).increment(splits.length);
  final String fileStr = path.toString();
  if (!files.contains(fileStr)) {
    files.add(fileStr);
FileInputFormat.setInputPaths(
    conf,
    new Path(JobHelper.getURIFromSegment(segment.getSegment()))
);
try {
  return Arrays.stream(fio.getSplits(conf, 1)).flatMap(
      (final org.apache.hadoop.mapred.InputSplit split) -> {
        try {
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");
  FileInputFormat.setInputPaths(job, dir);
  int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  InputSplit[] splits = format.getSplits(job, numSplits);
  for (int j = 0; j < splits.length; j++) {
    RecordReader<RecInt, RecBuffer> reader = format.getRecordReader(splits[j], job, Reporter.NULL);
    try {
      int count = 0;
      while (reader.next(key, value)) {
        assertFalse("Key in multiple partitions.", bits.get(key.getData()));
        bits.set(key.getData());
      reader.close();
JobConf conf = new JobConf();
FileInputFormat.addInputPath(conf, new Path(path));
InputSplit[] splits = informat.getSplits(conf, 10000);
assertTrue(splits.length > 3); // want to test that splitting is working, because the test files are really big
for (InputSplit split : splits) {
  RecordReader<Text, BytesWritable> rr = informat.getRecordReader(split, conf, Reporter.NULL);
  Text t = new Text();
  BytesWritable b = new BytesWritable();
  while (rr.next(t, b)) {
    results.put(t.toString(), new String(Utils.getBytes(b)));
  }
  rr.close();
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();
  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }
  return paths;
}
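A hedged usage sketch for getPaths above; the table location and the choice of TextInputFormat are illustrative assumptions, and getHadoopConfiguration() is the same helper the method itself relies on.

// Illustrative only: list the leaf data files under a hypothetical table location.
JobConf jobConf = new JobConf(getHadoopConfiguration());
InputFormat<?, ?> format = ReflectionUtils.newInstance(TextInputFormat.class, jobConf); // also configures the format
Set<Path> dataFiles = getPaths(format, new Path("/warehouse/db/table/part=1"));
for (Path dataFile : dataFiles) {
  System.out.println(dataFile);
}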
    ) throws IOException, InterruptedException, ClassNotFoundException {
  InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  // ... (assignment target elided; the reader returned here is presumably the rawIn used below)
  job.getInputFormat().getRecordReader(inputSplit, job, reporter);
  RecordReader<INKEY, INVALUE> in = isSkipping()
      ? new SkippingRecordReader<INKEY, INVALUE>(rawIn, umbilical, reporter)
      : new TrackedRecordReader<INKEY, INVALUE>(rawIn, reporter);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector collector = null;
  // ... (assignment target elided)
  ReflectionUtils.newInstance(job.getMapRunnerClass(), job);
} finally {
  in.close();       // close input
  collector.close();
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  JobConf confCopy = new JobConf(conf);
  List<InputSplit> splits = new ArrayList<>();
  Schema schema = schemaEntry.getKey();
  System.out.println(schema);
  InputFormat format = (InputFormat) ReflectionUtils.newInstance(AvroInputFormat.class, conf);
  List<Path> paths = schemaEntry.getValue();
  mapperClass = (Class<? extends AvroMapper>) conf.getMapperClass();
  FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
  InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
  for (InputSplit pathSplit : pathSplits) {
    splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(),
JobConf job = new JobConf(); job.set("mapred.input.dir", partitionLocation.toString()); job.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(table.getSd().getNumBuckets())); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg"); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string"); job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()); job.set(ValidTxnList.VALID_TXNS_KEY, validTxnList.writeToString()); InputSplit[] splits = inputFormat.getSplits(job, 1); assertEquals(numSplitsExpected, splits.length); for(InputSplit is : splits) { final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat .getRecordReader(is, job, Reporter.NULL);
throw new IOException("Acid table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY)); if (finalDirs.isEmpty() && dirsWithFileOriginals.isEmpty()) { if (!conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) { LOG.warn("No valid inputs found in " + dirs); } else if (validMmWriteIdList != null) { dirs.get(0).toString()), ZeroRowsInputFormat.class.getName())); conf.setInputFormat(inputFormat.getClass()); int headerCount = 0; int footerCount = 0; FileInputFormat.setInputPaths(conf, finalDirs.toArray(new Path[finalDirs.size()])); InputSplit[] iss = inputFormat.getSplits(conf, splits); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); InputSplit[] iss = inputFormat.getSplits(nonRecConf, splits); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); finalDirs.get(0).toString()), ZeroRowsInputFormat.class.getName()));
job.set("mapred.input.dir", org.apache.hadoop.util.StringUtils.escapeString(currPath .toString())); inputSplits = inputFormat.getSplits(job, 1); splitNum = 0; serde = tmp.getDeserializerClass().newInstance(); currRecReader.close(); currRecReader = null; currRecReader = inputFormat.getRecordReader(inputSplits[splitNum++], job, Reporter.NULL); key = currRecReader.createKey(); value = currRecReader.createValue(); return currRecReader;