protected int getMapInputSplitCount()
    throws ClassNotFoundException, JobException, IOException, InterruptedException {
  if (job == null) {
    throw new JobException("Job is null");
  }
  InputFormat<?, ?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
  return input.getSplits(job).size();
}
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
  final Configuration configuration = taskAttemptContext.getConfiguration();
  final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(
      configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class),
      configuration);
  if (!(inputFormat instanceof GraphFilterAware)
      && configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
    this.graphFilter = VertexProgramHelper.deserialize(
        ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
  this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
  this.recordReader.initialize(inputSplit, taskAttemptContext);
}
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  // ... (lines elided in this snippet; the arguments below are the tail of a call whose
  // opening line is missing, presumably a SequenceFile.Writer being created over the test file)
      job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      DataInputBuffer buf = new DataInputBuffer();
      FileInputFormat.setInputPaths(job, file);
      for (InputSplit split : bformat.getSplits(job)) {
        RecordReader<BytesWritable, BytesWritable> reader =
            bformat.createRecordReader(split, context);
        MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext =
            new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(
                job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
          while (reader.nextKeyValue()) {
            bkey = reader.getCurrentKey();
            bval = reader.getCurrentValue();
            tkey.set(Integer.toString(r.nextInt(), 36));
            // ... (snippet truncated)
JobConf job = new JobConf();
fs = FileSystem.getLocal(job);
Path rootDir = new Path(TEST_ROOT_DIR);
createInputFile(rootDir);
// The snippet drops the left-hand side of this call; the result is evidently the
// "input" InputFormat used on the next line, so the assignment is restored here.
InputFormat<?, ?> input =
    ReflectionUtils.newInstance(jContext.getInputFormatClass(), job);
List<InputSplit> splits = input.getSplits(jContext);
JobSplitWriter.createSplitFiles(new Path(TEST_ROOT_DIR), job,
    new Path(TEST_ROOT_DIR).getFileSystem(job), splits);
TaskSplitMetaInfo[] splitMetaInfo = // ... (snippet truncated)
/**
 * Create the needed objects for reading the splits of the filepath given as argument.
 * This method should run before the scheduleSplits method.
 *
 * @param filepath
 */
@SuppressWarnings({ "deprecation", "unchecked" })
public void setJob(String filepath, String tag) {
  try {
    conf.set("start_tag", "<" + tag + ">");
    conf.set("end_tag", "</" + tag + ">");
    job = new Job(conf, "Read from HDFS");
    Path input = new Path(filepath);
    FileInputFormat.addInputPath(job, input);
    job.setInputFormatClass(XmlCollectionWithTagInputFormat.class);
    inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    splits = inputFormat.getSplits(job);
  } catch (IOException | ClassNotFoundException | InterruptedException e) {
    if (LOGGER.isLoggable(Level.SEVERE)) {
      LOGGER.severe(e.getMessage());
    }
  }
}
Path[] files = { new Path("file1"), new Path("file2") };
long[] lengths = { 1, 1 };
RecordReader rr = inputFormat.createRecordReader(split, context);
assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);
rr.initialize(split, context);
assertTrue(rr.nextKeyValue());
assertEquals("file1", rr.getCurrentValue().toString());
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path[] files = { new Path("file1") };
  long[] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
      rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
      rr.getCurrentKey().toString());
}
if (!UriUtil.isHDFSFile(location))
  continue;
Path path = new Path(location);
FileSystem fs = path.getFileSystem(conf);
if (fs.exists(path)) {
  LoadFunc loader = (LoadFunc) PigContext
      .instantiateFuncFromSpec(ld.getLFile().getFuncSpec());
  Job job = new Job(conf);
  loader.setUDFContextSignature(ld.getSignature());
  loader.setLocation(location, job);
  InputFormat inf = loader.getInputFormat();
  List<InputSplit> splits = inf.getSplits(HadoopShims.cloneJobContext(job));
  List<List<InputSplit>> results = MapRedUtil
      .getCombinePigSplits(splits, // ... (snippet truncated)
@SuppressWarnings("unchecked") private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = job.getConfiguration(); InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = input.getSplits(job); T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]); // sort the splits into order based on size, so that the biggest // go first Arrays.sort(array, new SplitComparator()); JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array); return array.length; }
public void testHDFSReadWriteOperators() throws Exception {
  FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
  FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
  conf.setInputFormatClass(TextInputFormat.class);
  InputFormat inputFormat =
      ReflectionUtils.newInstance(conf.getInputFormatClass(), getConfiguration());
  List<InputSplit> splits = inputFormat.getSplits(conf);
  // ... (snippet truncated)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
    splitIndex.getStartOffset());
// ... (snippet elided; the next two lines are the tails of statements whose
// opening lines are missing from this excerpt)
    (inputFormat.createRecordReader(split, taskContext), reporter);
    mapContext);
input.initialize(split, mapperContext);
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
output.close(mapperContext);
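The excerpt above is heavily truncated. As a rough, self-contained sketch of the new-API pattern it illustrates (instantiate the configured InputFormat reflectively, enumerate its splits, then create, initialize, and drain a RecordReader per split), something along these lines should work; SplitRecordCounter and countRecords are illustrative names, not part of any snippet on this page:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

public class SplitRecordCounter {

  /**
   * Instantiates the job's InputFormat via reflection, asks it for splits,
   * and reads every record of every split through the mapreduce (new) API.
   * The Job is assumed to be fully configured (input paths, input format class).
   */
  public static long countRecords(Job job)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> format =
        ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    long records = 0;
    for (InputSplit split : format.getSplits(job)) {
      // A throwaway attempt context is enough for reading outside a real task.
      TaskAttemptContext context =
          new TaskAttemptContextImpl(conf, new TaskAttemptID());
      RecordReader<?, ?> reader = format.createRecordReader(split, context);
      reader.initialize(split, context);
      try {
        while (reader.nextKeyValue()) {
          records++;
        }
      } finally {
        reader.close();
      }
    }
    return records;
  }
}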
@SuppressWarnings("unchecked") public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); InputFormat<K, V> indirIF = (InputFormat)ReflectionUtils.newInstance( conf.getClass(INDIRECT_INPUT_FORMAT, SequenceFileInputFormat.class), conf); IndirectSplit is = ((IndirectSplit)split); return indirIF.createRecordReader(new FileSplit(is.getPath(), 0, is.getLength(), (String[])null), context); } }
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
  long counter = 0;
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  // ... (snippet elided; the arguments below are the tail of the statement that
  // evidently builds the samplingContext used on the next two lines)
      job.getConfiguration(), new TaskAttemptID());
  RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
  reader.initialize(splits.get(i), samplingContext);
  while (reader.nextKeyValue()) {
    if (r.nextDouble() <= freq) {
      if (samples.size() < numSamples) {
        LOG.info(String.format("Fill: Collected %d samples from %d splits", counter, i));
        counter++;
        samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
      } else {
        samples.set(ind, ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
        if (counter % 1000 == 0)
          // ... (snippet truncated)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.File;

Configuration conf = new Configuration(false);
conf.set("fs.default.name", "file:///");

File testFile = new File("path/to/file");
Path path = new Path(testFile.getAbsoluteFile().toURI());
FileSplit split = new FileSplit(path, 0, testFile.length(), null);

InputFormat inputFormat = ReflectionUtils.newInstance(MyInputFormat.class, conf);
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
RecordReader reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
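Once the reader is initialized, the split's records can be pulled with the standard cursor idiom; this continuation is a sketch (not part of the original example) and simply prints whatever key/value types MyInputFormat happens to produce:

// Drain the split through the new-API cursor methods.
while (reader.nextKeyValue()) {
  System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
}
reader.close();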
InputFormat<?, ?> format =
    ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());
List<InputSplit> splits = format.getSplits(ctx);
// ... (snippet elided)
FileSplit s = (FileSplit) nativeSplit;
res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = Lists.newArrayList();
  Configuration base = job.getConfiguration();
  Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);

  // First, build a map of InputFormats to Paths
  for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
    FormatBundle inputBundle = entry.getKey();
    Configuration conf = new Configuration(base);
    inputBundle.configure(conf);
    Job jobCopy = new Job(conf);
    InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(
        inputBundle.getFormatClass(), jobCopy.getConfiguration());
    for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
      Integer nodeIndex = nodeEntry.getKey();
      List<Path> paths = nodeEntry.getValue();
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new CrunchInputSplit(pathSplit, inputBundle.getFormatClass(), nodeIndex,
            jobCopy.getConfiguration()));
      }
    }
  }
  return splits;
}
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, List<String>> formatMap = PangoolMultipleInputs.getInputFormatMap(job); Map<Path, List<String>> mapperMap = PangoolMultipleInputs.getInputProcessorFileMap(job); for (Map.Entry<Path, List<String>> entry : formatMap.entrySet()) { for (int inputId = 0; inputId < entry.getValue().size(); inputId++) { FileInputFormat.setInputPaths(jobCopy, entry.getKey()); InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, entry.getValue().get( inputId), true); PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), entry.getValue().get(inputId), inputId); List<InputSplit> pathSplits = inputFormat.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, entry.getValue().get(inputId), mapperMap.get(entry.getKey()) .get(inputId), inputId)); } } } return splits; }
@Override
public RecordReader<NullWritable, Variant> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  if (inputFormat == null) {
    init(context.getConfiguration());
  }
  RecordReader<NullWritable, VariantDBWritable> recordReader =
      inputFormat.createRecordReader(split, context);
  return new RecordReaderTransform<>(recordReader, VariantDBWritable::getVariant);
}
private static List<Text> readSplit(InputFormat<LongWritable, Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable, Text, LongWritable, Text> mcontext =
      new MapContextImpl<LongWritable, Text, LongWritable, Text>(conf,
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
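A typical call site for a helper like readSplit, sketched under the assumption that the job's input paths are already configured and that a plain TextInputFormat is in play (neither appears in the snippet itself):

// Collect every line of every split of the job's input.
InputFormat<LongWritable, Text> format = new TextInputFormat();
List<Text> lines = new ArrayList<Text>();
for (InputSplit split : format.getSplits(job)) {
  lines.addAll(readSplit(format, split, job));
}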
public CrunchRecordReader(InputSplit inputSplit, final TaskAttemptContext context)
    throws IOException, InterruptedException {
  CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(crunchSplit.getInputFormatClass(), crunchSplit.getConf());
  this.delegate = inputFormat.createRecordReader(
      crunchSplit.getInputSplit(),
      TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
}