// Delegate split computation to TextInputFormat, then wrap each resulting
// FileSplit in a SymlinkTextInputSplit that remembers the symlink path.
TextInputFormat inputFormat = new TextInputFormat();
JobConf newjob = new JobConf(job);
newjob.setInputFormat(TextInputFormat.class);
inputFormat.configure(newjob);
InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
for (InputSplit is : iss) {
  result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
}
@Override
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();

  // The target data is in TextInputFormat.
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(job);
  RecordReader innerReader = null;
  try {
    innerReader = inputFormat.getRecordReader(targetSplit, job, reporter);
  } catch (Exception e) {
    innerReader = HiveIOExceptionHandlerUtil
        .handleRecordReaderCreationException(e, job);
  }
  HiveRecordReader rr = new HiveRecordReader(innerReader, job);
  rr.initIOContext((FileSplit) targetSplit, job, TextInputFormat.class, innerReader);
  return rr;
}
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }
  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop input format.
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(
          new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
  TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

  // Create a Flink job with it.
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
  DataSet<Tuple2<Text, LongWritable>> words =
      text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
          .groupBy(0)
          .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
              new Counter(), new Counter()));

  // Set up the Hadoop output format.
  HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, LongWritable>(
          new TextOutputFormat<Text, LongWritable>(), new JobConf());
  hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
  TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

  // Output and execute.
  words.output(hadoopOutputFormat).setParallelism(1);
  env.execute("Hadoop Compat WordCount");
}
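The job above references a Tokenizer mapper and a Counter reducer/combiner that are not shown in the excerpt. A minimal sketch of what they might look like with the classic org.apache.hadoop.mapred interfaces (the class bodies here are assumptions, not the original implementations):

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical Tokenizer: emits (word, 1) for every token in a line.
public static final class Tokenizer
    implements Mapper<LongWritable, Text, Text, LongWritable> {
  public void map(LongWritable key, Text value,
      OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
    for (String token : value.toString().toLowerCase().split("\\W+")) {
      if (!token.isEmpty()) {
        out.collect(new Text(token), new LongWritable(1L));
      }
    }
  }
  public void configure(JobConf job) { }
  public void close() { }
}

// Hypothetical Counter: sums counts per word; it satisfies both the reducer
// and combiner roles, which is why the job passes it twice.
public static final class Counter
    implements Reducer<Text, LongWritable, Text, LongWritable> {
  public void reduce(Text key, Iterator<LongWritable> values,
      OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
    long sum = 0;
    while (values.hasNext()) {
      sum += values.next().get();
    }
    out.collect(key, new LongWritable(sum));
  }
  public void configure(JobConf job) { }
  public void close() { }
}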
@BeforeClass
public void setUp() {
  inputFormat = new TextInputFormat();
  inputFormat.configure(new JobConf());
}
public void testNumInputs() throws Exception {
  JobConf job = new JobConf(conf);
  dfs = newDFSCluster(job);
  FileSystem fs = dfs.getFileSystem();
  System.out.println("FileSystem " + fs.getUri());

  Path inputDir = new Path("/foo/");
  final int numFiles = 10;
  String fileNameBase = "part-0000";
  for (int i = 0; i < numFiles; ++i) {
    createInputs(fs, inputDir, fileNameBase + String.valueOf(i));
  }
  // Names starting with "_" are hidden to FileInputFormat's default path
  // filter, so these two must not be counted.
  createInputs(fs, inputDir, "_meta");
  createInputs(fs, inputDir, "_temp");

  // Split the directory using a file input format.
  TextInputFormat.addInputPath(job, inputDir);
  TextInputFormat inFormat = new TextInputFormat();
  inFormat.configure(job);
  InputSplit[] splits = inFormat.getSplits(job, 1);
  assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
      numFiles, job.getLong(FileInputFormat.NUM_INPUT_FILES, 0));
}
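The reason "_meta" and "_temp" are excluded: FileInputFormat's default filter hides any path whose name starts with "_" or ".". An illustrative stand-in for that filter (this is a sketch of the behavior, not Hadoop's internal class):

// Illustrative equivalent of FileInputFormat's built-in hidden-file filter.
PathFilter hiddenFileFilter = new PathFilter() {
  public boolean accept(Path p) {
    String name = p.getName();
    return !name.startsWith("_") && !name.startsWith(".");
  }
};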
private void task2(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting anchor text (phase 2)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  conf.setJobName(String.format(
      "ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

  // Gathers everything together for convenience; feasible for Wikipedia.
  conf.setNumReduceTasks(1);

  TextInputFormat.addInputPath(conf, new Path(inputPath));
  TextOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(MapFileOutputFormat.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(HMapStIW.class);
  conf.setMapperClass(MyMapper2.class);
  conf.setReducerClass(MyReducer2.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  // Clean up intermediate data.
  FileSystem.get(conf).delete(new Path(inputPath), true);
}
TupleDomain<HiveColumnHandle> effectivePredicate =
    (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;

Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);

boolean s3SelectPushdownEnabled =
    shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());

for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
  // Each symlink target is read as plain text, so splits are computed with a
  // TextInputFormat configured on a per-target JobConf.
  JobConf targetJob = new JobConf(configuration); // assumed declaration; not shown in the excerpt
  TextInputFormat targetInputFormat = new TextInputFormat();
  targetJob.setInputFormat(TextInputFormat.class);
  targetInputFormat.configure(targetJob);
  FileInputFormat.setInputPaths(targetJob, targetPath);
  InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
  // ... (the rest of the loop body is elided in this excerpt)
}
TextInputFormat inputFormat = new TextInputFormat();
JobConf job = new JobConf(fsConf);
job.setInt("io.file.buffer.size",
    context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
job.setInputFormat(inputFormat.getClass());
try {
  reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
  key = reader.createKey();
  value = reader.createValue();
} catch (IOException e) {
  // ... (error handling is elided in this excerpt)
}
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestErrorOnNonExistantDir(conf, localFs);
  JobConf jobConf = new JobConf(conf);

  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  try {
    fif.listStatus(jobConf);
    Assert.fail("Expecting an IOException for a missing Input path");
  } catch (IOException e) {
    Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
    expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
    Assert.assertTrue(e instanceof InvalidInputException);
    Assert.assertEquals(
        "Input path does not exist: " + expectedExceptionPath.toString(),
        e.getMessage());
  }
}
Path output, OperationsParams params) throws IOException {
  JobConf job = new JobConf(params, CatUnion.class);
  job.setJobName("Union");

  FileSystem outFs = output.getFileSystem(job);
  if (outFs.exists(output)) {
    if (params.getBoolean("overwrite", false)) {
      // ... (handling of the existing output directory is elided in this excerpt)
    }
  }

  job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

  job.setInputFormat(ShapeLineInputFormat.class);
  TextInputFormat.addInputPath(job, shapeFile);
  DistributedCache.addCacheFile(categoryFile.toUri(), job);
  job.setOutputFormat(TextOutputFormat.class);
public static void main(String[] args) throws Exception {
  System.setProperty("hazelcast.logging.type", "log4j");

  Path inputPath = new Path(
      HadoopWordCount.class.getClassLoader().getResource("books").getPath());
  Path outputPath = new Path(OUTPUT_PATH);

  // Route input and output through the classic mapred text formats.
  JobConf jobConfig = new JobConf();
  jobConfig.setInputFormat(TextInputFormat.class);
  jobConfig.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(jobConfig, outputPath);
  TextInputFormat.addInputPath(jobConfig, inputPath);
  // ... (pipeline definition and execution are elided in this excerpt)
}
@Override
public void setFile(String file, long offset, long length) {
  JobConf defaultConf = new JobConf();
  this.split = new FileSplit(new Path(file), offset, length, defaultConf);
  this.hasMore = true;
  this.jobConf = defaultConf;
  this.input_format = new TextInputFormat();
  try {
    this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  this.key = reader.createKey();
}
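After setFile returns, the reader is positioned at the start of the split. A minimal sketch of draining it with the classic RecordReader API (the local variables here are assumptions; for TextInputFormat the key is the byte offset and the value is the line):

// Minimal read loop over the reader created in setFile (sketch).
LongWritable key = (LongWritable) reader.createKey();
Text value = (Text) reader.createValue();
while (reader.next(key, value)) {
  System.out.println(key.get() + "\t" + value); // offset, then line text
}
reader.close();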
private InputSplit[] getSplits(JobConf conf, int numSplits, String path) throws Exception {
  FileInputFormat.setInputPaths(conf, new Path(path));
  if (inputFormat == null) {
    inputFormat = inputFormatClass.newInstance();
    String inputFormatClassName = inputFormatClass.getName();
    // Both text formats require an explicit configure() call before use.
    if (inputFormatClassName.equals("org.apache.hadoop.mapred.TextInputFormat")) {
      ((TextInputFormat) inputFormat).configure(conf);
    } else if (inputFormatClassName.equals("org.apache.hadoop.mapred.KeyValueTextInputFormat")) {
      ((KeyValueTextInputFormat) inputFormat).configure(conf);
    }
  }
  return inputFormat.getSplits(conf, numSplits);
}
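Dispatching on class-name strings is brittle. Both TextInputFormat and KeyValueTextInputFormat implement JobConfigurable in the mapred API, so an instanceof check covers them, and any other configurable format, in one branch; ReflectionUtils.newInstance(inputFormatClass, conf) would even perform this configuration automatically. A sketch of the instanceof variant:

// More general dispatch: configure any format that opts into JobConfigurable.
if (inputFormat instanceof JobConfigurable) {
  ((JobConfigurable) inputFormat).configure(conf);
}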
private static void doSingleBzip2BufferSize(JobConf jConf) throws IOException {
  TextInputFormat format = new TextInputFormat();
  format.configure(jConf);
  format.setMinSplitSize(5500); // work around 256-byte/22-splits issue

  InputSplit[] splits = format.getSplits(jConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);

  // Normalize the split order, swapping if the gzip file came first.
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("testCompressThenConcat.txt.gz")) {
    System.out.println("  (swapping)");
    splits[0] = splits[1];
    splits[1] = tmp; // finish the swap; the rest of the test is elided in this excerpt
  }
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);

  // Sniff the first three bytes to distinguish SequenceFiles ("SEQ" magic)
  // from plain text.
  byte[] header = new byte[3];
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    // File is too short to hold the magic number: treat it as text.
    // (Returning here also avoids reading the uninitialized header below.)
    return textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }

  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
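The textInputFormat and seqFileInputFormat delegates are not declared in the excerpt. A plausible sketch of those fields (the names match the snippet; the initialization is an assumption), keeping in mind that TextInputFormat needs configure() before it can handle codec-compressed input:

// Assumed delegate fields for the sniffing format above.
private final TextInputFormat textInputFormat = new TextInputFormat();
private final SequenceFileInputFormat seqFileInputFormat = new SequenceFileInputFormat();

public void configure(JobConf job) {
  textInputFormat.configure(job); // registers compression codecs
}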
@Test
public void testListStatusNestedNonRecursive() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestNestedNonRecursive(conf, localFs);
  JobConf jobConf = new JobConf(conf);

  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses), localFs);
}
/**
 * Hack to configure InputFormats before they get used.
 * @param inputFormat InputFormat to configure
 * @param conf Configuration to use
 */
public static void configureInputFormat(InputFormat inputFormat, Configuration conf) {
  JobConf jobConf = new JobConf(conf);
  setJobConfIfPossible(inputFormat, jobConf);
  // TextInputFormat is not always JobConfigurable, so we need to explicitly
  // call this here to make sure it gets configured with the compression codecs.
  if (inputFormat instanceof TextInputFormat) {
    ((TextInputFormat) inputFormat).configure(jobConf);
  }
}
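A minimal usage sketch, assuming a caller that needs the codec list visible to the format before computing splits (the property value shown is only an example):

// Hypothetical caller: configure the format before asking it for splits.
Configuration conf = new Configuration();
conf.set("io.compression.codecs", "org.apache.hadoop.io.compress.GzipCodec");
TextInputFormat format = new TextInputFormat();
configureInputFormat(format, conf);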
@Override
public InputSplit[] getSplits(JobConf job, int splits) throws IOException {
  // Ensure that the table properties were copied.
  assertEquals("val1", job.get("myprop1"));
  assertEquals("val2", job.get("myprop2"));

  // Ensure that both of the partitions are in the complete list.
  String[] dirs = job.get("hive.complete.dir.list").split("\t");
  assertEquals(2, dirs.length);
  Arrays.sort(dirs);
  assertTrue(dirs[0].endsWith("/state=CA"));
  assertTrue(dirs[1].endsWith("/state=OR"));
  return super.getSplits(job, splits);
}
}
/**
 * Generate splits for this run.
 *
 * @param input_path where the input files are read from
 * @param job the job configuration to compute splits against
 * @return the computed splits, or null if split generation failed
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
  long block_size = localFs.getDefaultBlockSize();
  log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

  // Set where we'll read the input files from.
  FileInputFormat.setInputPaths(job, input_path);

  // Try splitting the file in a variety of sizes.
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  int numSplits = 1;

  InputSplit[] splits = null;
  try {
    splits = format.getSplits(job, numSplits);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return splits;
}
@Override
public void configure(JobConf job) {
  this.job = job;
  // Forward configuration to the wrapped TextInputFormat.
  format.configure(job);
}