/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapReduce job.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();

    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(HadoopPopularWords.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input format
    // replaces that with a value calculated from the input split size options.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split job into tasks using 32MB split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}
@Override
public List<BoundedDataSource<Pair<K, V>>> split(long desiredSplitSizeBytes) {
    final Job job = newJob();

    // Never go below the configured minimum split size.
    final long splitSize = Math.max(MIN_SPLIT_SIZE, desiredSplitSizeBytes);

    LOG.info(String.format("%s's min and max input split size will be set to %,d.",
        FileInputFormat.class.getSimpleName(), splitSize));

    FileInputFormat.setMinInputSplitSize(job, splitSize);
    FileInputFormat.setMaxInputSplitSize(job, splitSize);

    return doSplit(job);
}
private int runSyncJob(Path source, Path destination, Path tmpDir, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(getConf(), "HDFS Sync job");
    job.setJarByClass(getClass());

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(HdfsSyncMapper.class);
    job.setReducerClass(HdfsSyncReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.getConfiguration().set(SRC_PATH_CONF, source.toString());
    job.getConfiguration().set(DST_PATH_CONF, destination.toString());
    job.getConfiguration().set(TMP_PATH_CONF, tmpDir.toString());

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    FileOutputFormat.setOutputPath(job, new Path(output.toString()));
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
private int runMetastoreCompareJobWithTextInput(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(this.getConf(),
        "Stage1: Metastore Compare Job with Input List");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage1ProcessTableMapperWithTextInput.class);
    job.setReducerClass(Stage1PartitionCompareReducer.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_METASTORE_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
private int runHdfsCopyJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
    LOG.info("Starting job for step 2...");

    Job job = Job.getInstance(this.getConf(), "Stage 2: HDFS Copy Job");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage2DirectoryCopyMapper.class);
    job.setReducerClass(Stage2DirectoryCopyReducer.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_COPY_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    if (success) {
        LOG.info("Job for step 2 finished successfully! To view logging data, run the following "
            + "commands in Hive: \n\n"
            + VelocityUtils.renderTemplate(STEP2_HQL_TEMPLATE, velocityContext) + "\n");
    }

    return success ? 0 : 1;
}
long inputSize = createInputDirectory(fileSys, input, pent, depth);

FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps));
private int runCommitChangeJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
    LOG.info("Starting job for step 3...");

    Job job = Job.getInstance(this.getConf(), "Stage3: Commit Change Job");

    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Stage3CommitChangeMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setMaxInputSplitSize(job,
        this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setNumReduceTasks(getConf().getInt(
        ConfigurationKeys.BATCH_JOB_METASTORE_PARALLELISM, 150));

    boolean success = job.waitForCompletion(true);

    if (success) {
        LOG.info("Job for step 3 finished successfully! To view logging data, run the following "
            + "commands in Hive: \n\n"
            + VelocityUtils.renderTemplate(STEP3_HQL_TEMPLATE, velocityContext));
    }

    return success ? 0 : 1;
}
protected List<InputSplit> computeSplits(long desiredBundleSizeBytes)
    throws IOException, IllegalAccessException, InstantiationException {
    Job job = jobInstance();

    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);

    return createFormat(job).getSplits(job);
}
private List<InputSplit> computeSplits(long desiredBundleSizeBytes,
    SerializableConfiguration serializableConfiguration)
    throws IOException, IllegalAccessException, InstantiationException {
    Job job = SerializableConfiguration.newJob(serializableConfiguration);

    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);

    return createFormat(job).getSplits(job);
}
@Override
public void initialize() {
    Preconditions.checkState(ReaderWriterState.NEW.equals(state),
        "A reader may not be opened more than once - current state:%s", state);

    try {
        FileInputFormat format = InputFormatUtil.newInputFormatInstance(descriptor);
        Job job = Hadoop.Job.newInstance.invoke(conf);

        FileInputFormat.addInputPath(job, path);

        // attempt to minimize the number of InputSplits
        FileStatus stat = fs.getFileStatus(path);
        FileInputFormat.setMaxInputSplitSize(job, stat.getLen());

        this.splits = format.getSplits(job).iterator();
        this.shouldAdvance = true;
        this.state = ReaderWriterState.OPEN;
    } catch (RuntimeException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetOperationException("Cannot calculate splits", e);
    } catch (IOException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetIOException("Cannot calculate splits", e);
    }
}
    random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;

FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(inFile).getLen() / numSplits);

TaskAttemptContext context = MapReduceTestUtil.
FileInputFormat.setMaxInputSplitSize(job, config.getMaxSplitSize());

inputFormatClass = inputFormatProvider.getInputFormatClassName();
Configuration hConf = job.getConfiguration();
int numSplits =
    random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;

FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(file).getLen() / numSplits);

for (InputSplit split : format.getSplits(job)) {
FileInputFormat.setMaxInputSplitSize(job,
    fs.getFileStatus(path).getLen() / numSplits);

MultipleInputs.addInputPath(job, path, TextInputFormat.class,
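// A minimal, self-contained sketch (hypothetical class and method names, not taken from
// any of the projects above) of the pattern the snippets share: bound the input split
// size on a Job before a FileInputFormat computes its splits.
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitSizeSketch {
    /** Returns the splits for {@code input}, each capped at roughly {@code maxSplitBytes}. */
    public static List<InputSplit> splitsFor(Path input, long maxSplitBytes) throws IOException {
        Job job = Job.getInstance(new Configuration(), "split-size-sketch");
        FileInputFormat.setInputPaths(job, input);
        // FileInputFormat picks the effective split size as max(minSize, min(maxSize, blockSize)).
        FileInputFormat.setMinInputSplitSize(job, 1L);
        FileInputFormat.setMaxInputSplitSize(job, maxSplitBytes);
        return new TextInputFormat().getSplits(job);
    }
}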