@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
  // Recompute splits against a copy of the job so the caller's context is untouched.
  Job modifiedJob = Job.getInstance(cx.getConfiguration());
  setSplitSize(modifiedJob);
  FileInputFormat.setInputDirRecursive(modifiedJob, true);
  return cleanSplits(super.getSplits(modifiedJob));
}
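The setSplitSize and cleanSplits helpers called above are not shown in the excerpt. A minimal sketch of what they might look like, assuming setSplitSize caps the split size from the job's own configuration and cleanSplits drops empty splits; both bodies are assumptions, not the original implementation:

// Hypothetical helpers assumed by the getSplits() override above.
private void setSplitSize(Job job) {
  // Assumption: cap the split size, falling back to the same 60000-byte
  // default used by the driver code later in this listing.
  long maxSize = job.getConfiguration().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L);
  FileInputFormat.setMaxInputSplitSize(job, maxSize);
}

private List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
  // Assumption: filter out zero-length splits so no mapper is scheduled for empty work.
  List<InputSplit> cleaned = new ArrayList<>();
  for (InputSplit split : splits) {
    try {
      if (split.getLength() > 0) {
        cleaned.add(split);
      }
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }
  return cleaned;
}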
private int runSyncJob(Path source, Path destination, Path tmpDir, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Job.getInstance(...) replaces the deprecated new Job(...) constructor.
  Job job = Job.getInstance(getConf(), "HDFS Sync job");
  job.setJarByClass(getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(HdfsSyncMapper.class);
  job.setReducerClass(HdfsSyncReducer.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);

  // Pass the sync endpoints to the tasks through the job configuration.
  job.getConfiguration().set(SRC_PATH_CONF, source.toString());
  job.getConfiguration().set(DST_PATH_CONF, destination.toString());
  job.getConfiguration().set(TMP_PATH_CONF, tmpDir.toString());

  FileInputFormat.setInputPaths(job, input);
  FileInputFormat.setInputDirRecursive(job, true);
  FileInputFormat.setMaxInputSplitSize(job,
      getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

  FileOutputFormat.setOutputPath(job, output); // output is already a Path
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
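HdfsSyncMapper and HdfsSyncReducer are referenced but not shown. A minimal sketch of the mapper side, assuming it reads the paths back out of the configuration keys set by the driver; the class body and emitted values are assumptions, not the original code:

// Hypothetical sketch of the mapper wired in above; not the original class.
// Input types match TextInputFormat; output types match the job's key/value classes.
public static class HdfsSyncMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
  private Path srcPath;
  private Path dstPath;

  @Override
  protected void setup(Context context) {
    // The driver stored these under SRC_PATH_CONF / DST_PATH_CONF.
    Configuration conf = context.getConfiguration();
    srcPath = new Path(conf.get(SRC_PATH_CONF));
    dstPath = new Path(conf.get(DST_PATH_CONF));
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Assumption: each input line names one entry to compare between srcPath and dstPath.
    context.write(key, value);
  }
}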
private int runHdfsCopyJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
  LOG.info("Starting job for step 2...");
  Job job = Job.getInstance(this.getConf(), "Stage 2: HDFS Copy Job");
  job.setJarByClass(this.getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(Stage2DirectoryCopyMapper.class);
  job.setReducerClass(Stage2DirectoryCopyReducer.class);

  FileInputFormat.setInputPaths(job, input);
  FileInputFormat.setInputDirRecursive(job, true);
  FileInputFormat.setMaxInputSplitSize(job,
      this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  // Copy parallelism is tunable; defaults to 150 reducers.
  job.setNumReduceTasks(getConf().getInt(ConfigurationKeys.BATCH_JOB_COPY_PARALLELISM, 150));

  boolean success = job.waitForCompletion(true);
  if (success) {
    LOG.info("Job for step 2 finished successfully! To view logging data, run the following "
        + "commands in Hive: \n\n"
        + VelocityUtils.renderTemplate(STEP2_HQL_TEMPLATE, velocityContext) + "\n");
  }
  return success ? 0 : 1;
}
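Both drivers read FileInputFormat.SPLIT_MAXSIZE (the "mapreduce.input.fileinputformat.split.maxsize" property) with an unusually small 60000-byte default. The inputs here are presumably text listings where each line triggers real work (a file copy or a metastore call), so a tight cap on split size fans that work out across many mappers. To change the cap for one run, set it on the configuration before the job is built; a minimal sketch, where the 120000 value is an arbitrary example:

// Raising the split-size cap before the job object is created.
// FileInputFormat.SPLIT_MAXSIZE == "mapreduce.input.fileinputformat.split.maxsize".
getConf().setLong(FileInputFormat.SPLIT_MAXSIZE, 120000L); // arbitrary example value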
private int runCommitChangeJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, TemplateRenderException {
  LOG.info("Starting job for step 3...");
  Job job = Job.getInstance(this.getConf(), "Stage 3: Commit Change Job");
  job.setJarByClass(this.getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(Stage3CommitChangeMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, input);
  FileInputFormat.setInputDirRecursive(job, true);
  FileInputFormat.setMaxInputSplitSize(job,
      this.getConf().getLong(FileInputFormat.SPLIT_MAXSIZE, 60000L));
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  // The original called setNumReduceTasks(0) and then overrode it here; only the
  // last call takes effect, so the dead setNumReduceTasks(0) has been dropped.
  // No reducer class is set, so the identity reducer runs at this parallelism.
  job.setNumReduceTasks(getConf().getInt(ConfigurationKeys.BATCH_JOB_METASTORE_PARALLELISM, 150));

  boolean success = job.waitForCompletion(true);
  if (success) {
    LOG.info("Job for step 3 finished successfully! To view logging data, run the following "
        + "commands in Hive: \n\n"
        + VelocityUtils.renderTemplate(STEP3_HQL_TEMPLATE, velocityContext));
  }
  return success ? 0 : 1;
}
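The three stage methods are presumably chained from a single Tool driver. A hypothetical sketch of such a run() method; the argument order and intermediate path layout under tmpDir are assumptions:

// Hypothetical driver chaining the three stages; path layout and args are assumed.
@Override
public int run(String[] args) throws Exception {
  Path tmpDir = new Path(args[0]);
  Path step1Out = new Path(tmpDir, "step1");
  Path step2Out = new Path(tmpDir, "step2");
  Path step3Out = new Path(tmpDir, "step3");

  // Each stage consumes the previous stage's output listing.
  int ret = runSyncJob(new Path(args[1]), new Path(args[2]), tmpDir,
      new Path(args[3]), step1Out);
  if (ret == 0) {
    ret = runHdfsCopyJob(step1Out, step2Out);
  }
  if (ret == 0) {
    ret = runCommitChangeJob(step2Out, step3Out);
  }
  return ret;
}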
job.setMapOutputValueClass(LongWritable.class);
job.setInputFormatClass(RioFileInputFormat.class);
FileInputFormat.setInputDirRecursive(job, true);
FileInputFormat.setInputPaths(job, source);
TableMapReduceUtil.addDependencyJars(job);
try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true,
    getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
  HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
      hTable.getRegionLocator());
  FileInputFormat.setInputDirRecursive(job, true);
  FileInputFormat.setInputPaths(job, source);
  FileOutputFormat.setOutputPath(job, new Path(workdir));
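The excerpt above ends mid-try-block: configureIncrementalLoad only arranges for the job to write HFiles under workdir, and those files still have to be handed to the region servers once the job succeeds. A sketch of how the block plausibly continues, assuming an HBase 1.x-era LoadIncrementalHFiles (the class has moved and been renamed in later HBase versions):

  // Sketch of the likely continuation: run the job, then bulk-load the HFiles.
  // LoadIncrementalHFiles is version-dependent (org.apache.hadoop.hbase.mapreduce in 1.x).
  if (job.waitForCompletion(true)) {
    new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
    return 0;
  }
  return -1;
}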
FileInputFormat.setInputPathFilter(job, RegexPathFilter.class);
FileInputFormat.setInputDirRecursive(job, config.shouldReadRecursively());
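RegexPathFilter here is application code, not part of Hadoop. A minimal sketch of how such a filter is typically written, assuming the pattern arrives through the job configuration; the property name and the directory handling are assumptions:

// Hypothetical filter; the "regex.path.filter.pattern" key is invented for this sketch.
// FileInputFormat instantiates the class via ReflectionUtils, which calls setConf()
// on Configurable implementations, so extending Configured exposes the job conf.
public class RegexPathFilter extends Configured implements PathFilter {
  @Override
  public boolean accept(Path path) {
    String pattern = getConf().get("regex.path.filter.pattern", ".*");
    // Note: with recursive listing the filter also sees directories; a fuller
    // version would accept directories unconditionally so recursion can descend.
    return path.getName().matches(pattern);
  }
}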