// Delegate split computation to TextInputFormat, then wrap each resulting
// FileSplit in a SymlinkTextInputSplit that remembers the symlink path.
TextInputFormat inputFormat = new TextInputFormat();
JobConf newjob = new JobConf(job);
newjob.setInputFormat(TextInputFormat.class);
inputFormat.configure(newjob);
InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
for (InputSplit is : iss) {
  result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
}
@Override
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();

  // The target data is in TextInputFormat.
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(job);
  RecordReader innerReader = null;
  try {
    innerReader = inputFormat.getRecordReader(targetSplit, job, reporter);
  } catch (Exception e) {
    innerReader = HiveIOExceptionHandlerUtil
        .handleRecordReaderCreationException(e, job);
  }
  HiveRecordReader rr = new HiveRecordReader(innerReader, job);
  rr.initIOContext((FileSplit) targetSplit, job, TextInputFormat.class, innerReader);
  return rr;
}
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }
  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop input format.
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(
          new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
  TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

  // Create a Flink job with it.
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
  DataSet<Tuple2<Text, LongWritable>> words =
      text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
          .groupBy(0)
          .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
              new Counter(), new Counter()));

  // Set up the Hadoop output format.
  HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, LongWritable>(
          new TextOutputFormat<Text, LongWritable>(), new JobConf());
  hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
  TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

  // Output and execute.
  words.output(hadoopOutputFormat).setParallelism(1);
  env.execute("Hadoop Compat WordCount");
}
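The job above references a Tokenizer mapper and a Counter reducer/combiner that are not shown in the excerpt. A minimal sketch of what they might look like with the classic org.apache.hadoop.mapred interfaces (the class bodies here are assumptions, not the original implementations):

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical Tokenizer: emits (word, 1) for every token in a line.
public static final class Tokenizer
    implements Mapper<LongWritable, Text, Text, LongWritable> {
  public void map(LongWritable key, Text value,
      OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
    for (String token : value.toString().toLowerCase().split("\\W+")) {
      if (!token.isEmpty()) {
        out.collect(new Text(token), new LongWritable(1L));
      }
    }
  }
  public void configure(JobConf job) { }
  public void close() { }
}

// Hypothetical Counter: sums counts per word; it satisfies both the reducer
// and combiner roles, which is why the job passes it twice.
public static final class Counter
    implements Reducer<Text, LongWritable, Text, LongWritable> {
  public void reduce(Text key, Iterator<LongWritable> values,
      OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
    long sum = 0;
    while (values.hasNext()) {
      sum += values.next().get();
    }
    out.collect(key, new LongWritable(sum));
  }
  public void configure(JobConf job) { }
  public void close() { }
}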
@BeforeClass
public void setUp() {
  inputFormat = new TextInputFormat();
  inputFormat.configure(new JobConf());
}
public void testNumInputs() throws Exception {
  JobConf job = new JobConf(conf);
  dfs = newDFSCluster(job);
  FileSystem fs = dfs.getFileSystem();
  System.out.println("FileSystem " + fs.getUri());

  Path inputDir = new Path("/foo/");
  final int numFiles = 10;
  String fileNameBase = "part-0000";
  for (int i = 0; i < numFiles; ++i) {
    createInputs(fs, inputDir, fileNameBase + String.valueOf(i));
  }
  // Names starting with "_" are hidden to FileInputFormat's default path
  // filter, so these two must not be counted.
  createInputs(fs, inputDir, "_meta");
  createInputs(fs, inputDir, "_temp");

  // Split the directory using a file input format.
  TextInputFormat.addInputPath(job, inputDir);
  TextInputFormat inFormat = new TextInputFormat();
  inFormat.configure(job);
  InputSplit[] splits = inFormat.getSplits(job, 1);
  assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
      numFiles, job.getLong(FileInputFormat.NUM_INPUT_FILES, 0));
}
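The reason "_meta" and "_temp" are excluded: FileInputFormat's default filter hides any path whose name starts with "_" or ".". An illustrative stand-in for that filter (this is a sketch of the behavior, not Hadoop's internal class):

// Illustrative equivalent of FileInputFormat's built-in hidden-file filter.
PathFilter hiddenFileFilter = new PathFilter() {
  public boolean accept(Path p) {
    String name = p.getName();
    return !name.startsWith("_") && !name.startsWith(".");
  }
};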
private void task2(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting anchor text (phase 2)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  conf.setJobName(String.format(
      "ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

  // Gathers everything together for convenience; feasible for Wikipedia.
  conf.setNumReduceTasks(1);

  TextInputFormat.addInputPath(conf, new Path(inputPath));
  TextOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(MapFileOutputFormat.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(HMapStIW.class);
  conf.setMapperClass(MyMapper2.class);
  conf.setReducerClass(MyReducer2.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  // Clean up intermediate data.
  FileSystem.get(conf).delete(new Path(inputPath), true);
}
TupleDomain<HiveColumnHandle> effectivePredicate =
    (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;

Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);

boolean s3SelectPushdownEnabled =
    shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());

for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
  // Each symlink target is read as plain text, so splits are computed with a
  // TextInputFormat configured on a per-target JobConf.
  JobConf targetJob = new JobConf(configuration); // assumed declaration; not shown in the excerpt
  TextInputFormat targetInputFormat = new TextInputFormat();
  targetJob.setInputFormat(TextInputFormat.class);
  targetInputFormat.configure(targetJob);
  FileInputFormat.setInputPaths(targetJob, targetPath);
  InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
  // ... (the rest of the loop body is elided in this excerpt)
}
TextInputFormat inputFormat = new TextInputFormat();
JobConf job = new JobConf(fsConf);
job.setInt("io.file.buffer.size",
    context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
job.setInputFormat(inputFormat.getClass());
try {
  reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
  key = reader.createKey();
  value = reader.createValue();
} catch (IOException e) {
  // ... (error handling is elided in this excerpt)
}
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestErrorOnNonExistantDir(conf, localFs);
  JobConf jobConf = new JobConf(conf);

  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  try {
    fif.listStatus(jobConf);
    Assert.fail("Expecting an IOException for a missing Input path");
  } catch (IOException e) {
    Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
    expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
    Assert.assertTrue(e instanceof InvalidInputException);
    Assert.assertEquals(
        "Input path does not exist: " + expectedExceptionPath.toString(),
        e.getMessage());
  }
}
Path output, OperationsParams params) throws IOException {
  JobConf job = new JobConf(params, CatUnion.class);
  job.setJobName("Union");

  FileSystem outFs = output.getFileSystem(job);
  if (outFs.exists(output)) {
    if (params.getBoolean("overwrite", false)) {
      // ... (handling of the existing output directory is elided in this excerpt)
    }
  }

  job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

  job.setInputFormat(ShapeLineInputFormat.class);
  TextInputFormat.addInputPath(job, shapeFile);
  DistributedCache.addCacheFile(categoryFile.toUri(), job);
  job.setOutputFormat(TextOutputFormat.class);
public static void main(String[] args) throws Exception {
  System.setProperty("hazelcast.logging.type", "log4j");

  Path inputPath = new Path(
      HadoopWordCount.class.getClassLoader().getResource("books").getPath());
  Path outputPath = new Path(OUTPUT_PATH);

  // Route input and output through the classic mapred text formats.
  JobConf jobConfig = new JobConf();
  jobConfig.setInputFormat(TextInputFormat.class);
  jobConfig.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(jobConfig, outputPath);
  TextInputFormat.addInputPath(jobConfig, inputPath);
  // ... (pipeline definition and execution are elided in this excerpt)
}
@Override
public void setFile(String file, long offset, long length) {
  JobConf defaultConf = new JobConf();
  this.split = new FileSplit(new Path(file), offset, length, defaultConf);
  this.hasMore = true;
  this.jobConf = defaultConf;
  this.input_format = new TextInputFormat();
  try {
    this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  this.key = reader.createKey();
}
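After setFile returns, the reader is positioned at the start of the split. A minimal sketch of draining it with the classic RecordReader API (the local variables here are assumptions; for TextInputFormat the key is the byte offset and the value is the line):

// Minimal read loop over the reader created in setFile (sketch).
LongWritable key = (LongWritable) reader.createKey();
Text value = (Text) reader.createValue();
while (reader.next(key, value)) {
  System.out.println(key.get() + "\t" + value); // offset, then line text
}
reader.close();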
private InputSplit[] getSplits(JobConf conf, int numSplits, String path) throws Exception {
  FileInputFormat.setInputPaths(conf, new Path(path));
  if (inputFormat == null) {
    inputFormat = inputFormatClass.newInstance();
    String inputFormatClassName = inputFormatClass.getName();
    // Both text formats require an explicit configure() call before use.
    if (inputFormatClassName.equals("org.apache.hadoop.mapred.TextInputFormat")) {
      ((TextInputFormat) inputFormat).configure(conf);
    } else if (inputFormatClassName.equals("org.apache.hadoop.mapred.KeyValueTextInputFormat")) {
      ((KeyValueTextInputFormat) inputFormat).configure(conf);
    }
  }
  return inputFormat.getSplits(conf, numSplits);
}
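Dispatching on class-name strings is brittle. Both TextInputFormat and KeyValueTextInputFormat implement JobConfigurable in the mapred API, so an instanceof check covers them, and any other configurable format, in one branch; ReflectionUtils.newInstance(inputFormatClass, conf) would even perform this configuration automatically. A sketch of the instanceof variant:

// More general dispatch: configure any format that opts into JobConfigurable.
if (inputFormat instanceof JobConfigurable) {
  ((JobConfigurable) inputFormat).configure(conf);
}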
private static void doSingleBzip2BufferSize(JobConf jConf) throws IOException {
  TextInputFormat format = new TextInputFormat();
  format.configure(jConf);
  format.setMinSplitSize(5500); // work around 256-byte/22-splits issue

  InputSplit[] splits = format.getSplits(jConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);

  // Normalize the split order, swapping if the gzip file came first.
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("testCompressThenConcat.txt.gz")) {
    System.out.println("  (swapping)");
    splits[0] = splits[1];
    splits[1] = tmp; // finish the swap; the rest of the test is elided in this excerpt
  }
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);

  // Sniff the first three bytes to distinguish SequenceFiles ("SEQ" magic)
  // from plain text.
  byte[] header = new byte[3];
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    // File is too short to hold the magic number: treat it as text.
    // (Returning here also avoids reading the uninitialized header below.)
    return textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }

  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
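The textInputFormat and seqFileInputFormat delegates are not declared in the excerpt. A plausible sketch of those fields (the names match the snippet; the initialization is an assumption), keeping in mind that TextInputFormat needs configure() before it can handle codec-compressed input:

// Assumed delegate fields for the sniffing format above.
private final TextInputFormat textInputFormat = new TextInputFormat();
private final SequenceFileInputFormat seqFileInputFormat = new SequenceFileInputFormat();

public void configure(JobConf job) {
  textInputFormat.configure(job); // registers compression codecs
}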
@Test
public void testListStatusNestedNonRecursive() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestNestedNonRecursive(conf, localFs);
  JobConf jobConf = new JobConf(conf);

  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses), localFs);
}
/**
 * Hack to configure InputFormats before they get used.
 * @param inputFormat InputFormat to configure
 * @param conf Configuration to use
 */
public static void configureInputFormat(InputFormat inputFormat, Configuration conf) {
  JobConf jobConf = new JobConf(conf);
  setJobConfIfPossible(inputFormat, jobConf);
  // TextInputFormat is not always JobConfigurable, so we need to explicitly
  // call this here to make sure it gets configured with the compression codecs.
  if (inputFormat instanceof TextInputFormat) {
    ((TextInputFormat) inputFormat).configure(jobConf);
  }
}
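A minimal usage sketch, assuming a caller that needs the codec list visible to the format before computing splits (the property value shown is only an example):

// Hypothetical caller: configure the format before asking it for splits.
Configuration conf = new Configuration();
conf.set("io.compression.codecs", "org.apache.hadoop.io.compress.GzipCodec");
TextInputFormat format = new TextInputFormat();
configureInputFormat(format, conf);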
@Override
public InputSplit[] getSplits(JobConf job, int splits) throws IOException {
  // Ensure that the table properties were copied.
  assertEquals("val1", job.get("myprop1"));
  assertEquals("val2", job.get("myprop2"));

  // Ensure that both of the partitions are in the complete list.
  String[] dirs = job.get("hive.complete.dir.list").split("\t");
  assertEquals(2, dirs.length);
  Arrays.sort(dirs);
  assertTrue(dirs[0].endsWith("/state=CA"));
  assertTrue(dirs[1].endsWith("/state=OR"));
  return super.getSplits(job, splits);
}
}
/**
 * Generate splits for this run.
 *
 * @param input_path where the input files are read from
 * @param job the job configuration to compute splits against
 * @return the computed splits, or null if split generation failed
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
  long block_size = localFs.getDefaultBlockSize();
  log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

  // Set where we'll read the input files from.
  FileInputFormat.setInputPaths(job, input_path);

  // Try splitting the file in a variety of sizes.
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  int numSplits = 1;

  InputSplit[] splits = null;
  try {
    splits = format.getSplits(job, numSplits);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return splits;
}
@Override
public void configure(JobConf job) {
  this.job = job;
  // Forward configuration to the wrapped TextInputFormat.
  format.configure(job);
}