org.apache.hadoop.mapred.TextInputFormat.getSplits Java code examples

/**
 * Computes the input splits by handing the request to the wrapped {@code format}.
 *
 * @param job the job configuration, passed through unchanged
 * @param numSplits the split-count argument, passed through unchanged
 * @return whatever {@code format.getSplits} returns
 * @throws IOException if the wrapped format throws it
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
 final InputSplit[] delegated = format.getSplits(job, numSplits);
 return delegated;
}

 /**
  * Checks that the job carries the expected table properties and partition
  * directory list, then defers to the parent implementation for the splits.
  *
  * @param job the job configuration to inspect and pass on
  * @param splits the split-count argument passed on to the parent
  * @return the splits produced by {@code super.getSplits}
  * @throws IOException if the parent implementation throws it
  */
 @Override
 public InputSplit[] getSplits(JobConf job, int splits) throws IOException {
  // ensure that the table properties were copied
  final String firstProp = job.get("myprop1");
  assertEquals("val1", firstProp);
  final String secondProp = job.get("myprop2");
  assertEquals("val2", secondProp);

  // ensure that both of the partitions are in the complete list.
  final String completeDirList = job.get("hive.complete.dir.list");
  final String[] partitionDirs = completeDirList.split("\t");
  assertEquals(2, partitionDirs.length);

  // sort so the CA partition is checked before the OR partition
  Arrays.sort(partitionDirs);
  final String firstDir = partitionDirs[0];
  assertEquals(true, firstDir.endsWith("/state=CA"));
  final String secondDir = partitionDirs[1];
  assertEquals(true, secondDir.endsWith("/state=OR"));

  return super.getSplits(job, splits);
 }
}

// Ask the underlying input format for the splits of newjob, then wrap each
// one (cast to FileSplit) in a SymlinkTextInputSplit built with symlinkPath
// and collect it in result.
InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
for (InputSplit is : iss) {
 result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit)is));

// Compute the splits of newjob with the underlying input format; every split
// is cast to FileSplit, paired with symlinkPath in a SymlinkTextInputSplit,
// and appended to result.
InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
for (InputSplit is : iss) {
 result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit)is));

// Configure the target format with the target job, point the job's input
// at targetPath, then compute the splits (0 is passed as the split-count
// argument).
targetInputFormat.configure(targetJob);
FileInputFormat.setInputPaths(targetJob, targetPath);
InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

/**
 * Splits {@code test:///a1} with the non-recursive "ignore sub-directories"
 * flag switched on and expects exactly one split.
 */
@Test
public void testIgnoreDirs() throws Exception {
 final Configuration configuration = getConfiguration();
 configuration.setBoolean(
   FileInputFormat.INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, true);
 configuration.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
 configuration.set(
   org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
   "test:///a1");

 // resolve the file system for the test scheme; the cast requires it to be
 // a MockFileSystem
 final Path testRoot = new Path("test:///");
 MockFileSystem testFs = (MockFileSystem) testRoot.getFileSystem(configuration);

 final JobConf jobConf = new JobConf(configuration);
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);

 final InputSplit[] computed = textFormat.getSplits(jobConf, 1);
 Assert.assertEquals("Input splits are not correct", 1, computed.length);

 FileSystem.closeAll();
}

/**
 * Computes splits with the {@code org.apache.hadoop.mapred} text input
 * format, writes the split files with {@code JobSplitWriter}, and then runs
 * {@code validateSplitMetaInfo()}.
 */
@Test
public void testMaxBlockLocationsOldSplitsWithErasureCoding()
  throws Exception {
 final JobConf oldApiJob = new JobConf(conf);

 final org.apache.hadoop.mapred.TextInputFormat textFormat =
   new org.apache.hadoop.mapred.TextInputFormat();
 textFormat.configure(oldApiJob);

 final org.apache.hadoop.mapred.InputSplit[] oldApiSplits =
   textFormat.getSplits(oldApiJob, 1);

 JobSplitWriter.createSplitFiles(submitDir, conf, fs, oldApiSplits);
 validateSplitMetaInfo();
}

/**
 * Splits {@code test:///a1/a2} and expects two splits, with the mock file
 * system's {@code listLocatedStatus} call counter going from zero to one.
 */
@Test
public void testListLocatedStatus() throws Exception {
 final Configuration configuration = getConfiguration();
 configuration.setBoolean("fs.test.impl.disable.cache", false);
 configuration.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
 configuration.set(
   org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
   "test:///a1/a2");

 final Path testRoot = new Path("test:///");
 final MockFileSystem countingFs =
   (MockFileSystem) testRoot.getFileSystem(configuration);

 // the counter must still be at zero before any split is computed
 Assert.assertEquals("listLocatedStatus already called",
   0, countingFs.numListLocatedStatusCalls);

 final JobConf jobConf = new JobConf(configuration);
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);
 final InputSplit[] computed = textFormat.getSplits(jobConf, 1);

 Assert.assertEquals("Input splits are not correct", 2, computed.length);
 Assert.assertEquals("listLocatedStatuss calls",
   1, countingFs.numListLocatedStatusCalls);

 FileSystem.closeAll();
}

/**
 * Forwards the split computation to the wrapped {@code format}.
 *
 * @param job the job configuration handed to the wrapped format
 * @param numSplits the split-count argument handed to the wrapped format
 * @return the array returned by {@code format.getSplits}
 * @throws IOException if the wrapped format throws it
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
 final InputSplit[] fromDelegate = format.getSplits(job, numSplits);
 return fromDelegate;
}

/**
 * Returns the splits computed by the wrapped {@code format}; nothing is added
 * or filtered here.
 *
 * @param job the job configuration forwarded to {@code format}
 * @param numSplits the split-count argument forwarded to {@code format}
 * @return the result of {@code format.getSplits(job, numSplits)}
 * @throws IOException if {@code format.getSplits} throws it
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
 final InputSplit[] result = format.getSplits(job, numSplits);
 return result;
}

/**
 * Delegating implementation: the call is answered entirely by {@code format}.
 *
 * @param job the job configuration given to {@code format}
 * @param numSplits the split-count argument given to {@code format}
 * @return the splits obtained from {@code format}
 * @throws IOException if the delegate call throws it
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
 final InputSplit[] delegateSplits = format.getSplits(job, numSplits);
 return delegateSplits;
}

/**
 * Obtains the input splits from the wrapped {@code format} and returns them
 * as-is.
 *
 * @param job the job configuration relayed to {@code format}
 * @param numSplits the split-count argument relayed to {@code format}
 * @return the array produced by {@code format.getSplits}
 * @throws IOException if {@code format.getSplits} throws it
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
 final InputSplit[] relayed = format.getSplits(job, numSplits);
 return relayed;
}

/**
 * Splits {@code test:///a1/a2} and inspects the first split: it must report
 * two locations, and the location info for "localhost" must be on disk and in
 * memory while the one for "otherhost" must be on disk only.
 */
@Test
public void testSplitLocationInfo() throws Exception {
 final Configuration configuration = getConfiguration();
 configuration.set(
   org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
   "test:///a1/a2");

 final JobConf jobConf = new JobConf(configuration);
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);

 final FileSplit[] fileSplits = (FileSplit[]) textFormat.getSplits(jobConf, 1);
 final FileSplit firstSplit = fileSplits[0];

 final String[] hosts = firstSplit.getLocations();
 Assert.assertEquals(2, hosts.length);
 final SplitLocationInfo[] hostInfos = firstSplit.getLocationInfo();
 Assert.assertEquals(2, hostInfos.length);

 // pick the info entry by checking which host name sits at index 0
 final SplitLocationInfo localhostInfo;
 if (hosts[0].equals("localhost")) {
  localhostInfo = hostInfos[0];
 } else {
  localhostInfo = hostInfos[1];
 }
 final SplitLocationInfo otherhostInfo;
 if (hosts[0].equals("otherhost")) {
  otherhostInfo = hostInfos[0];
 } else {
  otherhostInfo = hostInfos[1];
 }

 Assert.assertTrue(localhostInfo.isOnDisk());
 Assert.assertTrue(localhostInfo.isInMemory());
 Assert.assertTrue(otherhostInfo.isOnDisk());
 Assert.assertFalse(otherhostInfo.isInMemory());
}

/**
 * Writes an empty gzip-compressed file into the work directory and checks
 * that it still yields exactly one split, which reads back as zero records.
 */
@Test
public void testGzipEmpty() throws IOException {
 final JobConf jobConf = new JobConf(defaultConf);
 final CompressionCodec codec = new GzipCodec();
 ReflectionUtils.setConf(codec, jobConf);

 // start from a clean work directory holding only the empty archive
 localFs.delete(workDir, true);
 final Path emptyArchive = new Path(workDir, "empty.gz");
 writeFile(localFs, emptyArchive, codec, "");
 FileInputFormat.setInputPaths(jobConf, workDir);

 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);
 final InputSplit[] computed = textFormat.getSplits(jobConf, 100);
 assertEquals("Compressed files of length 0 are not returned from FileInputFormat.getSplits().",
        1, computed.length);

 final List<Text> records = readSplit(textFormat, computed[0], jobConf);
 assertEquals("Compressed empty file length == 0", 0, records.size());
}

/**
 * Generates the input splits for this run, requesting a single split from a
 * {@code TextInputFormat} configured with the given job.
 *
 * @param input_path path registered as the job's input
 * @param job job configuration that is updated with the input path and used
 *     to compute the splits
 * @return the computed splits, or {@code null} when computing them throws an
 *     {@code IOException} (the error is logged)
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
  final long blockSizeBytes = localFs.getDefaultBlockSize();
  final long blockSizeMb = blockSizeBytes / 1024 / 1024;
  log.info("default block size: " + blockSizeMb + "MB");

  // ---- set where we'll read the input files from -------------
  FileInputFormat.setInputPaths(job, input_path);

  final TextInputFormat textFormat = new TextInputFormat();
  textFormat.configure(job);

  final int requestedSplits = 1;
  try {
    return textFormat.getSplits(job, requestedSplits);
  } catch (IOException e) {
    log.error("Error with splits",e);
    return null;
  }
}

/**
 * Generates the input splits for this run, requesting a single split from a
 * {@code TextInputFormat} configured with the given job.
 *
 * @param input_path path registered as the job's input
 * @param job job configuration that is updated with the input path and used
 *     to compute the splits
 * @return the computed splits, or {@code null} when computing them throws an
 *     {@code IOException} (the error is logged)
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
  long block_size = localFs.getDefaultBlockSize();
  log.info("default block size: " + (block_size / 1024 / 1024)
      + "MB");
  // ---- set where we'll read the input files from -------------
  FileInputFormat.setInputPaths(job, input_path);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  int numSplits = 1;
  InputSplit[] splits = null;
  try {
    splits = format.getSplits(job, numSplits);
  } catch (IOException e) {
    // Report through the logger this method already uses instead of dumping
    // the stack trace to stderr; null is still returned so existing callers
    // see the same result on failure.
    log.error("Error with splits", e);
  }
  return splits;
}

/**
 * Creates ten {@code part-0000N} inputs plus {@code _meta} and {@code _temp}
 * under {@code /foo/}, computes splits, and expects the job's
 * {@code NUM_INPUT_FILES} value to equal ten.
 */
public void testNumInputs() throws Exception {
 final JobConf jobConf = new JobConf(conf);
 dfs = newDFSCluster(jobConf);
 final FileSystem clusterFs = dfs.getFileSystem();
 System.out.println("FileSystem " + clusterFs.getUri());

 final Path inputDir = new Path("/foo/");
 final int numFiles = 10;
 final String namePrefix = "part-0000";
 int fileIndex = 0;
 while (fileIndex < numFiles) {
  createInputs(clusterFs, inputDir, namePrefix + fileIndex);
  fileIndex++;
 }
 // two extra entries whose names start with an underscore
 createInputs(clusterFs, inputDir, "_meta");
 createInputs(clusterFs, inputDir, "_temp");

 // split it using a file input format
 TextInputFormat.addInputPath(jobConf, inputDir);
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);
 // the returned array is not inspected; the assertion reads the job instead
 InputSplit[] computed = textFormat.getSplits(jobConf, 1);

 assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
        numFiles, jobConf.getLong(FileInputFormat.NUM_INPUT_FILES, 0));
}

/**
 * Splits the input configured in {@code jConf}, expects exactly two splits,
 * and checks the line count plus two sample lines of each split.
 *
 * @param jConf job configuration used to configure the format, compute the
 *     splits and read them back
 * @throws IOException propagated from the calls made in the body
 */
private static void doSingleGzipBufferSize(JobConf jConf) throws IOException {
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jConf);

 // here's Nth pair of DecompressorStreams:
 final InputSplit[] gzSplits = textFormat.getSplits(jConf, 100);
 assertEquals("compressed splits == 2", 2, gzSplits.length);

 // if testCompressThenConcat.txt.gz came first, move it to the second slot
 final FileSplit leading = (FileSplit) gzSplits[0];
 final String leadingName = leading.getPath().getName();
 if (leadingName.equals("testCompressThenConcat.txt.gz")) {
  System.out.println("  (swapping)");
  gzSplits[0] = gzSplits[1];
  gzSplits[1] = leading;
 }

 final List<Text> firstLines = readSplit(textFormat, gzSplits[0], jConf);
 assertEquals("splits[0] length (num lines)", 84, firstLines.size());
 final String firstOpening = firstLines.get(0).toString();
 assertEquals("splits[0][0]",
  "Call me Ishmael. Some years ago--never mind how long precisely--having",
  firstOpening);
 final String firstMiddle = firstLines.get(42).toString();
 assertEquals("splits[0][42]",
  "Tell me, does the magnetic virtue of the needles of the compasses of",
  firstMiddle);

 final List<Text> secondLines = readSplit(textFormat, gzSplits[1], jConf);
 assertEquals("splits[1] length (num lines)", 84, secondLines.size());
 final String secondOpening = secondLines.get(0).toString();
 assertEquals("splits[1][0]",
  "Call me Ishmael. Some years ago--never mind how long precisely--having",
  secondOpening);
 final String secondMiddle = secondLines.get(42).toString();
 assertEquals("splits[1][42]",
  "Tell me, does the magnetic virtue of the needles of the compasses of",
  secondMiddle);
}

/**
 * Populates {@code /foo/} with ten {@code part-0000N} inputs and the two
 * extra entries {@code _meta} and {@code _temp}, computes splits, and checks
 * that the job reports ten under {@code NUM_INPUT_FILES}.
 */
public void testNumInputs() throws Exception {
 final JobConf jobConf = new JobConf(conf);
 dfs = newDFSCluster(jobConf);
 final FileSystem clusterFs = dfs.getFileSystem();
 System.out.println("FileSystem " + clusterFs.getUri());

 final Path inputDir = new Path("/foo/");
 final int numFiles = 10;
 final String namePrefix = "part-0000";
 for (int created = 0; created != numFiles; created++) {
  final String fileName = namePrefix + created;
  createInputs(clusterFs, inputDir, fileName);
 }
 createInputs(clusterFs, inputDir, "_meta");
 createInputs(clusterFs, inputDir, "_temp");

 // split it using a file input format
 TextInputFormat.addInputPath(jobConf, inputDir);
 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);
 // only the job's counter is asserted on, not the returned splits
 InputSplit[] computed = textFormat.getSplits(jobConf, 1);

 final long reportedFiles = jobConf.getLong(FileInputFormat.NUM_INPUT_FILES, 0);
 assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
        numFiles, reportedFiles);
}

/**
 * Uses the gzip codec on an empty input file: the file must still produce
 * exactly one split, and reading that split must return no records.
 */
@Test (timeout=5000)
public void testGzipEmpty() throws IOException {
 final JobConf jobConf = new JobConf(defaultConf);
 final CompressionCodec codec = new GzipCodec();
 ReflectionUtils.setConf(codec, jobConf);

 // wipe the work directory, then create the empty archive in it
 localFs.delete(workDir, true);
 final Path emptyArchive = new Path(workDir, "empty.gz");
 writeFile(localFs, emptyArchive, codec, "");
 FileInputFormat.setInputPaths(jobConf, workDir);

 final TextInputFormat textFormat = new TextInputFormat();
 textFormat.configure(jobConf);
 final InputSplit[] computed = textFormat.getSplits(jobConf, 100);
 assertEquals("Compressed files of length 0 are not returned from FileInputFormat.getSplits().",
        1, computed.length);

 final List<Text> records = readSplit(textFormat, computed[0], jobConf);
 assertEquals("Compressed empty file length == 0", 0, records.size());
}

Popular methods of TextInputFormat

Popular in Java

Finding current android device location
setContentView (Activity)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
runOnUiThread (Activity)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
Connection (java.sql)
A connection represents a link from a Java application to a database. All SQL statements and results
Date (java.sql)
A class which can consume and produce dates in SQL Date format. Dates are represented in SQL as yyyy
Locale (java.util)
Locale represents a language/country/variant combination. Locales are used to alter the presentatio
TimerTask (java.util)
The TimerTask class represents a task to run at a specified time. The task may be run once or repeat
Scheduler (org.quartz)
This is the main interface of a Quartz Scheduler. A Scheduler maintains a registry of org.quartz.Job
Top Vim plugins

How to use the getSplits method in org.apache.hadoop.mapred.TextInputFormat

Best Java code snippets using org.apache.hadoop.mapred.TextInputFormat.getSplits (Showing top 20 results out of 315)

How to use
getSplits
method
in
org.apache.hadoop.mapred.TextInputFormat