public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx) throws Exception {
  if (ctx != null) {
    ContentSummary cs = ctx.getCS(dirPath);
    if (cs != null) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Content Summary cached for {} length: {} num files: {} num directories: {}",
            dirPath, cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
      }
      return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
    } else {
      LOG.debug("Content Summary not cached for {}", dirPath);
    }
  }
  return isEmptyPath(job, dirPath);
}
public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx) throws Exception {
  if (ctx != null) {
    ContentSummary cs = ctx.getCS(dirPath);
    if (cs != null) {
      LOG.info("Content Summary " + dirPath + "length: " + cs.getLength()
          + " num files: " + cs.getFileCount()
          + " num directories: " + cs.getDirectoryCount());
      return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
    } else {
      LOG.info("Content Summary not cached for " + dirPath);
    }
  }
  return isEmptyPath(job, dirPath);
}
public boolean isEmpty(Path path) throws IOException, MetaException {
  ContentSummary contents = getFs(path).getContentSummary(path);
  if (contents != null && contents.getFileCount() == 0 && contents.getDirectoryCount() == 1) {
    return true;
  }
  return false;
}
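For context, here is a minimal self-contained sketch of the same emptiness check against a plain Hadoop FileSystem: an empty directory reports fileCount == 0 and directoryCount == 1 (the directory itself). The class name PathEmptinessCheck and the use of a default Configuration are illustrative assumptions, not part of the snippet above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class PathEmptinessCheck {
  // Hypothetical helper: true when 'path' is a directory with no files and no subdirectories.
  public static boolean isEmptyDir(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    ContentSummary contents = fs.getContentSummary(path);
    return contents.getFileCount() == 0 && contents.getDirectoryCount() == 1;
  }
}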
@Override
public ContentSummary getContentSummary(Path p, JobConf job) throws IOException {
  // length, file count, directory count
  long[] summary = {0, 0, 0};
  List<Path> targetPaths = new ArrayList<Path>();
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(job, new Path[]{p}, targetPaths, symlinkPaths);
  } catch (Exception e) {
    throw new IOException("Error parsing symlinks from specified job input path.", e);
  }
  for (Path path : targetPaths) {
    FileSystem fs = path.getFileSystem(job);
    ContentSummary cs = fs.getContentSummary(path);
    summary[0] += cs.getLength();
    summary[1] += cs.getFileCount();
    summary[2] += cs.getDirectoryCount();
  }
  return new ContentSummary(summary[0], summary[1], summary[2]);
}
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
summary[2] += cs.getDirectoryCount();
LOG.info("Cache Content Summary for {} length: {} file count: {} directory count: {}",
    entry.getKey(), cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
@Override
public ContentSummary getContentSummary(Path f) throws IOException {
  // HarFileSystem has a bug where this method does not work properly
  // if the underlying FS is HDFS. See MAPREDUCE-1877 for more
  // information. This method is from FileSystem.
  FileStatus status = getFileStatus(f);
  if (!status.isDir()) {
    // f is a file
    return new ContentSummary(status.getLen(), 1, 0);
  }
  // f is a directory
  long[] summary = {0, 0, 1};
  for (FileStatus s : listStatus(f)) {
    ContentSummary c = s.isDir() ? getContentSummary(s.getPath())
        : new ContentSummary(s.getLen(), 1, 0);
    summary[0] += c.getLength();
    summary[1] += c.getFileCount();
    summary[2] += c.getDirectoryCount();
  }
  return new ContentSummary(summary[0], summary[1], summary[2]);
}
}
@Override
public int hashCode() {
  long result = getLength() ^ getFileCount() ^ getDirectoryCount()
      ^ getSnapshotLength() ^ getSnapshotFileCount() ^ getSnapshotDirectoryCount()
      ^ getSnapshotSpaceConsumed() ^ getErasureCodingPolicy().hashCode();
  return ((int) result) ^ super.hashCode();
}
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength()
    + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
@Override
public boolean equals(Object to) {
  if (this == to) {
    return true;
  } else if (to instanceof ContentSummary) {
    ContentSummary right = (ContentSummary) to;
    return getLength() == right.getLength()
        && getFileCount() == right.getFileCount()
        && getDirectoryCount() == right.getDirectoryCount()
        && getSnapshotLength() == right.getSnapshotLength()
        && getSnapshotFileCount() == right.getSnapshotFileCount()
        && getSnapshotDirectoryCount() == right.getSnapshotDirectoryCount()
        && getSnapshotSpaceConsumed() == right.getSnapshotSpaceConsumed()
        && getErasureCodingPolicy().equals(right.getErasureCodingPolicy())
        && super.equals(to);
  } else {
    return super.equals(to);
  }
}
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();

  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);

  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
/**
 * Return the {@link ContentSummary} of a given {@link Path}.
 * @param f path to use
 * @throws FileNotFoundException if the path does not resolve
 * @throws IOException IO failure
 */
public ContentSummary getContentSummary(Path f) throws IOException {
  FileStatus status = getFileStatus(f);
  if (status.isFile()) {
    // f is a file
    long length = status.getLen();
    return new ContentSummary.Builder().length(length).
        fileCount(1).directoryCount(0).spaceConsumed(length).build();
  }
  // f is a directory
  long[] summary = {0, 0, 1};
  for (FileStatus s : listStatus(f)) {
    long length = s.getLen();
    ContentSummary c = s.isDirectory() ? getContentSummary(s.getPath())
        : new ContentSummary.Builder().length(length).
            fileCount(1).directoryCount(0).spaceConsumed(length).build();
    summary[0] += c.getLength();
    summary[1] += c.getFileCount();
    summary[2] += c.getDirectoryCount();
  }
  return new ContentSummary.Builder().length(summary[0]).
      fileCount(summary[1]).directoryCount(summary[2]).
      spaceConsumed(summary[0]).build();
}
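A short caller-side usage sketch of getContentSummary(), assuming a filesystem reachable through a default Configuration; the class name and the default path are illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class ContentSummaryExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path(args.length > 0 ? args[0] : "/tmp");  // illustrative default path
    FileSystem fs = dir.getFileSystem(conf);

    // Aggregate length, file count, and directory count for everything under 'dir'.
    ContentSummary cs = fs.getContentSummary(dir);
    System.out.println("length: " + cs.getLength()
        + " files: " + cs.getFileCount()
        + " directories: " + cs.getDirectoryCount());
  }
}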
assertEquals(0, cs.getDirectoryCount());
srcSummary.getDirectoryCount(), dstSummary.getDirectoryCount());
@Test
public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  final int NUM_OF_ROWS = 5;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();

  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE,
      InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));

  /* Write more bytes to the files to verify that the Estimator is actually used,
     i.e. the reported size comes from the estimation, not from the filesystem. */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());

  // Test deprecated mapred.dfsclient.parallelism.max
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);

  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE,
      InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));

  /* Write more bytes to the files to verify that the Estimator is actually used,
     i.e. the reported size comes from the estimation, not from the filesystem. */
  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
}
/**
 * Scenario: Empty input directory, i.e. no symlink file.
 *
 * Expected: Should return empty result set without any exception.
 */
public void testAccuracy2() throws IOException {
  fileSystem.mkdirs(symlinkDir);
  FileInputFormat.setInputPaths(job, symlinkDir);

  SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();

  ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
  assertEquals(0, cs.getLength());
  assertEquals(0, cs.getFileCount());
  assertEquals(0, cs.getDirectoryCount());

  InputSplit[] splits = inputFormat.getSplits(job, 2);
  log.info("Number of splits: " + splits.length);

  // Read all values.
  List<String> received = new ArrayList<String>();
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader =
        inputFormat.getRecordReader(split, job, reporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      received.add(value.toString());
    }
    reader.close();
  }

  List<String> expected = new ArrayList<String>();
  assertEquals(expected, received);
}
@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();

  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);

  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());

  // Test deprecated mapred.dfsclient.parallelism.max
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
summary[0] += c.getLength();
summary[1] += c.getFileCount();
summary[2] += c.getDirectoryCount();
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();

  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  ContentSummaryInputFormatTestClass.setContentSummary(
      new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());

  /* Write more bytes to the files to verify that ContentSummaryInputFormat is actually used,
     i.e. the reported size comes from the input format, not from the filesystem. */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * 2, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}