/** * Computes the total number of input files. If block sampling was used it will scale this * value by the highest sample percentage (as an estimate for # input files). * * @param inputSummary * @param work * @param highestSamplePercentage * @return */ public static long getTotalInputNumFiles (ContentSummary inputSummary, MapWork work, double highestSamplePercentage) { long totalInputNumFiles = inputSummary.getFileCount(); if (work.getNameToSplitSample() == null || work.getNameToSplitSample().isEmpty()) { // If percentage block sampling wasn't used, we don't need to do any estimation return totalInputNumFiles; } if (highestSamplePercentage >= 0) { totalInputNumFiles = Math.min((long) (totalInputNumFiles * (highestSamplePercentage / 100D)) , totalInputNumFiles); } return totalInputNumFiles; }
/** * Computes the total number of input files. If block sampling was used it will scale this * value by the highest sample percentage (as an estimate for # input files). * * @param inputSummary * @param work * @param highestSamplePercentage * @return */ public static long getTotalInputNumFiles (ContentSummary inputSummary, MapWork work, double highestSamplePercentage) { long totalInputNumFiles = inputSummary.getFileCount(); if (MapUtils.isEmpty(work.getNameToSplitSample())) { // If percentage block sampling wasn't used, we don't need to do any estimation return totalInputNumFiles; } if (highestSamplePercentage >= 0) { totalInputNumFiles = Math.min((long) (totalInputNumFiles * (highestSamplePercentage / 100D)) , totalInputNumFiles); } return totalInputNumFiles; }
/**
 * Returns whether the given directory is empty, consulting the content summary
 * cached in {@code ctx} first to avoid a filesystem round trip.
 *
 * @param job job configuration used for the filesystem probe on a cache miss
 * @param dirPath directory to check
 * @param ctx context holding cached content summaries; may be null
 * @return true if the path holds no data, no files and at most itself as a directory
 * @throws Exception if the filesystem probe fails
 */
public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx) throws Exception {
  ContentSummary cs = (ctx == null) ? null : ctx.getCS(dirPath);
  if (cs == null) {
    if (ctx != null) {
      LOG.debug("Content Summary not cached for {}", dirPath);
    }
    // Cache miss (or no context): fall back to probing the filesystem directly.
    return isEmptyPath(job, dirPath);
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Content Summary cached for {} length: {} num files: {} "
        + "num directories: {}", dirPath, cs.getLength(), cs.getFileCount(),
        cs.getDirectoryCount());
  }
  // Empty means: zero bytes, zero files, and at most the directory entry itself.
  return cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1;
}
/**
 * Returns whether the given directory is empty, consulting the content summary
 * cached in {@code ctx} first to avoid a filesystem round trip.
 *
 * Fix: the original log message concatenated {@code dirPath + "length: "}
 * without a separating space, producing garbled output such as
 * "hdfs://...length: 5".
 *
 * @param job job configuration used for the filesystem probe on a cache miss
 * @param dirPath directory to check
 * @param ctx context holding cached content summaries; may be null
 * @return true if the path holds no data, no files and at most itself as a directory
 * @throws Exception if the filesystem probe fails
 */
public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx) throws Exception {
  if (ctx != null) {
    ContentSummary cs = ctx.getCS(dirPath);
    if (cs != null) {
      LOG.info("Content Summary " + dirPath + " length: " + cs.getLength()
          + " num files: " + cs.getFileCount()
          + " num directories: " + cs.getDirectoryCount());
      // Empty means: zero bytes, zero files, and at most the directory itself.
      return (cs.getLength() == 0 && cs.getFileCount() == 0
          && cs.getDirectoryCount() <= 1);
    } else {
      LOG.info("Content Summary not cached for " + dirPath);
    }
  }
  // Cache miss (or no context): probe the filesystem directly.
  return isEmptyPath(job, dirPath);
}
/**
 * Decides whether the given path may be deleted under the safe-delete policy:
 * when safe delete is enabled and the path contains more files than the
 * configured limit, the user must confirm interactively.
 *
 * @param item the path about to be deleted
 * @return true if deletion may proceed, false if the user aborted
 * @throws IOException if the content summary cannot be retrieved
 */
private boolean canBeSafelyDeleted(PathData item) throws IOException {
  if (!safeDelete) {
    return true;
  }
  final long deleteLimit = getConf().getLong(
      HADOOP_SHELL_SAFELY_DELETE_LIMIT_NUM_FILES,
      HADOOP_SHELL_SAFELY_DELETE_LIMIT_NUM_FILES_DEFAULT);
  // A non-positive limit disables the check entirely.
  if (deleteLimit <= 0) {
    return true;
  }
  ContentSummary cs = item.fs.getContentSummary(item.path);
  final long numFiles = cs.getFileCount();
  if (numFiles <= deleteLimit) {
    return true;
  }
  // Over the limit: require explicit confirmation from the user.
  if (ToolRunner.confirmPrompt("Proceed deleting " + numFiles + " files?")) {
    return true;
  }
  System.err.println("Delete aborted at user request.\n");
  return false;
}
/**
 * Returns whether the given path is an empty directory: no files and exactly
 * one directory entry (the path itself) in its content summary.
 *
 * Fix: collapsed the verbose {@code if (cond) return true; return false;}
 * pattern into a direct boolean return.
 *
 * @param path path to inspect
 * @return true if the path contains no files
 * @throws IOException if the filesystem cannot be reached
 * @throws MetaException if the filesystem handle cannot be obtained
 */
public boolean isEmpty(Path path) throws IOException, MetaException {
  ContentSummary contents = getFs(path).getContentSummary(path);
  // Empty means zero files and exactly one directory entry (the path itself).
  return contents != null && contents.getFileCount() == 0
      && contents.getDirectoryCount() == 1;
}
@Override public ContentSummary getContentSummary(Path p, JobConf job) throws IOException { //length, file count, directory count long[] summary = {0, 0, 0}; List<Path> targetPaths = new ArrayList<Path>(); List<Path> symlinkPaths = new ArrayList<Path>(); try { getTargetPathsFromSymlinksDirs( job, new Path[]{p}, targetPaths, symlinkPaths); } catch (Exception e) { throw new IOException( "Error parsing symlinks from specified job input path.", e); } for(Path path : targetPaths) { FileSystem fs = path.getFileSystem(job); ContentSummary cs = fs.getContentSummary(path); summary[0] += cs.getLength(); summary[1] += cs.getFileCount(); summary[2] += cs.getDirectoryCount(); } return new ContentSummary(summary[0], summary[1], summary[2]); }
if (srcContentSummary.getFileCount() > MetastoreConf.getLongVar(conf, ConfVars.REPL_COPYFILE_MAXNUMFILES) && srcContentSummary.getLength() > LOG.info("Source is " + srcContentSummary.getFileCount() + " files. (MAX: " + MetastoreConf.getLongVar(conf, ConfVars.REPL_COPYFILE_MAXNUMFILES) + ")"); LOG.info("Launch distributed copy (distcp) job.");
@Override public ContentSummary getContentSummary(Path p, JobConf job) throws IOException { //length, file count, directory count long[] summary = {0, 0, 0}; List<Path> targetPaths = new ArrayList<Path>(); List<Path> symlinkPaths = new ArrayList<Path>(); try { getTargetPathsFromSymlinksDirs( job, new Path[]{p}, targetPaths, symlinkPaths); } catch (Exception e) { throw new IOException( "Error parsing symlinks from specified job input path.", e); } for(Path path : targetPaths) { FileSystem fs = path.getFileSystem(job); ContentSummary cs = fs.getContentSummary(path); summary[0] += cs.getLength(); summary[1] += cs.getFileCount(); summary[2] += cs.getDirectoryCount(); } return new ContentSummary(summary[0], summary[1], summary[2]); }
if (srcContentSummary.getFileCount() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES) && srcContentSummary.getLength() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE)) { LOG.info("Source is " + srcContentSummary.getFileCount() + " files. (MAX: " + conf.getLongVar( HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES) + ")"); LOG.info("Launch distributed copy (distcp) job.");
/**
 * Verifies that FileUtils.copy delegates to distcp when the source exceeds
 * both the copy-file count and size thresholds.
 */
@Test
public void testCopyWithDistcp() throws IOException {
  final Path source = new Path("copySrc");
  final Path destination = new Path("copyDst");
  final HiveConf conf = new HiveConf(TestFileUtils.class);

  FileSystem mockFs = mock(FileSystem.class);
  when(mockFs.getUri()).thenReturn(URI.create("hdfs:///"));

  // Huge file count and length force the distcp code path.
  ContentSummary mockContentSummary = mock(ContentSummary.class);
  when(mockContentSummary.getFileCount()).thenReturn(Long.MAX_VALUE);
  when(mockContentSummary.getLength()).thenReturn(Long.MAX_VALUE);
  when(mockFs.getContentSummary(any(Path.class))).thenReturn(mockContentSummary);

  HadoopShims shims = mock(HadoopShims.class);
  when(shims.runDistCp(Collections.singletonList(source), destination, conf))
      .thenReturn(true);

  Assert.assertTrue(
      FileUtils.copy(mockFs, source, mockFs, destination, false, false, conf, shims));
  verify(shims).runDistCp(Collections.singletonList(source), destination, conf);
}
@Override public ContentSummary getContentSummary(Path f) throws IOException { // HarFileSystem has a bug where this method does not work properly // if the underlying FS is HDFS. See MAPREDUCE-1877 for more // information. This method is from FileSystem. FileStatus status = getFileStatus(f); if (!status.isDir()) { // f is a file return new ContentSummary(status.getLen(), 1, 0); } // f is a directory long[] summary = {0, 0, 1}; for(FileStatus s : listStatus(f)) { ContentSummary c = s.isDir() ? getContentSummary(s.getPath()) : new ContentSummary(s.getLen(), 1, 0); summary[0] += c.getLength(); summary[1] += c.getFileCount(); summary[2] += c.getDirectoryCount(); } return new ContentSummary(summary[0], summary[1], summary[2]); } }
/**
 * Hash code consistent with {@link #equals(Object)}: XOR-folds every counter
 * that equals() compares, then mixes in the superclass hash.
 */
@Override
public int hashCode() {
  long h = getLength();
  h ^= getFileCount();
  h ^= getDirectoryCount();
  h ^= getSnapshotLength();
  h ^= getSnapshotFileCount();
  h ^= getSnapshotDirectoryCount();
  h ^= getSnapshotSpaceConsumed();
  h ^= getErasureCodingPolicy().hashCode();
  return ((int) h) ^ super.hashCode();
}
/**
 * Structural equality: two summaries are equal when every counter, every
 * snapshot counter, the erasure coding policy and the superclass state match.
 */
@Override
public boolean equals(Object to) {
  if (this == to) {
    return true;
  }
  if (!(to instanceof ContentSummary)) {
    // Defer to the superclass for non-ContentSummary arguments.
    return super.equals(to);
  }
  ContentSummary that = (ContentSummary) to;
  return getLength() == that.getLength()
      && getFileCount() == that.getFileCount()
      && getDirectoryCount() == that.getDirectoryCount()
      && getSnapshotLength() == that.getSnapshotLength()
      && getSnapshotFileCount() == that.getSnapshotFileCount()
      && getSnapshotDirectoryCount() == that.getSnapshotDirectoryCount()
      && getSnapshotSpaceConsumed() == that.getSnapshotSpaceConsumed()
      && getErasureCodingPolicy().equals(that.getErasureCodingPolicy())
      && super.equals(to);
}
/**
 * Input summary must count every partition's bytes, files and directories
 * even when the listing thread pool is disabled (single-threaded path).
 */
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
  final int numPartitions = 5;
  final int bytesPerFile = 5;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  // Zero threads forces the single-threaded listing code path.
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);

  ContentSummary summary = runTestGetInputSummary(jobConf, properties, numPartitions,
      bytesPerFile, HiveInputFormat.class);
  assertEquals(numPartitions * bytesPerFile, summary.getLength());
  assertEquals(numPartitions, summary.getFileCount());
  assertEquals(numPartitions, summary.getDirectoryCount());
}
/** Return the {@link ContentSummary} of a given {@link Path}. * @param f path to use * @throws FileNotFoundException if the path does not resolve * @throws IOException IO failure */ public ContentSummary getContentSummary(Path f) throws IOException { FileStatus status = getFileStatus(f); if (status.isFile()) { // f is a file long length = status.getLen(); return new ContentSummary.Builder().length(length). fileCount(1).directoryCount(0).spaceConsumed(length).build(); } // f is a directory long[] summary = {0, 0, 1}; for(FileStatus s : listStatus(f)) { long length = s.getLen(); ContentSummary c = s.isDirectory() ? getContentSummary(s.getPath()) : new ContentSummary.Builder().length(length). fileCount(1).directoryCount(0).spaceConsumed(length).build(); summary[0] += c.getLength(); summary[1] += c.getFileCount(); summary[2] += c.getDirectoryCount(); } return new ContentSummary.Builder().length(summary[0]). fileCount(summary[1]).directoryCount(summary[2]). spaceConsumed(summary[0]).build(); }
@Test public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException { final int NUM_PARTITIONS = 5; final int BYTES_PER_FILE = 10; final int NUM_OF_ROWS = 5; JobConf jobConf = new JobConf(); Properties properties = new Properties(); jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2); properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName()); InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE)); /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */ ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS * -1, summary.getFileCount()); // Current getInputSummary() returns -1 for each file found assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount()); // Current getInputSummary() returns -1 for each file found // Test deprecated mapred.dfsclient.parallelism.max jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0); jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2); properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName()); InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE)); /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */ summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS * -1, summary.getFileCount()); // Current getInputSummary() returns -1 for each file found assertEquals(NUM_PARTITIONS * 
-1, summary.getDirectoryCount()); // Current getInputSummary() returns -1 for each file found }
/** * Scenario: Empty input directory, i.e. no symlink file. * * Expected: Should return empty result set without any exception. */ public void testAccuracy2() throws IOException { fileSystem.mkdirs(symlinkDir); FileInputFormat.setInputPaths(job, symlinkDir); SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat(); ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job); assertEquals(0, cs.getLength()); assertEquals(0, cs.getFileCount()); assertEquals(0, cs.getDirectoryCount()); InputSplit[] splits = inputFormat.getSplits(job, 2); log.info("Number of splits: " + splits.length); // Read all values. List<String> received = new ArrayList<String>(); for (InputSplit split : splits) { RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter); LongWritable key = reader.createKey(); Text value = reader.createValue(); while (reader.next(key, value)) { received.add(value.toString()); } reader.close(); } List<String> expected = new ArrayList<String>(); assertEquals(expected, received); }
@Test public void testGetInputSummaryWithMultipleThreads() throws IOException { final int NUM_PARTITIONS = 5; final int BYTES_PER_FILE = 5; JobConf jobConf = new JobConf(); Properties properties = new Properties(); jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2); ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS, summary.getFileCount()); assertEquals(NUM_PARTITIONS, summary.getDirectoryCount()); // Test deprecated mapred.dfsclient.parallelism.max jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0); jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2); summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS, summary.getFileCount()); assertEquals(NUM_PARTITIONS, summary.getDirectoryCount()); }
/**
 * Verifies that getInputSummary() takes sizes from a ContentSummaryInputFormat
 * implementation rather than from the filesystem.
 */
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int numPartitions = 5;
  final int bytesPerFile = 10;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  ContentSummaryInputFormatTestClass.setContentSummary(
      new ContentSummary.Builder().length(bytesPerFile).fileCount(2).directoryCount(1).build());

  /* Let's write more bytes to the files to test that ContentSummaryInputFormat is
     actually working returning the file size not from the filesystem */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, numPartitions,
      bytesPerFile * 2, ContentSummaryInputFormatTestClass.class);
  assertEquals(numPartitions * bytesPerFile, summary.getLength());
  assertEquals(numPartitions * 2, summary.getFileCount());
  assertEquals(numPartitions, summary.getDirectoryCount());
}