/**
 * Creates the health checker service. The class name is handed to the
 * superclass constructor, and a fresh {@link LocalDirsHandlerService} is
 * stored in {@code dirsHandler}.
 */
public NodeHealthCheckerService() {
  super(NodeHealthCheckerService.class.getName());
  this.dirsHandler = new LocalDirsHandlerService();
}
/** * Set good local dirs and good log dirs in the configuration so that the * LocalDirAllocator objects will use this updated configuration only. */ private void updateDirsAfterTest() { Configuration conf = getConfig(); List<String> localDirs = getLocalDirs(); conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, localDirs.toArray(new String[localDirs.size()])); List<String> logDirs = getLogDirs(); conf.setStrings(YarnConfiguration.NM_LOG_DIRS, logDirs.toArray(new String[logDirs.size()])); if (!areDisksHealthy()) { // Just log. LOG.error("Most of the disks failed. " + getDisksHealthReport(false)); } }
/**
 * Computes the most recent node health report time.
 *
 * @return the largest of: the last disks check time reported by
 *         {@code dirsHandler}, the last reported time of the health script
 *         runner (0 when no script runner is set), and
 *         {@code nodeHealthExceptionReportTime}
 */
long getLastHealthReportTime() {
  final long lastDisksCheck = dirsHandler.getLastDisksCheckTime();
  final long lastScriptReport;
  if (nodeHealthScriptRunner == null) {
    lastScriptReport = 0;
  } else {
    lastScriptReport = nodeHealthScriptRunner.getLastReportedTime();
  }
  return Math.max(Math.max(lastDisksCheck, lastScriptReport),
      nodeHealthExceptionReportTime);
}
/**
 * Checks a path against the current list of good local dirs.
 *
 * @param path the path to look up
 * @return the result of {@code isInGoodDirs} for the good local dirs and
 *         {@code path}
 */
public boolean isGoodLocalDir(String path) {
  final boolean inGoodLocalDirs = isInGoodDirs(getLocalDirs(), path);
  return inGoodLocalDirs;
}
/**
 * Checks a path against the current list of good log dirs.
 *
 * @param path the path to look up
 * @return the result of {@code isInGoodDirs} for the good log dirs and
 *         {@code path}
 */
public boolean isGoodLogDir(String path) {
  final boolean inGoodLogDirs = isInGoodDirs(getLogDirs(), path);
  return inGoodLogDirs;
}
.getRelativeContainerLogDir(appIdStr, containerIdStr); Path containerLogDir = dirsHandler.getLogPathForWrite(relativeContainerLogDir, false); for (String str : command) { dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + CONTAINER_SCRIPT); Path nmPrivateTokensPath = dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR containerIdStr)); Path nmPrivateClasspathJarDir = dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr)); DataOutputStream containerScriptOutStream = null; dirsHandler.getLocalPathForWrite(ContainerLocalizer.USERCACHE + Path.SEPARATOR + user + Path.SEPARATOR + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath); List<String> localDirs = dirsHandler.getLocalDirs(); List<String> logDirs = dirsHandler.getLogDirs(); if (!dirsHandler.areDisksHealthy()) { ret = ContainerExitStatus.DISKS_FAILED;
/** * The minimum fraction of number of disks needed to be healthy for a node to * be considered healthy in terms of disks is configured using * {@link YarnConfiguration#NM_MIN_HEALTHY_DISKS_FRACTION}, with a default * value of {@link YarnConfiguration#DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION}. * @return <em>false</em> if either (a) more than the allowed percentage of * nm-local-dirs failed or (b) more than the allowed percentage of * nm-log-dirs failed. */ public boolean areDisksHealthy() { if (!isDiskHealthCheckerEnabled) { return true; } int goodDirs = getLocalDirs().size(); int failedDirs = localDirs.getFailedDirs().size(); int totalConfiguredDirs = goodDirs + failedDirs; if (goodDirs/(float)totalConfiguredDirs < minNeededHealthyDisksFactor) { return false; // Not enough healthy local-dirs } goodDirs = getLogDirs().size(); failedDirs = logDirs.getFailedDirs().size(); totalConfiguredDirs = goodDirs + failedDirs; if (goodDirs/(float)totalConfiguredDirs < minNeededHealthyDisksFactor) { return false; // Not enough healthy log-dirs } return true; }
disksTurnedBad(failedLocalDirsPreCheck, failedLocalDirsPostCheck); disksTurnedGood = disksTurnedGood(failedLocalDirsPreCheck, failedLocalDirsPostCheck); disksTurnedBad(failedLogDirsPreCheck, failedLogDirsPostCheck); disksTurnedGood(failedLogDirsPreCheck, failedLogDirsPostCheck); logDiskStatus(disksFailed, disksTurnedGood); updateDirsAfterTest(); updateMetrics();
/**
 * Makes one local directory and one log directory inaccessible before the
 * service is initialized, then verifies that those bad directories are
 * recognized and removed from the lists of available local and log
 * directories.
 *
 * @throws IOException declared for the directory preparation and service
 *         calls made by the test
 */
@Test
public void testDirFailuresOnStartup() throws IOException {
  final File badLocal = new File(testDir, "localDir1");
  final File goodLocal = new File(testDir, "localDir2");
  final File goodLog = new File(testDir, "logDir1");
  final File badLog = new File(testDir, "logDir2");

  final String localDir1 = badLocal.getPath();
  final String localDir2 = goodLocal.getPath();
  final String logDir1 = goodLog.getPath();
  final String logDir2 = badLog.getPath();

  final Configuration conf = new YarnConfiguration();
  conf.set(YarnConfiguration.NM_LOCAL_DIRS, localDir1 + "," + localDir2);
  conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);

  // Sabotage one directory of each kind before the service starts.
  prepareDirToFail(localDir1);
  prepareDirToFail(logDir2);

  final LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
  dirSvc.init(conf);

  // Only the untouched local dir should remain.
  final List<String> remainingLocalDirs = dirSvc.getLocalDirs();
  final String expectedLocalDir = new Path(localDir2).toString();
  Assert.assertEquals(1, remainingLocalDirs.size());
  Assert.assertEquals(expectedLocalDir, remainingLocalDirs.get(0));

  // Only the untouched log dir should remain.
  final List<String> remainingLogDirs = dirSvc.getLogDirs();
  final String expectedLogDir = new Path(logDir1).toString();
  Assert.assertEquals(1, remainingLogDirs.size());
  Assert.assertEquals(expectedLogDir, remainingLogDirs.get(0));
}
dirsHandler.getLocalPathForWrite( NM_PRIVATE_DIR + Path.SEPARATOR + String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT, if (dirsHandler.areDisksHealthy()) { exec.startLocalizer(nmPrivateCTokensPath, localizationServerAddress, context.getUser(), } else { throw new IOException("All disks failed. " + dirsHandler.getDisksHealthReport(false));
List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs(); String seenDirs = StringUtils.join(",", list); LOG.info("ExpectedDirs=" + expectedDirs); isHealthy, dirsHandler.areDisksHealthy()); for (int i = 0; i < 10; i++) { Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext()
InterruptedException { List<String> localDirs = dirsHandler.getLocalDirs(); List<String> logDirs = dirsHandler.getLogDirs(); Path classpathJarPrivateDir = dirsHandler.getLocalPathForWrite( ResourceLocalizationService.NM_PRIVATE_DIR); createUserLocalDirs(localDirs, user);
/**
 * Runs {@code initializeLocalDir} once for every local dir currently
 * reported by {@code dirsHandler}.
 *
 * @param lfs the file context passed through to each per-directory call
 */
private void initializeLocalDirs(FileContext lfs) {
  for (String dir : dirsHandler.getLocalDirs()) {
    initializeLocalDir(lfs, dir);
  }
}
/**
 * Requests a path for writing from the log-dirs allocator.
 *
 * @param pathStr the path string handed to the allocator
 * @param checkWrite forwarded unchanged to the allocator
 * @return the path returned by the allocator, requested with
 *         {@code LocalDirAllocator.SIZE_UNKNOWN} and the current configuration
 * @throws IOException if the allocator call throws it
 */
public Path getLogPathForWrite(String pathStr, boolean checkWrite)
    throws IOException {
  final Path writablePath = logDirsAllocator.getLocalPathForWrite(
      pathStr, LocalDirAllocator.SIZE_UNKNOWN, getConfig(), checkWrite);
  return writablePath;
}
createSucceeded &= logDirs.createNonExistentDirs(localFs, permSec); if (!createSucceeded) { updateDirsAfterTest(); checkDirs();
/**
 * Runs {@code initializeLogDir} once for every log dir currently reported
 * by {@code dirsHandler}.
 *
 * @param lfs the file context passed through to each per-directory call
 */
private void initializeLogDirs(FileContext lfs) {
  for (String dir : dirsHandler.getLogDirs()) {
    initializeLogDir(lfs, dir);
  }
}
/**
 * Combines the three health signals visible to this service: the recorded
 * health exception, the health script runner (when one is set), and the
 * disk health reported by {@code dirsHandler}.
 *
 * @return <em>true</em> if the node is healthy, i.e. no health exception is
 *         recorded, the script runner is absent or reports healthy, and the
 *         disks are healthy
 */
boolean isHealthy() {
  // The script runner is consulted first, before any other check.
  final boolean scriptOk;
  if (nodeHealthScriptRunner == null) {
    scriptOk = true;
  } else {
    scriptOk = nodeHealthScriptRunner.isHealthy();
  }

  if (nodeHealthException != null) {
    return false;
  }
  if (!scriptOk) {
    return false;
  }
  // Disk health is only queried when the earlier checks passed.
  return dirsHandler.areDisksHealthy();
}
/**
 * Logs disk state changes at INFO level.
 *
 * @param newDiskFailure when true, logs the disks health report obtained
 *        with {@code getDisksHealthReport(false)}
 * @param diskTurnedGood when true, logs the disks health report obtained
 *        with {@code getDisksHealthReport(true)}
 */
private void logDiskStatus(boolean newDiskFailure, boolean diskTurnedGood) {
  if (newDiskFailure) {
    LOG.info("Disk(s) failed: " + getDisksHealthReport(false));
  }
  if (diskTurnedGood) {
    LOG.info("Disk(s) turned good: " + getDisksHealthReport(true));
  }
}
.getRelativeContainerLogDir(appIdStr, containerIdStr); Path containerLogDir = dirsHandler.getLogPathForWrite(relativeContainerLogDir, false); for (String str : command) { dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + CONTAINER_SCRIPT); Path nmPrivateTokensPath = dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR containerIdStr)); Path nmPrivateClasspathJarDir = dirsHandler.getLocalPathForWrite( getContainerPrivateDir(appIdStr, containerIdStr)); DataOutputStream containerScriptOutStream = null; dirsHandler.getLocalPathForWrite(ContainerLocalizer.USERCACHE + Path.SEPARATOR + user + Path.SEPARATOR + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath); List<String> localDirs = dirsHandler.getLocalDirs(); List<String> logDirs = dirsHandler.getLogDirs(); if (!dirsHandler.areDisksHealthy()) { ret = ContainerExitStatus.DISKS_FAILED;
/** * The minimum fraction of number of disks needed to be healthy for a node to * be considered healthy in terms of disks is configured using * {@link YarnConfiguration#NM_MIN_HEALTHY_DISKS_FRACTION}, with a default * value of {@link YarnConfiguration#DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION}. * @return <em>false</em> if either (a) more than the allowed percentage of * nm-local-dirs failed or (b) more than the allowed percentage of * nm-log-dirs failed. */ public boolean areDisksHealthy() { if (!isDiskHealthCheckerEnabled) { return true; } int goodDirs = getLocalDirs().size(); int failedDirs = localDirs.getFailedDirs().size(); int totalConfiguredDirs = goodDirs + failedDirs; if (goodDirs/(float)totalConfiguredDirs < minNeededHealthyDisksFactor) { return false; // Not enough healthy local-dirs } goodDirs = getLogDirs().size(); failedDirs = logDirs.getFailedDirs().size(); totalConfiguredDirs = goodDirs + failedDirs; if (goodDirs/(float)totalConfiguredDirs < minNeededHealthyDisksFactor) { return false; // Not enough healthy log-dirs } return true; }