/**
 * Looks up the fault record kept for a host, optionally creating it.
 *
 * The lookup and the optional insert both happen while holding the
 * monitor of {@code potentiallyFaultyTrackers}.
 *
 * @param hostName host whose fault record is requested
 * @param createIfNeccessary when true, a missing record is created and
 *        stored under {@code hostName} before being returned
 * @return the record for the host, or null when there is none and
 *         creation was not requested
 */
protected FaultInfo getFaultInfo(String hostName, boolean createIfNeccessary) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo existing = potentiallyFaultyTrackers.get(hostName);
    if (existing != null || !createIfNeccessary) {
      return existing;
    }
    FaultInfo created = new FaultInfo();
    potentiallyFaultyTrackers.put(hostName, created);
    return created;
  }
}
/**
 * Records a blacklisting reason against a host.
 *
 * When the host's fault record is not yet blacklisted, every tracker
 * registered for the host has its reservations cancelled, the host's
 * capacity is removed and the record is marked blacklisted with the
 * given reason. When it is already blacklisted, the reason is simply
 * added to the record.
 *
 * @param hostName host to blacklist
 * @param reason human-readable explanation stored with the record
 * @param rfb category of the blacklisting
 */
private void blackListTracker(String hostName, String reason,
    ReasonForBlackListing rfb) {
  FaultInfo faultInfo = getFaultInfo(hostName, true);

  if (!faultInfo.isBlacklisted()) {
    // First reason for this host: take it out of service.
    LOG.info("Blacklisting tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
    Set<TaskTracker> hostTrackers = hostnameToTaskTracker.get(hostName);
    synchronized (hostTrackers) {
      for (TaskTracker hostTracker : hostTrackers) {
        hostTracker.cancelAllReservations();
      }
    }
    removeHostCapacity(hostName);
    faultInfo.setBlacklist(rfb, reason);
    return;
  }

  // Host is already blacklisted: only the reason needs recording.
  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding blacklisted reason for tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
  }
  // Log at info level only when this category is new for the host.
  if (!faultInfo.getReasonforblacklisting().contains(rfb)) {
    LOG.info("Adding blacklisted reason for tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
  }
  faultInfo.addBlackListedReason(rfb, reason);
}
/**
 * Reports the fault count recorded for a host.
 *
 * @param hostName host to look up
 * @return the count held in the host's fault record, or 0 when the
 *         host has no record
 */
int getFaultCount(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo info = potentiallyFaultyTrackers.get(hostName);
    return (info == null) ? 0 : info.getFaultCount();
  }
}
FaultInfo fi = potentiallyFaultyTrackers.get(hostName); if (fi != null && (now - fi.getLastUpdated()) > UPDATE_FAULTY_TRACKER_INTERVAL) { int numFaults = fi.getFaultCount() - 1; fi.setFaultCount(numFaults); fi.setLastUpdated(now); if (canUnBlackListTracker(hostName, ReasonForBlackListing.EXCEEDING_FAILURES)) { return (fi != null && fi.isBlacklisted());
/** * Blacklists the tracker across all jobs if * <ol> * <li>#faults are more than * MAX_BLACKLISTS_PER_TRACKER (configurable) blacklists</li> * <li>#faults is 50% (configurable) above the average #faults</li> * <li>50% the cluster is not blacklisted yet </li> * </ol> */ private boolean exceedsFaults(FaultInfo fi) { int faultCount = fi.getFaultCount(); if (faultCount >= MAX_BLACKLISTS_PER_TRACKER) { // calculate avgBlackLists long clusterSize = getClusterStatus().getTaskTrackers(); long sum = 0; for (FaultInfo f : potentiallyFaultyTrackers.values()) { sum += f.getFaultCount(); } double avg = (double) sum / clusterSize; long totalCluster = clusterSize + numBlacklistedTrackers; if ((faultCount - avg) > (AVERAGE_BLACKLIST_THRESHOLD * avg) && numBlacklistedTrackers < (totalCluster * MAX_BLACKLIST_PERCENT)) { return true; } } return false; }
/** * Blacklists the tracker across all jobs if * <ol> * <li>#faults are more than * MaxBlacklistsPerTracker (configurable) blacklists</li> * <li>#faults is 50% (configurable) above the average #faults</li> * <li>50% the cluster is not blacklisted yet </li> * </ol> */ private boolean exceedsFaults(FaultInfo fi) { int faultCount = fi.getFaultCount(); if (faultCount >= jobTrackerReconfigurable.getMaxBlacklistsPerTracker()) { // calculate avgBlackLists long clusterSize = getClusterStatus().getTaskTrackers(); long sum = 0; for (FaultInfo f : potentiallyFaultyTrackers.values()) { sum += f.getFaultCount(); } double avg = (double) sum / clusterSize; long totalCluster = clusterSize + numBlacklistedTrackers; if ((faultCount - avg) > (AverageBlacklistThreshold * avg) && numBlacklistedTrackers < (totalCluster * MAX_BLACKLIST_PERCENT)) { return true; } } return false; }
/** * Update the node health failure statistics of the given * host. * * We increment the count only when the host transitions * from healthy -> unhealthy. * * @param hostName * @param fi Fault info object for the host. */ private void updateNodeHealthFailureStatistics(String hostName, FaultInfo fi) { //Check if the node was already blacklisted due to //unhealthy reason. If so dont increment the count. if (!fi.getReasonforblacklisting().contains( ReasonForBlackListing.NODE_UNHEALTHY)) { Set<TaskTracker> trackers = hostnameToTaskTracker.get(hostName); synchronized (trackers) { for (TaskTracker t : trackers) { TaskTrackerStat stat = statistics.getTaskTrackerStat( t.getTrackerName()); stat.incrHealthCheckFailed(); } } } }
/**
 * Adds one fault to the record of a host, creating the record if the
 * host had none, and stamps the record with the current clock time.
 *
 * If the updated record satisfies {@code exceedsFaults}, the host is
 * blacklisted with reason EXCEEDING_FAILURES.
 * Per the original note, the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host that incurred the fault
 */
void incrementFaults(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = getFaultInfo(hostName, true);
    long timestamp = clock.getTime();
    int updatedCount = faultInfo.getFaultCount() + 1;
    faultInfo.setFaultCount(updatedCount);
    faultInfo.setLastUpdated(timestamp);

    if (!exceedsFaults(faultInfo)) {
      return;
    }

    LOG.info("Adding " + hostName + " to the blacklist"
        + " across all jobs");
    blackListTracker(hostName,
        String.format(FaultInfo.FAULT_FORMAT_STRING, updatedCount),
        ReasonForBlackListing.EXCEEDING_FAILURES);
  }
}
/**
 * Applies a node health report to the host's fault record.
 *
 * When the host is reported unhealthy, its fault record is fetched
 * (created if absent), flagged with the reported health value, and the
 * host is blacklisted with reason NODE_UNHEALTHY while holding the
 * monitor of potentiallyFaultyTrackers.
 *
 * When the host is reported healthy and a fault record exists, the
 * NODE_UNHEALTHY reason is removed via unBlackListTracker, provided
 * canUnBlackListTracker allows it. With no record, nothing happens.
 *
 * NOTE(review): the final closing brace on the line below has no
 * matching opener inside this method; it presumably closes an enclosing
 * scope that starts outside this view - confirm before moving the method.
 *
 * @param hostName host the report is about
 * @param isHealthy health value carried by the report
 * @param reason explanation stored when the host is blacklisted
 */
void setNodeHealthStatus(String hostName, boolean isHealthy, String reason) { FaultInfo fi = null; // If tracker is not healthy, create a fault info object // blacklist it. if (!isHealthy) { fi = getFaultInfo(hostName, true); fi.setHealthy(isHealthy); synchronized (potentiallyFaultyTrackers) { blackListTracker(hostName, reason, ReasonForBlackListing.NODE_UNHEALTHY); } } else { fi = getFaultInfo(hostName, false); if (fi == null) { return; } else { if (canUnBlackListTracker(hostName, ReasonForBlackListing.NODE_UNHEALTHY)) { unBlackListTracker(hostName, ReasonForBlackListing.NODE_UNHEALTHY); } } } } }
/**
 * Reports whether a host is currently blacklisted, forgiving one fault
 * first when its record has been quiet for long enough.
 *
 * If the host's record was last updated more than
 * UpdateFaultyTrackerInterval before {@code now}, one fault is forgiven,
 * the record is re-stamped with {@code now}, and the EXCEEDING_FAILURES
 * reason is removed when canUnBlackListTracker allows it.
 * Per the original note, the JobTracker is assumed to be locked on entry.
 *
 * Note that, despite the method name, the result is true when the host
 * IS blacklisted.
 *
 * @param hostName The tracker name
 * @param now The current time
 * @return true if the tracker is blacklisted after the update above,
 *         false otherwise (including when the host has no record)
 */
boolean shouldAssignTasksToTracker(String hostName, long now) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = potentiallyFaultyTrackers.get(hostName);
    if (faultInfo == null) {
      return false;
    }

    if ((now - faultInfo.getLastUpdated()) > UpdateFaultyTrackerInterval) {
      faultInfo.forgiveOneFault();
      faultInfo.setLastUpdated(now);
      boolean mayUnblacklist = canUnBlackListTracker(hostName,
          ReasonForBlackListing.EXCEEDING_FAILURES);
      if (mayUnblacklist) {
        unBlackListTracker(hostName,
            ReasonForBlackListing.EXCEEDING_FAILURES);
      }
    }

    return faultInfo.isBlacklisted();
  }
}
/**
 * Registers a reason for blacklisting a host.
 *
 * A host that is not yet blacklisted is taken out of service: the
 * reservations of each of its trackers are cancelled, its capacity is
 * removed, and its fault record is marked blacklisted. For a host that
 * is already blacklisted only the additional reason is stored.
 *
 * @param hostName host to blacklist
 * @param reason human-readable explanation stored with the record
 * @param rfb category of the blacklisting
 */
private void blackListTracker(String hostName, String reason,
    ReasonForBlackListing rfb) {
  FaultInfo record = getFaultInfo(hostName, true);
  boolean alreadyListed = record.isBlacklisted();

  if (!alreadyListed) {
    LOG.info("Blacklisting tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
    Set<TaskTracker> trackersOnHost = hostnameToTaskTracker.get(hostName);
    synchronized (trackersOnHost) {
      for (TaskTracker each : trackersOnHost) {
        each.cancelAllReservations();
      }
    }
    removeHostCapacity(hostName);
    record.setBlacklist(rfb, reason);
    return;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding blacklisted reason for tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
  }
  // Info-level message only when this category was not recorded before.
  boolean knownReason = record.getReasonforblacklisting().contains(rfb);
  if (!knownReason) {
    LOG.info("Adding blacklisted reason for tracker : " + hostName
        + " Reason for blacklisting is : " + rfb);
  }
  record.addBlackListedReason(rfb, reason);
}
/**
 * Records a job fault against a host, creating the host's fault record
 * if it had none, and stamps the record with the current clock time.
 *
 * If the updated record satisfies {@code exceedsFaults}, the host is
 * blacklisted with reason EXCEEDING_FAILURES.
 * Per the original note, the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host that incurred the fault
 * @param jf the fault to add to the host's record
 */
void incrementFaults(String hostName, JobFault jf) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = getFaultInfo(hostName, true);
    faultInfo.addFault(jf);
    faultInfo.setLastUpdated(getClock().getTime());

    if (!exceedsFaults(faultInfo)) {
      return;
    }

    LOG.info("Adding " + hostName + " to the blacklist"
        + " across all jobs");
    String explanation = String.format(FaultInfo.FAULT_FORMAT_STRING,
        faultInfo.getFaultCount());
    blackListTracker(hostName, explanation,
        ReasonForBlackListing.EXCEEDING_FAILURES);
  }
}
void setNodeHealthStatus(String hostName, boolean isHealthy, String reason) { FaultInfo fi = null; // If tracker is not healthy, create a fault info object // blacklist it. if (!isHealthy) { fi = getFaultInfo(hostName, true); fi.setHealthy(isHealthy); updateNodeHealthFailureStatistics(hostName, fi); synchronized (potentiallyFaultyTrackers) { blackListTracker(hostName, reason, ReasonForBlackListing.NODE_UNHEALTHY); } } else { fi = getFaultInfo(hostName, false); if (fi == null) { return; } else { if (canUnBlackListTracker(hostName, ReasonForBlackListing.NODE_UNHEALTHY)) { unBlackListTracker(hostName, ReasonForBlackListing.NODE_UNHEALTHY); } } } }
private void unBlackListTracker(String hostName, ReasonForBlackListing rfb) { // check if you can black list the tracker then call this methods FaultInfo fi = getFaultInfo(hostName, false); if (fi.removeBlackListedReason(rfb)) { if (fi.getReasonforblacklisting().isEmpty()) { addHostCapacity(hostName); LOG.info("Unblacklisting tracker : " + hostName); fi.unBlacklist(); //We have unBlackListed tracker, so tracker should //definitely be healthy. Check fault count if fault count //is zero don't keep it memory. if (fi.numFaults == 0) { potentiallyFaultyTrackers.remove(hostName); } } } }
/**
 * Drops the fault record of a host and, if that record was
 * blacklisted, adds the host's capacity back.
 *
 * Per the original note this is used when the tracker is restarted,
 * and the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host whose record is discarded
 */
void markTrackerHealthy(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo removed = potentiallyFaultyTrackers.remove(hostName);
    boolean wasBlacklisted = removed != null && removed.isBlacklisted();
    if (!wasBlacklisted) {
      return;
    }
    LOG.info("Removing " + hostName + " from blacklist");
    addHostCapacity(hostName);
  }
}
private void unBlackListTracker(String hostName, ReasonForBlackListing rfb) { // check if you can black list the tracker then call this methods FaultInfo fi = getFaultInfo(hostName, false); if(fi.removeBlackListedReason(rfb)) { if(fi.getReasonforblacklisting().isEmpty()) { addHostCapacity(hostName); LOG.info("Unblacklisting tracker : " + hostName); fi.unBlacklist(); //We have unBlackListed tracker, so tracker should //definitely be healthy. Check fault count if fault count //is zero don't keep it memory. if(fi.getFaultCount() == 0) { potentiallyFaultyTrackers.remove(hostName); } } } }
/**
 * Tells whether a host is blacklisted across all the jobs.
 *
 * Per the original note, the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host to look up
 * @return true when the host has a fault record and that record is
 *         blacklisted; false otherwise
 */
boolean isBlacklisted(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo info = potentiallyFaultyTrackers.get(hostName);
    return info != null && info.isBlacklisted();
  }
}
/**
 * Tells whether a host is blacklisted across all the jobs.
 *
 * Per the original note, the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host to look up
 * @return true when the host has a fault record and that record is
 *         blacklisted; false otherwise
 */
boolean isBlacklisted(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = potentiallyFaultyTrackers.get(hostName);
    if (faultInfo == null) {
      return false;
    }
    return faultInfo.isBlacklisted();
  }
}
/**
 * Discards the fault record of a host; when the discarded record was
 * blacklisted, the host's capacity is added back.
 *
 * Per the original note this is used when the tracker is restarted,
 * and the JobTracker is assumed to be locked on entry.
 *
 * @param hostName host whose record is discarded
 */
void markTrackerHealthy(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo discarded = potentiallyFaultyTrackers.remove(hostName);
    if (discarded == null) {
      return;
    }
    if (discarded.isBlacklisted()) {
      LOG.info("Removing " + hostName + " from blacklist");
      addHostCapacity(hostName);
    }
  }
}
// Test methods

/**
 * Returns the blacklisting reasons recorded for a host.
 *
 * @param host host to look up
 * @return the reasons held by the host's fault record, or a new empty
 *         set when the host has no record
 */
synchronized Set<ReasonForBlackListing> getReasonForBlackList(String host) {
  FaultInfo faultInfo = faultyTrackers.getFaultInfo(host, false);
  if (faultInfo != null) {
    return faultInfo.getReasonforblacklisting();
  }
  return new HashSet<ReasonForBlackListing>();
}