/**
 * Returns the fault count recorded for the given host.
 *
 * The lookup is performed while holding the monitor of
 * {@code potentiallyFaultyTrackers}.
 *
 * @param hostName host whose entry is looked up
 * @return the entry's fault count, or 0 when the host has no entry
 */
int getFaultCount(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo info = potentiallyFaultyTrackers.get(hostName);
    return (info == null) ? 0 : info.getFaultCount();
  }
}
/**
 * Reports how many faults are currently tracked for a host.
 *
 * Both the map lookup and the read of the count happen inside a block
 * synchronized on {@code potentiallyFaultyTrackers}.
 *
 * @param hostName host to query
 * @return the tracked fault count, or 0 if nothing is tracked for the host
 */
int getFaultCount(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    final FaultInfo tracked = potentiallyFaultyTrackers.get(hostName);
    if (tracked == null) {
      // No entry for this host: report zero faults.
      return 0;
    }
    return tracked.getFaultCount();
  }
}
private void unBlackListTracker(String hostName, ReasonForBlackListing rfb) { // check if you can black list the tracker then call this methods FaultInfo fi = getFaultInfo(hostName, false); if(fi.removeBlackListedReason(rfb)) { if(fi.getReasonforblacklisting().isEmpty()) { addHostCapacity(hostName); LOG.info("Unblacklisting tracker : " + hostName); fi.unBlacklist(); //We have unBlackListed tracker, so tracker should //definitely be healthy. Check fault count if fault count //is zero don't keep it memory. if(fi.getFaultCount() == 0) { potentiallyFaultyTrackers.remove(hostName); } } } }
if (fi != null && (now - fi.getLastUpdated()) > UPDATE_FAULTY_TRACKER_INTERVAL) { int numFaults = fi.getFaultCount() - 1; fi.setFaultCount(numFaults); fi.setLastUpdated(now);
/** * Blacklists the tracker across all jobs if * <ol> * <li>#faults are more than * MAX_BLACKLISTS_PER_TRACKER (configurable) blacklists</li> * <li>#faults is 50% (configurable) above the average #faults</li> * <li>50% the cluster is not blacklisted yet </li> * </ol> */ private boolean exceedsFaults(FaultInfo fi) { int faultCount = fi.getFaultCount(); if (faultCount >= MAX_BLACKLISTS_PER_TRACKER) { // calculate avgBlackLists long clusterSize = getClusterStatus().getTaskTrackers(); long sum = 0; for (FaultInfo f : potentiallyFaultyTrackers.values()) { sum += f.getFaultCount(); } double avg = (double) sum / clusterSize; long totalCluster = clusterSize + numBlacklistedTrackers; if ((faultCount - avg) > (AVERAGE_BLACKLIST_THRESHOLD * avg) && numBlacklistedTrackers < (totalCluster * MAX_BLACKLIST_PERCENT)) { return true; } } return false; }
/** * Blacklists the tracker across all jobs if * <ol> * <li>#faults are more than * MaxBlacklistsPerTracker (configurable) blacklists</li> * <li>#faults is 50% (configurable) above the average #faults</li> * <li>50% the cluster is not blacklisted yet </li> * </ol> */ private boolean exceedsFaults(FaultInfo fi) { int faultCount = fi.getFaultCount(); if (faultCount >= jobTrackerReconfigurable.getMaxBlacklistsPerTracker()) { // calculate avgBlackLists long clusterSize = getClusterStatus().getTaskTrackers(); long sum = 0; for (FaultInfo f : potentiallyFaultyTrackers.values()) { sum += f.getFaultCount(); } double avg = (double) sum / clusterSize; long totalCluster = clusterSize + numBlacklistedTrackers; if ((faultCount - avg) > (AverageBlacklistThreshold * avg) && numBlacklistedTrackers < (totalCluster * MAX_BLACKLIST_PERCENT)) { return true; } } return false; }
/**
 * Adds one fault (a blacklisting by a job) to the given tracker's count.
 *
 * The tracker's {@code FaultInfo} is fetched with
 * {@code getFaultInfo(hostName, true)}, its count and last-updated time are
 * refreshed, and if {@code exceedsFaults} then reports true the tracker is
 * blacklisted with reason {@code EXCEEDING_FAILURES}. All of this runs while
 * synchronized on {@code potentiallyFaultyTrackers}.
 *
 * Assumes JobTracker is locked on the entry.
 *
 * @param hostName host of the tracker that incurred the fault
 */
void incrementFaults(String hostName) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = getFaultInfo(hostName, true);
    long timestamp = clock.getTime();

    int updatedFaults = faultInfo.getFaultCount() + 1;
    faultInfo.setFaultCount(updatedFaults);
    faultInfo.setLastUpdated(timestamp);

    if (!exceedsFaults(faultInfo)) {
      return;
    }

    LOG.info("Adding " + hostName + " to the blacklist"
        + " across all jobs");
    String reason =
        String.format(FaultInfo.FAULT_FORMAT_STRING, updatedFaults);
    blackListTracker(hostName, reason,
        ReasonForBlackListing.EXCEEDING_FAILURES);
  }
}
/**
 * Records a job fault against the given tracker.
 *
 * The tracker's {@code FaultInfo} is fetched with
 * {@code getFaultInfo(hostName, true)}, the fault is added to it and its
 * last-updated time is refreshed. If {@code exceedsFaults} then reports
 * true, the tracker is blacklisted with reason {@code EXCEEDING_FAILURES}.
 * All of this runs while synchronized on {@code potentiallyFaultyTrackers}.
 *
 * Assumes JobTracker is locked on the entry.
 *
 * @param hostName host of the tracker that incurred the fault
 * @param jf the fault to add to the tracker's fault information
 */
void incrementFaults(String hostName, JobFault jf) {
  synchronized (potentiallyFaultyTrackers) {
    FaultInfo faultInfo = getFaultInfo(hostName, true);
    faultInfo.addFault(jf);

    long timestamp = getClock().getTime();
    faultInfo.setLastUpdated(timestamp);

    if (!exceedsFaults(faultInfo)) {
      return;
    }

    LOG.info("Adding " + hostName + " to the blacklist"
        + " across all jobs");
    String reason = String.format(FaultInfo.FAULT_FORMAT_STRING,
        faultInfo.getFaultCount());
    blackListTracker(hostName, reason,
        ReasonForBlackListing.EXCEEDING_FAILURES);
  }
}