nextHeartBeatInterval = response.getNextHeartBeatInterval(); updateMasterKeys(response); if (response.getNodeAction() == NodeAction.SHUTDOWN) { LOG .warn("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + " hence shutting down."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); context.setDecommissioned(true); dispatcher.getEventHandler().handle( if (response.getNodeAction() == NodeAction.RESYNC) { LOG.warn("Node is out of sync with ResourceManager," + " hence resyncing."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); .getContainersToBeRemovedFromNM()); lastHeartBeatID = response.getResponseId(); List<ContainerId> containersToCleanup = response .getContainersToCleanup(); if (!containersToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( response.getApplicationsToCleanup(); response.getSystemCredentialsForApps();
private void updateMasterKeys(NodeHeartbeatResponse response) { // See if the master-key has rolled over MasterKey updatedMasterKey = response.getContainerTokenMasterKey(); if (updatedMasterKey != null) { // Will be non-null only on roll-over on RM side context.getContainerTokenSecretManager().setMasterKey(updatedMasterKey); } updatedMasterKey = response.getNMTokenMasterKey(); if (updatedMasterKey != null) { context.getNMTokenSecretManager().setMasterKey(updatedMasterKey); } } };
public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId, NodeAction action, List<ContainerId> containersToCleanUp, List<ApplicationId> applicationsToCleanUp, MasterKey containerTokenMasterKey, MasterKey nmTokenMasterKey, long nextHeartbeatInterval) { NodeHeartbeatResponse response = recordFactory .newRecordInstance(NodeHeartbeatResponse.class); response.setResponseId(responseId); response.setNodeAction(action); response.setContainerTokenMasterKey(containerTokenMasterKey); response.setNMTokenMasterKey(nmTokenMasterKey); response.setNextHeartBeatInterval(nextHeartbeatInterval); if(containersToCleanUp != null) { response.addAllContainersToCleanup(containersToCleanUp); } if(applicationsToCleanUp != null) { response.addAllApplicationsToCleanup(applicationsToCleanUp); } return response; } }
@Override public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) { this.writeLock.lock(); try { response.addAllContainersToCleanup( new ArrayList<ContainerId>(this.containersToClean)); response.addAllApplicationsToCleanup(this.finishedApplications); response.addContainersToBeRemovedFromNM( new ArrayList<ContainerId>(this.containersToBeRemovedFromNM)); this.containersToClean.clear(); this.finishedApplications.clear(); this.containersToBeRemovedFromNM.clear(); } finally { this.writeLock.unlock(); } };
@Override public void setAndUpdateNodeHeartbeatResponse( NodeHeartbeatResponse response) { this.writeLock.lock(); try { response.addAllContainersToCleanup( new ArrayList<ContainerId>(this.containersToClean)); response.addAllApplicationsToCleanup(this.finishedApplications); response.addContainersToBeRemovedFromNM( new ArrayList<ContainerId>(this.containersToBeRemovedFromNM)); response.addAllContainersToSignal(this.containersToSignal); this.completedContainers.removeAll(this.containersToBeRemovedFromNM); this.containersToClean.clear(); this.finishedApplications.clear(); this.containersToSignal.clear(); this.containersToBeRemovedFromNM.clear(); response.addAllContainersToUpdate(toBeUpdatedContainers.values()); toBeUpdatedContainers.clear(); // NOTE: This is required for backward compatibility. response.addAllContainersToDecrease(toBeDecreasedContainers.values()); toBeDecreasedContainers.clear(); // Synchronously update the last response in rmNode with updated // responseId this.latestNodeHeartBeatResponse = response; } finally { this.writeLock.unlock(); } };
nextHeartBeatInterval = response.getNextHeartBeatInterval(); updateMasterKeys(response); if (response.getNodeAction() == NodeAction.SHUTDOWN) { LOG.warn("Recieved SHUTDOWN signal from Resourcemanager as part of" + " heartbeat, hence shutting down."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); context.setDecommissioned(true); dispatcher.getEventHandler().handle( if (response.getNodeAction() == NodeAction.RESYNC) { LOG.warn("Node is out of sync with ResourceManager," + " hence resyncing."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); .getContainersToBeRemovedFromNM()); lastHeartbeatID = response.getResponseId(); List<ContainerId> containersToCleanup = response .getContainersToCleanup(); if (!containersToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( response.getApplicationsToCleanup(); response.getSystemCredentialsForApps(); if (systemCredentials != null && !systemCredentials.isEmpty()) {
nextHeartBeatInterval = response.getNextHeartBeatInterval(); updateMasterKeys(response); .getContainersToBeRemovedFromNM()); lastHeartbeatID = response.getResponseId(); List<ContainerId> containersToCleanup = response .getContainersToCleanup(); if (!containersToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( response.getApplicationsToCleanup(); response.getSystemCredentialsForApps(); if (systemCredentials != null && !systemCredentials.isEmpty()) { ((NMContext) context).setSystemCrendentialsForApps( containersToUpdate = response.getContainersToUpdate(); if (!containersToUpdate.isEmpty()) { dispatcher.getEventHandler().handle( .getContainersToSignalList(); if (!containersToSignal.isEmpty()) { dispatcher.getEventHandler().handle( response.getContainerQueuingLimit(); if (queuingLimit != null) { context.getContainerManager().updateQueuingLimit(queuingLimit);
Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); writeToHostsFile("host2"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile "Node should not have been decomissioned.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals("Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(), NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
NodeHeartbeatResponse response = resourceTrackerService.nodeHeartbeat( nodeHeartBeatRequest); Assert.assertTrue(response.getResponseId() == 1); nodeStatus.setResponseId(response.getResponseId()); response = resourceTrackerService.nodeHeartbeat(nodeHeartBeatRequest); Assert.assertTrue(response.getResponseId() == 2); Assert.assertTrue(response.getResponseId() == 2); Assert.assertTrue(NodeAction.RESYNC.equals(response.getNodeAction())); Assert.assertEquals("Too far behind rm response id:2 nm response id:0", response.getDiagnosticsMessage());
responseId = heartbeatResponse.getResponseId(); MasterKey masterKeyFromRM = heartbeatResponse.getContainerTokenMasterKey(); if (masterKeyFromRM != null && masterKeyFromRM.getKeyId() != this.currentContainerTokenMasterKey masterKeyFromRM = heartbeatResponse.getNMTokenMasterKey(); if (masterKeyFromRM != null && masterKeyFromRM.getKeyId() != this.currentNMTokenMasterKey Resource newResource = heartbeatResponse.getResource(); if (newResource != null) { capability = Resources.clone(newResource);
@Test public void testResponseIdOverflow() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); // prepare the responseId that's about to overflow RMNode node = rm.getRMContext().getRMNodes().get(nm1.getNodeId()); node.getLastNodeHeartBeatResponse().setResponseId(Integer.MAX_VALUE); nm1.setResponseId(Integer.MAX_VALUE); // heartbeat twice and check responseId nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); Assert.assertEquals(0, nodeHeartbeat.getResponseId()); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); Assert.assertEquals(1, nodeHeartbeat.getResponseId()); } }
@Test(timeout=20000) public void testUpdateHeartbeatResponseForCleanup() { RMNodeImpl node = getRunningNode(); NodeId nodeId = node.getNodeID(); // Expire a container ContainerId completedContainerId = BuilderUtils.newContainerId( BuilderUtils.newApplicationAttemptId( BuilderUtils.newApplicationId(0, 0), 0), 0); node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId)); Assert.assertEquals(1, node.getContainersToCleanUp().size()); // Finish an application ApplicationId finishedAppId = BuilderUtils.newApplicationId(0, 1); node.handle(new RMNodeCleanAppEvent(nodeId, finishedAppId)); Assert.assertEquals(1, node.getAppsToCleanup().size()); // Verify status update does not clear containers/apps to cleanup // but updating heartbeat response for cleanup does RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent(null); node.handle(statusEvent); Assert.assertEquals(1, node.getContainersToCleanUp().size()); Assert.assertEquals(1, node.getAppsToCleanup().size()); NodeHeartbeatResponse hbrsp = Records.newRecord(NodeHeartbeatResponse.class); node.setAndUpdateNodeHeartbeatResponse(hbrsp); Assert.assertEquals(0, node.getContainersToCleanUp().size()); Assert.assertEquals(0, node.getAppsToCleanup().size()); Assert.assertEquals(1, hbrsp.getContainersToCleanup().size()); Assert.assertEquals(completedContainerId, hbrsp.getContainersToCleanup().get(0)); Assert.assertEquals(1, hbrsp.getApplicationsToCleanup().size()); Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0)); }
private boolean handleShutdownOrResyncCommand( NodeHeartbeatResponse response) { if (response.getNodeAction() == NodeAction.SHUTDOWN) { LOG.warn("Received SHUTDOWN signal from Resourcemanager as part of" + " heartbeat, hence shutting down."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); context.setDecommissioned(true); dispatcher.getEventHandler().handle( new NodeManagerEvent(NodeManagerEventType.SHUTDOWN)); return true; } if (response.getNodeAction() == NodeAction.RESYNC) { LOG.warn("Node is out of sync with ResourceManager," + " hence resyncing."); LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); // Invalidate the RMIdentifier while resync NodeStatusUpdaterImpl.this.rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER; dispatcher.getEventHandler().handle( new NodeManagerEvent(NodeManagerEventType.RESYNC)); pendingCompletedContainers.clear(); return true; } return false; }
+ nodeId.getHost(); LOG.info(message); shutDown.setDiagnosticsMessage(message); return shutDown; resync.setDiagnosticsMessage(message); return resync; .getResponseId()) { LOG.info("Received duplicate heartbeat from node " + rmNode.getNodeAddress()+ " responseId=" + remoteNodeStatus.getResponseId()); return lastNodeHeartbeatResponse; } else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse .getResponseId()) { String message = "Too far behind rm response id:" + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + remoteNodeStatus.getResponseId(); LOG.info(message); resync.setDiagnosticsMessage(message); getResponseId() + 1, NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval); rmNode.updateNodeHeartbeatResponseForCleanup(nodeHeartBeatResponse); rmContext.getSystemCredentialsForApps(); if (!systemCredentials.isEmpty()) { nodeHeartBeatResponse.setSystemCredentialsForApps(systemCredentials);
public void heartbeat() throws IOException, YarnException { NodeStatus nodeStatus = org.apache.hadoop.yarn.server.resourcemanager.NodeManager.createNodeStatus( nodeId, getContainerStatuses(containers)); nodeStatus.setResponseId(responseID); NodeHeartbeatRequest request = recordFactory .newRecordInstance(NodeHeartbeatRequest.class); request.setNodeStatus(nodeStatus); NodeHeartbeatResponse response = resourceTrackerService .nodeHeartbeat(request); responseID = response.getResponseId(); }
if (getNextResponseId( remoteNodeStatus.getResponseId()) == lastNodeHeartbeatResponse .getResponseId()) { LOG.info("Received duplicate heartbeat from node " + rmNode.getNodeAddress()+ " responseId=" + remoteNodeStatus.getResponseId()); return lastNodeHeartbeatResponse; } else if (remoteNodeStatus.getResponseId() != lastNodeHeartbeatResponse .getResponseId()) { String message = "Too far behind rm response id:" + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + remoteNodeStatus.getResponseId(); LOG.info(message); getNextResponseId(lastNodeHeartbeatResponse.getResponseId()), NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval); rmNode.setAndUpdateNodeHeartbeatResponse(nodeHeartBeatResponse); rmContext.getSystemCredentialsForApps(); if (!systemCredentials.isEmpty()) { nodeHeartBeatResponse.setSystemCredentialsForApps(systemCredentials); NodeLabelsUtils.convertToStringSet(request.getNodeLabels()), nodeId); nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(true); } catch (IOException ex) { nodeHeartBeatResponse.setDiagnosticsMessage(ex.getMessage()); nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(false);
@Override public void resetLastNodeHeartBeatResponse() { this.writeLock.lock(); try { latestNodeHeartBeatResponse.setResponseId(0); } finally { this.writeLock.unlock(); } }
nodeHeartbeatResponse.getNodeAction()); Assert.assertFalse(nodeHeartbeatResponse.getAreNodeLabelsAcceptedByRM()); Assert.assertNotNull(nodeHeartbeatResponse.getDiagnosticsMessage()); rm.stop();
NodeHeartbeatResponse beatResponse = rm.getResourceTrackerService().nodeHeartbeat(beatRequest); if (! beatResponse.getContainersToCleanup().isEmpty()) { for (ContainerId containerId : beatResponse.getContainersToCleanup()){ if (amContainerList.contains(containerId)) { if (beatResponse.getNodeAction() == NodeAction.SHUTDOWN) { lastStep();
NodeHeartbeatResponse response = nm1.nodeHeartbeat(true); ByteBuffer tokenBuffer = response.getSystemCredentialsForApps().get(app.getApplicationId()); Assert.assertNotNull(tokenBuffer); Credentials appCredentials = new Credentials();