@Override @SuppressWarnings("unchecked") public void transition(ContainerImpl container, ContainerEvent event) { container.metrics.releaseContainer(container.resource); container.sendFinishedEvents(); //if the current state is NEW it means the CONTAINER_INIT was never // sent for the event, thus no need to send the CONTAINER_STOP if (container.getCurrentState() != org.apache.hadoop.yarn.api.records.ContainerState.NEW) { container.dispatcher.getEventHandler().handle(new AuxServicesEvent (AuxServicesEventType.CONTAINER_STOP, container)); } container.context.getNodeStatusUpdater().sendOutofBandHeartBeat(); } }
private ContainerStatus getContainerStatusInternal(ContainerId containerID, NMTokenIdentifier nmTokenIdentifier) throws YarnException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Getting container-status for " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, false, nmTokenIdentifier); if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil.getRemoteException("Container " + containerIDStr + " was recently stopped on node manager."); } else { throw RPCUtil.getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } ContainerStatus containerStatus = container.cloneAndGetContainerStatus(); LOG.info("Returning " + containerStatus); return containerStatus; }
@SuppressWarnings("unchecked") private void recoverContainer(RecoveredContainerState rcs) throws IOException { StartContainerRequest req = rcs.getStartRequest(); ContainerLaunchContext launchContext = req.getContainerLaunchContext(); ContainerTokenIdentifier token = BuilderUtils.newContainerTokenIdentifier(req.getContainerToken()); ContainerId containerId = token.getContainerID(); ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId(); LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); Application app = context.getApplications().get(appId); if (app != null) { Credentials credentials = YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, context, rcs); context.getContainers().put(containerId, container); app.handle(new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); } LOG.info("Adding " + containerId + " to recently stopped containers"); nodeStatusUpdater.addCompletedContainer(containerId); } }
nm.getNodeStatusUpdater().clearFinishedContainersFromCache();
/** * If the state store throws an error after recovery has been performed * then we can not trust it any more to reflect the NM state. We need to * mark the store and node unhealthy. * Errors during the recovery will cause a service failure and thus a NM * start failure. Do not need to mark the store unhealthy for those. * @param dbErr Exception */ private void markStoreUnHealthy(DBException dbErr) { // Always log the error here, we might not see the error in the caller LOG.error("Statestore exception: ", dbErr); // We have already been marked unhealthy so no need to do it again. if (!isHealthy) { return; } // Mark unhealthy, an out of band heartbeat will be sent and the state // will remain unhealthy (not recoverable). // No need to close the store: does not make any difference at this point. isHealthy = false; // We could get here before the nodeStatusUpdater is set NodeStatusUpdater nsu = getNodeStatusUpdater(); if (nsu != null) { nsu.reportException(dbErr); } }
private void waitForContainerToFinishOnNM(ContainerId containerId) { Context nmContet = yarnCluster.getNodeManager(0).getNMContext(); int interval = 4 * 60; // Max time for container token to expire. Assert.assertNotNull(nmContet.getContainers().containsKey(containerId)); while ((interval-- > 0) && !nmContet.getContainers().get(containerId) .cloneAndGetContainerStatus().getState() .equals(ContainerState.COMPLETE)) { try { LOG.info("Waiting for " + containerId + " to complete."); Thread.sleep(1000); } catch (InterruptedException e) { } } // Normally, Containers will be removed from NM context after they are // explicitly acked by RM. Now, manually remove it for testing. yarnCluster.getNodeManager(0).getNodeStatusUpdater() .addCompletedContainer(containerId); nmContet.getContainers().remove(containerId); }
nm.getNodeStatusUpdater().clearFinishedContainersFromCache();
e.getMessage())); context.getNodeStatusUpdater().reportException(e); return ret; } catch (Throwable e) {
@SuppressWarnings("unchecked") private void recoverContainer(RecoveredContainerState rcs) throws IOException { StartContainerRequest req = rcs.getStartRequest(); ContainerLaunchContext launchContext = req.getContainerLaunchContext(); ContainerTokenIdentifier token = BuilderUtils.newContainerTokenIdentifier(req.getContainerToken()); ContainerId containerId = token.getContainerID(); ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId(); LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); if (context.getApplications().containsKey(appId)) { Credentials credentials = parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, rcs.getStatus(), rcs.getExitCode(), rcs.getDiagnostics(), rcs.getKilled(), context); context.getContainers().put(containerId, container); dispatcher.getEventHandler().handle( new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); } LOG.info("Adding " + containerId + " to recently stopped containers"); nodeStatusUpdater.addCompletedContainer(containerId); } }
private ContainerStatus getContainerStatusInternal(ContainerId containerID, NMTokenIdentifier nmTokenIdentifier) throws YarnException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Getting container-status for " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, false, nmTokenIdentifier); if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil.getRemoteException("Container " + containerIDStr + " was recently stopped on node manager."); } else { throw RPCUtil.getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } ContainerStatus containerStatus = container.cloneAndGetContainerStatus(); LOG.info("Returning " + containerStatus); return containerStatus; }
@Override @SuppressWarnings("unchecked") public void transition(ContainerImpl container, ContainerEvent event) { container.metrics.releaseContainer(container.resource); container.sendFinishedEvents(); //if the current state is NEW it means the CONTAINER_INIT was never // sent for the event, thus no need to send the CONTAINER_STOP if (container.getCurrentState() != org.apache.hadoop.yarn.api.records.ContainerState.NEW) { container.dispatcher.getEventHandler().handle(new AuxServicesEvent (AuxServicesEventType.CONTAINER_STOP, container)); } container.context.getNodeStatusUpdater().sendOutofBandHeartBeat(); } }
e.getMessage())); getContext().getNodeStatusUpdater().reportException(e); return ret; } catch (Throwable e) {
@SuppressWarnings("unchecked") private void recoverContainer(RecoveredContainerState rcs) throws IOException { StartContainerRequest req = rcs.getStartRequest(); ContainerLaunchContext launchContext = req.getContainerLaunchContext(); ContainerTokenIdentifier token = BuilderUtils.newContainerTokenIdentifier(req.getContainerToken()); ContainerId containerId = token.getContainerID(); ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId(); LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); if (context.getApplications().containsKey(appId)) { Credentials credentials = parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, rcs.getStatus(), rcs.getExitCode(), rcs.getDiagnostics(), rcs.getKilled(), context); context.getContainers().put(containerId, container); dispatcher.getEventHandler().handle( new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); } LOG.info("Adding " + containerId + " to recently stopped containers"); nodeStatusUpdater.addCompletedContainer(containerId); } }
private ContainerStatus getContainerStatusInternal(ContainerId containerID, NMTokenIdentifier nmTokenIdentifier) throws YarnException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Getting container-status for " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, false, nmTokenIdentifier); if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil.getRemoteException("Container " + containerIDStr + " was recently stopped on node manager."); } else { throw RPCUtil.getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } ContainerStatus containerStatus = container.cloneAndGetContainerStatus(); LOG.info("Returning " + containerStatus); return containerStatus; }
@Override @SuppressWarnings("unchecked") public void transition(ContainerImpl container, ContainerEvent event) { container.metrics.releaseContainer(container.resource); container.sendFinishedEvents(); //if the current state is NEW it means the CONTAINER_INIT was never // sent for the event, thus no need to send the CONTAINER_STOP if (container.getCurrentState() != org.apache.hadoop.yarn.api.records.ContainerState.NEW) { container.dispatcher.getEventHandler().handle(new AuxServicesEvent (AuxServicesEventType.CONTAINER_STOP, container)); } container.context.getNodeStatusUpdater().sendOutofBandHeartBeat(); } }
public void init(AMRMProxyApplicationContext applicationContext) { super.init(applicationContext); initLocal(applicationContext.getNMCotext().getNodeStatusUpdater() .getRMIdentifier(), applicationContext.getApplicationAttemptId(), applicationContext.getNMCotext().getContainerAllocator(), applicationContext.getNMCotext().getNMTokenSecretManager(), applicationContext.getUser()); }
private void waitForContainerToFinishOnNM(ContainerId containerId) throws InterruptedException { Context nmContext = yarnCluster.getNodeManager(0).getNMContext(); // Max time for container token to expire. final int timeout = 4 * 60 * 1000; // If the container is null, then it has already completed and been removed // from the Context by asynchronous calls. Container waitContainer = nmContext.getContainers().get(containerId); if (waitContainer != null) { try { LOG.info("Waiting for " + containerId + " to get to state " + ContainerState.COMPLETE); GenericTestUtils.waitFor(() -> ContainerState.COMPLETE.equals( waitContainer.cloneAndGetContainerStatus().getState()), 500, timeout); } catch (TimeoutException te) { LOG.error("TimeoutException", te); fail("Was waiting for " + containerId + " to get to state " + ContainerState.COMPLETE + " but was in state " + waitContainer.cloneAndGetContainerStatus().getState() + " after the timeout"); } } // Normally, Containers will be removed from NM context after they are // explicitly acked by RM. Now, manually remove it for testing. yarnCluster.getNodeManager(0).getNodeStatusUpdater() .addCompletedContainer(containerId); LOG.info("Removing container from NMContext, containerID = " + containerId); nmContext.getContainers().remove(containerId); }
protected ContainerStatus getContainerStatusInternal(ContainerId containerID, NMTokenIdentifier nmTokenIdentifier) throws YarnException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Getting container-status for " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, false, nmTokenIdentifier); if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil.getRemoteException("Container " + containerIDStr + " was recently stopped on node manager."); } else { throw RPCUtil.getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } ContainerStatus containerStatus = container.cloneAndGetContainerStatus(); logContainerStatus("Returning ", containerStatus); return containerStatus; }