/**
 * Disconnects the job manager of the given job by delegating to
 * {@code closeJobManagerConnection}.
 *
 * @param jobId id of the job whose job manager connection is closed
 * @param cause reason for the disconnect, forwarded unchanged
 */
@Override
public void disconnectJobManager(
        final JobID jobId,
        final Exception cause) {

    closeJobManagerConnection(jobId, cause);
}
/**
 * Disconnects the task manager with the given resource id by delegating to
 * {@code closeTaskManagerConnection}.
 *
 * @param resourceId resource id of the task manager whose connection is closed
 * @param cause reason for the disconnect, forwarded unchanged
 */
@Override
public void disconnectTaskManager(
        final ResourceID resourceId,
        final Exception cause) {

    closeTaskManagerConnection(resourceId, cause);
}
/**
 * Removes the given job from the job leader id service and, if a job manager is
 * still registered for it, disconnects that job manager.
 *
 * <p>A failure to remove the job from the job leader id service is only logged;
 * the job manager disconnect is still attempted afterwards.
 *
 * @param jobId id of the job to remove
 */
protected void removeJob(JobID jobId) {
    try {
        jobLeaderIdService.removeJob(jobId);
    } catch (Exception e) {
        // best effort: keep going so the registered job manager is still disconnected
        log.warn("Could not properly remove the job {} from the job leader id service.", jobId, e);
    }

    if (jobManagerRegistrations.containsKey(jobId)) {
        disconnectJobManager(jobId, new Exception("Job " + jobId + " was removed"));
    }
}
/**
 * Shuts down the resource manager and hands back its termination future.
 *
 * <p>Both steps are performed while holding {@code lock}.
 *
 * @return the termination future of the resource manager
 */
@Override
public CompletableFuture<Void> closeAsync() {
    synchronized (lock) {
        resourceManager.shutDown();

        final CompletableFuture<Void> terminationFuture = resourceManager.getTerminationFuture();
        return terminationFuture;
    }
}
/**
 * Leadership callback invoked when this resourceManager loses leadership.
 *
 * <p>The actual work is scheduled via {@code runAsyncWithoutFencing}: the internal
 * state is cleared, the fencing token is reset to {@code null} and the slot manager
 * is suspended, in that order.
 */
@Override
public void revokeLeadership() {
    final Runnable revocation = () -> {
        log.info("ResourceManager {} was revoked leadership. Clearing fencing token.", getAddress());

        clearStateInternal();

        setFencingToken(null);

        slotManager.suspend();
    };

    runAsyncWithoutFencing(revocation);
}
/**
 * Leadership callback invoked when this resourceManager loses leadership.
 *
 * <p>The actual work is scheduled via {@code runAsyncWithoutFencing}: the state is
 * cleared, the fencing token is reset to {@code null} and the slot manager is
 * suspended, in that order.
 */
@Override
public void revokeLeadership() {
    final Runnable revocation = () -> {
        log.info("ResourceManager {} was revoked leadership. Clearing fencing token.", getAddress());

        clearState();

        setFencingToken(null);

        slotManager.suspend();
    };

    runAsyncWithoutFencing(revocation);
}
/** * Callback method when current resourceManager is granted leadership. * * @param newLeaderSessionID unique leadershipID */ @Override public void grantLeadership(final UUID newLeaderSessionID) { runAsyncWithoutFencing( () -> { final ResourceManagerId newResourceManagerId = ResourceManagerId.fromUuid(newLeaderSessionID); log.info("ResourceManager {} was granted leadership with fencing token {}", getAddress(), newResourceManagerId); // clear the state if we've been the leader before if (getFencingToken() != null) { clearState(); } setFencingToken(newResourceManagerId); slotManager.start(getFencingToken(), getMainThreadExecutor(), new ResourceActionsImpl()); getRpcService().execute( () -> // confirming the leader session ID might be blocking, leaderElectionService.confirmLeaderSessionID(newLeaderSessionID)); }); }
jobId + " to the job id leader service.", e); onFatalError(exception); "job leader id future to verify the correct job leader.", e); onFatalError(exception); CompletableFuture<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, jobMasterId, JobMasterGateway.class); (JobMasterGateway jobMasterGateway, JobMasterId currentJobMasterId) -> { if (Objects.equals(currentJobMasterId, jobMasterId)) { return registerJobMasterInternal( jobMasterGateway, jobId, getMainThreadExecutor()); getRpcService().getExecutor());
private CompletableFuture<Boolean> tryAcceptLeadership(final UUID newLeaderSessionID) { if (leaderElectionService.hasLeadership(newLeaderSessionID)) { final ResourceManagerId newResourceManagerId = ResourceManagerId.fromUuid(newLeaderSessionID); log.info("ResourceManager {} was granted leadership with fencing token {}", getAddress(), newResourceManagerId); // clear the state if we've been the leader before if (getFencingToken() != null) { clearStateInternal(); } setFencingToken(newResourceManagerId); slotManager.start(getFencingToken(), getMainThreadExecutor(), new ResourceActionsImpl()); return prepareLeadershipAsync().thenApply(ignored -> true); } else { return CompletableFuture.completedFuture(false); } }
rpcService, highAvailabilityServices, resourceManager.getSelfGateway(ResourceManagerGateway.class), blobServer, heartbeatServices, resourceManager.start(); resourceManagerRetrievalService.start(resourceManagerGatewayRetriever); resourceManager.shutDown(); terminationFutures.add(resourceManager.getTerminationFuture());
/**
 * Registers a task executor at this resource manager.
 *
 * <p>A gateway to the task executor is requested from the RPC service. Once that
 * connection attempt finishes, the outcome is handled on the main thread executor:
 * a failed connection yields a {@code RegistrationResponse.Decline} carrying the
 * failure message, a successful one is passed to {@code registerTaskExecutorInternal}.
 *
 * @param taskExecutorAddress address of the task executor to connect to
 * @param taskExecutorResourceId resource id of the task executor
 * @param dataPort data port forwarded to the internal registration
 * @param hardwareDescription hardware description forwarded to the internal registration
 * @param timeout not used inside this method
 * @return future holding the registration response
 */
@Override
public CompletableFuture<RegistrationResponse> registerTaskExecutor(
        final String taskExecutorAddress,
        final ResourceID taskExecutorResourceId,
        final int dataPort,
        final HardwareDescription hardwareDescription,
        final Time timeout) {

    return getRpcService()
        .connect(taskExecutorAddress, TaskExecutorGateway.class)
        .handleAsync(
            (TaskExecutorGateway gateway, Throwable connectFailure) -> {
                if (connectFailure == null) {
                    return registerTaskExecutorInternal(
                        gateway,
                        taskExecutorAddress,
                        taskExecutorResourceId,
                        dataPort,
                        hardwareDescription);
                }

                return new RegistrationResponse.Decline(connectFailure.getMessage());
            },
            getMainThreadExecutor());
}
/**
 * Registers an info message listener.
 *
 * <p>A registration for an address that is already known is only logged as a duplicate.
 * Otherwise a gateway to the listener is requested; on success it is stored under the
 * address, on failure a warning including the failure cause is logged and nothing is
 * registered. The result is handled on the main thread executor.
 *
 * @param address address of infoMessage listener to register to this resource manager
 */
@Override
public void registerInfoMessageListener(final String address) {
    if (infoMessageListeners.containsKey(address)) {
        log.warn("Receive a duplicate registration from info message listener on ({})", address);
        return;
    }

    CompletableFuture<InfoMessageListenerRpcGateway> infoMessageListenerRpcGatewayFuture = getRpcService()
        .connect(address, InfoMessageListenerRpcGateway.class);

    infoMessageListenerRpcGatewayFuture.whenCompleteAsync(
        (InfoMessageListenerRpcGateway gateway, Throwable failure) -> {
            if (failure != null) {
                // pass the throwable as trailing argument so the reason for the failed connect is logged
                log.warn("Receive a registration from unreachable info message listener on ({})", address, failure);
            } else {
                log.info("Receive a registration from info message listener on ({})", address);
                infoMessageListeners.put(address, gateway);
            }
        },
        getMainThreadExecutor());
}
/** * Callback method when current resourceManager is granted leadership. * * @param newLeaderSessionID unique leadershipID */ @Override public void grantLeadership(final UUID newLeaderSessionID) { final CompletableFuture<Boolean> acceptLeadershipFuture = clearStateFuture .thenComposeAsync((ignored) -> tryAcceptLeadership(newLeaderSessionID), getUnfencedMainThreadExecutor()); final CompletableFuture<Void> confirmationFuture = acceptLeadershipFuture.thenAcceptAsync( (acceptLeadership) -> { if (acceptLeadership) { // confirming the leader session ID might be blocking, leaderElectionService.confirmLeaderSessionID(newLeaderSessionID); } }, getRpcService().getExecutor()); confirmationFuture.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { onFatalError(ExceptionUtils.stripCompletionException(throwable)); } }); }
protected void releaseResource(InstanceID instanceId, Exception cause) { WorkerType worker = null; // TODO: Improve performance by having an index on the instanceId for (Map.Entry<ResourceID, WorkerRegistration<WorkerType>> entry : taskExecutors.entrySet()) { if (entry.getValue().getInstanceID().equals(instanceId)) { worker = entry.getValue().getWorker(); break; } } if (worker != null) { if (stopWorker(worker)) { closeTaskManagerConnection(worker.getResourceID(), cause); } else { log.debug("Worker {} could not be stopped.", worker.getResourceID()); } } else { // unregister in order to clean up potential left over state slotManager.unregisterTaskManager(instanceId); } }
} else { disconnectJobManager( oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found.")); getFencingToken(), resourceId);
checkNotNull(jobId); if (isValid(resourceManagerLeaderId)) { if (!jobLeaderIdService.containsJob(jobId)) { try { jobId + " to the job id leader service.", e); onFatalErrorAsync(exception); "job leader id future to verify the correct job leader.", e); onFatalErrorAsync(exception); Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class); }, getMainThreadExecutor()); }, getRpcService().getExecutor()); } else { log.debug("Discard register job manager message from {}, because the leader id " +
@Override public void run() { onFatalError(t); } });
@Override public void run() { log.info("ResourceManager {} was granted leadership with leader session ID {}", getAddress(), newLeaderSessionID); // clear the state if we've been the leader before if (leaderSessionId != null) { clearState(); } leaderSessionId = newLeaderSessionID; slotManager.start(leaderSessionId, getMainThreadExecutor(), new ResourceManagerActionsImpl()); getRpcService().execute(new Runnable() { @Override public void run() { // confirming the leader session ID might be blocking, leaderElectionService.confirmLeaderSessionID(newLeaderSessionID); } }); } });
/**
 * Drops the locally kept registrations, clears the job leader id service and then
 * replaces {@code clearStateFuture} with the result of {@code clearStateAsync()}.
 *
 * <p>A failure while clearing the job leader id service is reported through
 * {@code onFatalError}; {@code clearStateAsync()} is still invoked afterwards if that
 * call returns.
 */
private void clearStateInternal() {
    jobManagerRegistrations.clear();
    jmResourceIdRegistrations.clear();
    taskExecutors.clear();

    try {
        jobLeaderIdService.clear();
    } catch (Exception clearFailure) {
        final ResourceManagerException fatal =
            new ResourceManagerException("Could not properly clear the job leader id service.", clearFailure);
        onFatalError(fatal);
    }

    clearStateFuture = clearStateAsync();
}
@Override public void start() throws Exception { // start a leader super.start(); leaderElectionService = highAvailabilityServices.getResourceManagerLeaderElectionService(); initialize(); try { leaderElectionService.start(this); } catch (Exception e) { throw new ResourceManagerException("Could not start the leader election service.", e); } try { jobLeaderIdService.start(new JobLeaderIdActionsImpl()); } catch (Exception e) { throw new ResourceManagerException("Could not start the job leader id service.", e); } registerSlotAndTaskExecutorMetrics(); }