/** * Suspending job, all the running tasks will be cancelled, and communication with other components * will be disposed. * * <p>Mostly job is suspended because of the leadership has been revoked, one can be restart this job by * calling the {@link #start(JobMasterId, Time)} method once we take the leadership back again. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // the slot pool stops receiving messages and clears its pooled slots slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
/** * Suspending job, all the running tasks will be cancelled, and communication with other components * will be disposed. * * <p>Mostly job is suspended because of the leadership has been revoked, one can be restart this job by * calling the {@link #start(JobMasterId, Time)} method once we take the leadership back again. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // the slot pool stops receiving messages and clears its pooled slots slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
/** * Suspending job, and communication with other components will be disposed. * * <p>Mostly job is suspended without cancelling running tasks because of the leadership has been revoked, * the one who takes the leadership can take over the control. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // flush the operation logs. operationLogManager.stop(); // the slot pool stops receiving messages and clears its pooled slots. slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
/**
 * Schedules the execution graph, replacing it with a freshly created one first unless the
 * current graph is still in {@link JobStatus#CREATED}.
 *
 * <p>When a replacement is needed, the current graph is suspended, a new metric group and a
 * new execution graph are created, and the new graph is assigned via the main thread executor
 * once the old graph's termination future has completed (normally or exceptionally).
 * Scheduling is chained after that assignment.
 *
 * @throws Exception declared by the method; presumably raised while suspending the old graph
 *     or creating the replacement graph (confirm against the called helpers)
 */
private void resetAndScheduleExecutionGraph() throws Exception {
    validateRunsInMainThread();

    // A graph that has not left CREATED can be scheduled as is. The already-completed future
    // keeps scheduling on the same chained path as the replacement case.
    if (executionGraph.getState() == JobStatus.CREATED) {
        CompletableFuture.<Void>completedFuture(null).thenRun(this::scheduleExecutionGraph);
        return;
    }

    suspendAndClearExecutionGraphFields(
        new FlinkException("ExecutionGraph is being reset in order to be rescheduled."));

    final JobManagerJobMetricGroup freshMetricGroup = jobMetricGroupFactory.create(jobGraph);
    final ExecutionGraph freshExecutionGraph = createAndRestoreExecutionGraph(freshMetricGroup);

    // Swap in the new graph only after the old one has terminated; the outcome of the
    // termination (status or failure) is deliberately not inspected.
    final CompletableFuture<Void> freshGraphAssigned = executionGraph.getTerminationFuture().handleAsync(
        (JobStatus ignoredStatus, Throwable ignoredFailure) -> {
            assignExecutionGraph(freshExecutionGraph, freshMetricGroup);
            return null;
        },
        getMainThreadExecutor());

    // NOTE(review): the future returned by thenRun is discarded, so a failure inside
    // scheduleExecutionGraph is not propagated to the caller from here.
    freshGraphAssigned.thenRun(this::scheduleExecutionGraph);
}
/**
 * Schedules the execution graph, replacing it with a freshly created one first unless the
 * current graph is still in {@link JobStatus#CREATED}.
 *
 * <p>When a replacement is needed, the current graph is suspended, a new metric group and a
 * new execution graph are created, and the new graph is assigned via the main thread executor
 * once the old graph's termination future has completed (normally or exceptionally).
 * Scheduling is chained after that assignment.
 *
 * @throws Exception declared by the method; presumably raised while suspending the old graph
 *     or creating the replacement graph (confirm against the called helpers)
 */
private void resetAndScheduleExecutionGraph() throws Exception {
    validateRunsInMainThread();

    // A graph that has not left CREATED can be scheduled as is. The already-completed future
    // keeps scheduling on the same chained path as the replacement case.
    if (executionGraph.getState() == JobStatus.CREATED) {
        CompletableFuture.<Void>completedFuture(null).thenRun(this::scheduleExecutionGraph);
        return;
    }

    suspendAndClearExecutionGraphFields(
        new FlinkException("ExecutionGraph is being reset in order to be rescheduled."));

    final JobManagerJobMetricGroup freshMetricGroup = jobMetricGroupFactory.create(jobGraph);
    final ExecutionGraph freshExecutionGraph = createAndRestoreExecutionGraph(freshMetricGroup);

    // Swap in the new graph only after the old one has terminated; the outcome of the
    // termination (status or failure) is deliberately not inspected.
    final CompletableFuture<Void> freshGraphAssigned = executionGraph.getTerminationFuture().handleAsync(
        (JobStatus ignoredStatus, Throwable ignoredFailure) -> {
            assignExecutionGraph(freshExecutionGraph, freshMetricGroup);
            return null;
        },
        getMainThreadExecutor());

    // NOTE(review): the future returned by thenRun is discarded, so a failure inside
    // scheduleExecutionGraph is not propagated to the caller from here.
    freshGraphAssigned.thenRun(this::scheduleExecutionGraph);
}