private void makeAllOneRegion(List<ExecutionJobVertex> jobVertices) { LOG.warn("Cannot decompose ExecutionGraph into individual failover regions due to use of " + "Co-Location constraints (iterations). Job will fail over as one holistic unit."); final ArrayList<ExecutionVertex> allVertices = new ArrayList<>(); for (ExecutionJobVertex ejv : jobVertices) { // safe some incremental size growing allVertices.ensureCapacity(allVertices.size() + ejv.getParallelism()); for (ExecutionVertex ev : ejv.getTaskVertices()) { allVertices.add(ev); } } final FailoverRegion singleRegion = new FailoverRegion(executionGraph, executor, allVertices, regionFailLimit); for (ExecutionVertex ev : allVertices) { vertexToRegion.put(ev, singleRegion); } }
private void reset(long globalModVersionOfFailover) { if (transitionState(JobStatus.CANCELED, JobStatus.CREATED)) { // reset all connected ExecutionVertexes final Collection<CoLocationGroup> colGroups = new HashSet<>(); for (ExecutionVertex ev : connectedExecutionVertices) { CoLocationGroup cgroup = ev.getJobVertex().getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } } restart(globalModVersionOfFailover); } else { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " switch from CANCELLED to CREATED fail.")); } }
/**
 * Invoked once all vertices of the region have reached a terminal state. Moves the region
 * from CANCELLING to CANCELED and then resets it; in any other state it only logs.
 *
 * @param globalModVersionOfFailover the global modification version this recovery belongs to
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    for (;;) {
        final JobStatus observed = this.state;

        if (!observed.equals(JobStatus.CANCELLING)) {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            return;
        }

        if (transitionState(observed, JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
            return;
        }
        // the transition was refused: read the state again and retry
    }
}
/** * Restart the region by notify the schedule plugin. */ private void restart(long globalModVersionOfFailover) { try { if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { // Let the scheduler event to reschedule connected ExecutionVertices executionGraph.resetExecutionVerticesAndNotify(globalModVersionOfFailover, connectedExecutionVertices); } else { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " witch from CREATED to RUNNING fail.")); } } catch (GlobalModVersionMismatch e) { // happens when a global recovery happens concurrently to the regional recovery // should do nothing } catch (Exception e) { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " restart failed.", e)); } }
/**
 * Starts a failover of this region. If the restart strategy does not allow a restart, the
 * whole job is failed globally. Otherwise a RUNNING region is cancelled, a CANCELED region
 * is reset, and any other state is only logged.
 *
 * @param globalModVersionOfFailover the global modification version this recovery belongs to
 */
private void failover(long globalModVersionOfFailover) {
    if (!executionGraph.getRestartStrategy().canRestart()) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
        return;
    }

    final JobStatus observed = this.state;

    if (observed.equals(JobStatus.RUNNING)) {
        cancel(globalModVersionOfFailover);
        return;
    }

    if (observed.equals(JobStatus.CANCELED)) {
        reset(globalModVersionOfFailover);
        return;
    }

    LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
}
JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { (Void ignored, Throwable throwable) -> { if (throwable != null) { failover(globalModVersionOfFailover, new FlinkException("Could not cancel all execution job vertices properly.", throwable)); } else { allVerticesInTerminalState(globalModVersionOfFailover);
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync( (Void value) -> allVerticesInTerminalState(globalModVersionOfFailover), executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
/**
 * Routes a task failure to the failover region registered for the failed execution's vertex.
 * If no region is registered for that vertex, the whole job is failed globally.
 *
 * @param taskExecution the execution that failed
 * @param cause the reason for the failure
 */
@Override
public void onTaskFailure(Execution taskExecution, Throwable cause) {
    final ExecutionVertex failedVertex = taskExecution.getVertex();
    final FailoverRegion region = vertexToRegion.get(failedVertex);

    if (region != null) {
        LOG.info("Recovering task failure for {} #{} ({}) via restart of failover region",
                taskExecution.getVertex().getTaskNameWithSubtaskIndex(),
                taskExecution.getAttemptNumber(),
                taskExecution.getAttemptId());

        region.onExecutionFail(taskExecution, cause);
        return;
    }

    // no region known for this vertex: regional recovery is not possible
    final String vertexName = failedVertex.getTaskNameWithSubtaskIndex();
    executionGraph.failGlobal(new FlinkException(
            "Can not find a failover region for the execution " + vertexName, cause));
}
public void onExecutionFail(Execution taskExecution, Throwable cause) { // TODO: check if need to failover the preceding region if (!executionGraph.getRestartStrategy().canRestart()) { // delegate the failure to a global fail that will check the restart strategy and not restart executionGraph.failGlobal(cause); } else { cancel(taskExecution.getGlobalModVersion()); } }
public void onExecutionFail(Execution taskExecution, Throwable cause) { // TODO: check if need to failover the preceding region failover(taskExecution.getGlobalModVersion(), cause); }
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<Future<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync(new AcceptFunction<Void>() { @Override public void accept(Void value) { allVerticesInTerminalState(globalModVersionOfFailover); } }, executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
/**
 * Starts a failover of this region. The restart strategy is consulted first: if it does not
 * permit a restart, the job is failed globally. Otherwise the reaction depends on the state
 * read from {@code this.state}: RUNNING is cancelled, CANCELED is reset, anything else is logged.
 *
 * @param globalModVersionOfFailover the global modification version this recovery belongs to
 */
private void failover(long globalModVersionOfFailover) {
    final boolean restartAllowed = executionGraph.getRestartStrategy().canRestart();

    if (!restartAllowed) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
    } else {
        final JobStatus snapshot = this.state;
        final boolean running = snapshot.equals(JobStatus.RUNNING);

        if (running) {
            cancel(globalModVersionOfFailover);
        } else if (snapshot.equals(JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
        } else {
            LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
        }
    }
}
private void restart(long globalModVersionOfFailover) { try { if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { failover(globalModVersionOfFailover); failover(globalModVersionOfFailover); failover(globalModVersionOfFailover);
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync( (Void value) -> allVerticesInTerminalState(globalModVersionOfFailover), executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
/**
 * Handles the failure of a task execution: looks up the failover region of the execution's
 * vertex in {@code vertexToRegion} and lets that region deal with the failure. Without a
 * registered region the job is failed globally.
 *
 * @param taskExecution the execution that failed
 * @param cause the reason for the failure
 */
@Override
public void onTaskFailure(Execution taskExecution, Throwable cause) {
    final ExecutionVertex vertexOfFailure = taskExecution.getVertex();
    final FailoverRegion owningRegion = vertexToRegion.get(vertexOfFailure);

    final boolean regionKnown = owningRegion != null;
    if (!regionKnown) {
        final FlinkException noRegion = new FlinkException(
                "Can not find a failover region for the execution " + vertexOfFailure.getTaskNameWithSubtaskIndex(),
                cause);
        executionGraph.failGlobal(noRegion);
        return;
    }

    LOG.info("Recovering task failure for {} #{} ({}) via restart of failover region",
            taskExecution.getVertex().getTaskNameWithSubtaskIndex(),
            taskExecution.getAttemptNumber(),
            taskExecution.getAttemptId());

    owningRegion.onExecutionFail(taskExecution, cause);
}
public void onExecutionFail(Execution taskExecution, Throwable cause) { // TODO: check if need to failover the preceding region if (!executionGraph.getRestartStrategy().canRestart()) { // delegate the failure to a global fail that will check the restart strategy and not restart executionGraph.failGlobal(cause); } else { cancel(taskExecution.getGlobalModVersion()); } }
if (transitionState(JobStatus.CANCELED, JobStatus.CREATED)) { restart(globalModVersionOfFailover); failover(globalModVersionOfFailover); failover(globalModVersionOfFailover);
/**
 * Continuation that runs after all vertices of the region reached a terminal state.
 * A region in CANCELLING is moved to CANCELED and then reset; a region in any other state
 * is left untouched and the situation is logged.
 *
 * @param globalModVersionOfFailover the global modification version this recovery belongs to
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    boolean finished = false;

    while (!finished) {
        final JobStatus snapshot = this.state;

        if (snapshot.equals(JobStatus.CANCELLING)) {
            // a refused transition leaves 'finished' false, so the state is read again
            if (transitionState(snapshot, JobStatus.CANCELED)) {
                reset(globalModVersionOfFailover);
                finished = true;
            }
        } else {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            finished = true;
        }
    }
}
/**
 * Entry point for a regional failover. Fails the job globally when the restart strategy
 * cannot restart; otherwise cancels a RUNNING region, resets a CANCELED region, and logs
 * every other state without acting on it.
 *
 * @param globalModVersionOfFailover the global modification version this recovery belongs to
 */
private void failover(long globalModVersionOfFailover) {
    if (executionGraph.getRestartStrategy().canRestart()) {
        final JobStatus current = this.state;

        if (current.equals(JobStatus.RUNNING)) {
            cancel(globalModVersionOfFailover);
        } else if (current.equals(JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
        } else {
            LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
        }
        return;
    }

    // restarting is not permitted: escalate to a global failure
    executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
}
private void restart(long globalModVersionOfFailover) { try { if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { failover(globalModVersionOfFailover); failover(globalModVersionOfFailover); failover(globalModVersionOfFailover);