.requestKvStateLocation(jobId, queryableStateName)
    .whenComplete(
        (KvStateLocation kvStateLocation, Throwable throwable) -> {
@Test
public void testStopJobAfterSavepoint() throws Exception {
    setUpWithCheckpointInterval(10L);

    final String savepointLocation = cancelWithSavepoint();
    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get();

    assertThat(jobStatus, isOneOf(JobStatus.CANCELED, JobStatus.CANCELLING));

    final List<Path> savepoints;
    try (Stream<Path> savepointFiles = Files.list(savepointDirectory)) {
        savepoints = savepointFiles.map(Path::getFileName).collect(Collectors.toList());
    }
    assertThat(savepoints, hasItem(Paths.get(savepointLocation).getFileName()));
}
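// Note: the cancelWithSavepoint() helper used above (and in the tests further below)
// is not part of this excerpt. A minimal sketch of what it could look like, assuming
// the clusterClient, jobGraph, and savepointDirectory fields from these tests
// (illustrative only, not the verbatim implementation):
private String cancelWithSavepoint() throws Exception {
    // triggers cancel-with-savepoint on the running job and returns the savepoint path
    return clusterClient.cancelWithSavepoint(
        jobGraph.getJobID(),
        savepointDirectory.toAbsolutePath().toString());
}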
/**
 * Tests that cancel with savepoint fails with a meaningful exception message
 * when no savepoint directory has been configured.
 */
@Test
public void testCancelWithSavepointWithoutConfiguredSavepointDirectory() throws Exception {
    setUpWithCheckpointInterval(10L);

    try {
        clusterClient.cancelWithSavepoint(jobGraph.getJobID(), null);
    } catch (Exception e) {
        if (!ExceptionUtils.findThrowableWithMessage(e, "savepoint directory").isPresent()) {
            throw e;
        }
    }
}
new RestHandlerException("should trigger retry", HttpResponseStatus.SERVICE_UNAVAILABLE),
JobExecutionResultResponseBody.inProgress(),
JobExecutionResultResponseBody.created(new JobResult.Builder()
    .applicationStatus(ApplicationStatus.SUCCEEDED)
    .jobId(jobId)
    .netRuntime(Long.MAX_VALUE)
    .accumulatorResults(Collections.singletonMap("testName", new SerializedValue<>(OptionalFailure.of(1.0))))
    .build()),
JobExecutionResultResponseBody.created(new JobResult.Builder()
    .applicationStatus(ApplicationStatus.FAILED)
    .jobId(jobId)
    .netRuntime(Long.MAX_VALUE)
    .serializedThrowable(new SerializedThrowable(new RuntimeException("expected")))
    .build()));
TestJobExecutionResultHandler testJobExecutionResultHandler =
    new TestJobExecutionResultHandler(
        JobExecutionResultResponseBody.created(new JobResult.Builder()
            .applicationStatus(ApplicationStatus.SUCCEEDED)
            .jobId(jobId)
            .netRuntime(Long.MAX_VALUE)
            .build()));
private void closeResourceManagerConnection(Exception cause) {
    if (establishedResourceManagerConnection != null) {
        dissolveResourceManagerConnection(establishedResourceManagerConnection, cause);
        establishedResourceManagerConnection = null;
    }

    if (resourceManagerConnection != null) {
        // stop a potentially ongoing registration process
        resourceManagerConnection.close();
        resourceManagerConnection = null;
    }
}
@Override
public JobSubmissionResult submitJob(JobGraph jobGraph, ClassLoader classLoader) throws ProgramInvocationException {
    final CompletableFuture<JobSubmissionResult> jobSubmissionResultFuture = submitJob(jobGraph);

    if (isDetached()) {
        try {
            return jobSubmissionResultFuture.get();
        } catch (InterruptedException | ExecutionException e) {
            ExceptionUtils.checkInterrupted(e);
            throw new ProgramInvocationException("Could not run job in detached mode.", jobGraph.getJobID(), e);
        }
    } else {
        final CompletableFuture<JobResult> jobResultFuture = jobSubmissionResultFuture.thenCompose(
            (JobSubmissionResult ignored) -> requestJobResult(jobGraph.getJobID()));

        final JobResult jobResult;
        try {
            jobResult = jobResultFuture.get();
        } catch (InterruptedException | ExecutionException e) {
            ExceptionUtils.checkInterrupted(e);
            throw new ProgramInvocationException("Could not run job", jobGraph.getJobID(), e);
        }

        try {
            return jobResult.toJobExecutionResult(classLoader);
        } catch (JobExecutionException e) {
            throw new ProgramInvocationException("Job failed", jobGraph.getJobID(), e);
        } catch (IOException | ClassNotFoundException e) {
            throw new ProgramInvocationException("Job failed", jobGraph.getJobID(), e);
        }
    }
}
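// A minimal usage sketch of the attached submission path implemented above, assuming an
// already constructed client exposing this submitJob(JobGraph, ClassLoader) method and a
// ready JobGraph; the variable names below are illustrative and not taken from the excerpt.
JobSubmissionResult submissionResult = client.submitJob(jobGraph, userCodeClassLoader);
// In non-detached mode the returned value is the JobExecutionResult produced by
// jobResult.toJobExecutionResult(classLoader) above, so it can be narrowed to access
// the net runtime and accumulators:
if (submissionResult instanceof JobExecutionResult) {
    JobExecutionResult executionResult = (JobExecutionResult) submissionResult;
    System.out.println("Job finished after " + executionResult.getNetRuntime() + " ms");
}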
private void executeSchedulingTest(Configuration configuration) throws Exception {
    configuration.setInteger(RestOptions.PORT, 0);

    final long slotIdleTimeout = 50L;
    configuration.setLong(JobManagerOptions.SLOT_IDLE_TIMEOUT, slotIdleTimeout);

    final int parallelism = 4;
    final MiniClusterConfiguration miniClusterConfiguration = new MiniClusterConfiguration.Builder()
        .setConfiguration(configuration)
        .setNumTaskManagers(parallelism)
        .setNumSlotsPerTaskManager(1)
        .build();

    try (MiniCluster miniCluster = new MiniCluster(miniClusterConfiguration)) {
        miniCluster.start();

        MiniClusterClient miniClusterClient = new MiniClusterClient(configuration, miniCluster);

        JobGraph jobGraph = createJobGraph(slotIdleTimeout << 1, parallelism);

        CompletableFuture<JobSubmissionResult> submissionFuture = miniClusterClient.submitJob(jobGraph);

        // wait for the submission to succeed
        JobSubmissionResult jobSubmissionResult = submissionFuture.get();

        CompletableFuture<JobResult> resultFuture = miniClusterClient.requestJobResult(jobSubmissionResult.getJobID());

        JobResult jobResult = resultFuture.get();

        assertThat(jobResult.getSerializedThrowable().isPresent(), is(false));
    }
}
@Override
public void disconnectResourceManager(
        final ResourceManagerId resourceManagerId,
        final Exception cause) {
    if (isConnectingToResourceManager(resourceManagerId)) {
        reconnectToResourceManager(cause);
    }
}
@Override
public CompletableFuture<ExecutionState> requestPartitionProducerState(
        JobID jobId,
        IntermediateDataSetID resultId,
        ResultPartitionID partitionId) {
    return jobMasterGateway.requestPartitionState(resultId, partitionId);
    }
}
@Override
public void run() {
    resourceManagerResourceID = success.getResourceManagerResourceId();
    establishResourceManagerConnection(success);
}
});
/**
 * Gets the actor gateway that can be used to send messages to the TaskManager.
 *
 * <p>This method should be removed once the new interface-based RPC abstraction is in place.
 *
 * @return The actor gateway that can be used to send messages to the TaskManager.
 */
public TaskManagerGateway getTaskManagerGateway() {
    return slotContext.getTaskManagerGateway();
}
@Override
public boolean triggerCheckpoint(final CheckpointMetaData checkpointMetaData, final CheckpointOptions checkpointOptions) {
    final TaskStateSnapshot checkpointStateHandles = new TaskStateSnapshot();
    checkpointStateHandles.putSubtaskStateByOperatorID(
        OperatorID.fromJobVertexID(getEnvironment().getJobVertexId()),
        new OperatorSubtaskState());

    getEnvironment().acknowledgeCheckpoint(
        checkpointMetaData.getCheckpointId(),
        new CheckpointMetrics(),
        checkpointStateHandles);

    triggerCheckpointLatch.countDown();

    return true;
}
private <T> void deployJobOnNewCluster(
        ClusterDescriptor<T> clusterDescriptor,
        JobGraph jobGraph,
        Result<T> result,
        ClassLoader classLoader) throws Exception {
    ClusterClient<T> clusterClient = null;
    try {
        // deploy job cluster with job attached
        clusterClient = clusterDescriptor.deployJobCluster(context.getClusterSpec(), jobGraph, false);
        // save information about the new cluster
        result.setClusterInformation(clusterClient.getClusterId(), clusterClient.getWebInterfaceURL());
        // get result
        if (awaitJobResult) {
            // we need to hard cast for now
            final JobExecutionResult jobResult = ((RestClusterClient<T>) clusterClient)
                .requestJobResult(jobGraph.getJobID())
                .get()
                .toJobExecutionResult(context.getClassLoader()); // throws exception if job fails
            executionResultBucket.add(jobResult);
        }
    } finally {
        try {
            if (clusterClient != null) {
                clusterClient.shutdown();
            }
        } catch (Exception e) {
            // ignore
        }
    }
}
assertThat(jobResult.getSerializedThrowable().isPresent(), is(false));
} finally {
    if (clusterClient != null) {
@Test
public void testStopJobAfterSavepointWithDeactivatedPeriodicCheckpointing() throws Exception {
    // a checkpoint interval of Long.MAX_VALUE deactivates periodic checkpointing
    setUpWithCheckpointInterval(Long.MAX_VALUE);

    final String savepointLocation = cancelWithSavepoint();
    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);

    assertThat(jobStatus, isOneOf(JobStatus.CANCELED, JobStatus.CANCELLING));

    final List<Path> savepoints;
    try (Stream<Path> savepointFiles = Files.list(savepointDirectory)) {
        savepoints = savepointFiles.map(Path::getFileName).collect(Collectors.toList());
    }
    assertThat(savepoints, hasItem(Paths.get(savepointLocation).getFileName()));
}
this.lastJobExecutionResult = jobResult.toJobExecutionResult(classLoader);
return lastJobExecutionResult;
} catch (JobExecutionException e) {
@Test
public void testDoNotCancelJobIfSavepointFails() throws Exception {
    setUpWithCheckpointInterval(10L);

    try {
        Files.setPosixFilePermissions(savepointDirectory, Collections.emptySet());
    } catch (IOException e) {
        Assume.assumeNoException(e);
    }

    try {
        cancelWithSavepoint();
    } catch (Exception e) {
        assertThat(ExceptionUtils.findThrowable(e, CheckpointTriggerException.class).isPresent(), equalTo(true));
    }

    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);
    assertThat(jobStatus, equalTo(JobStatus.RUNNING));

    // assert that checkpoints continue to be triggered
    triggerCheckpointLatch = new CountDownLatch(1);
    assertThat(triggerCheckpointLatch.await(60L, TimeUnit.SECONDS), equalTo(true));
}