@Override
public void aborted() {
  super.aborted();
  // No container could ever be provisioned for this run; record it as a program error.
  String message = String.format("No containers for %s. Abort the application", programRunId);
  programStateWriterWithHeartBeat.error(new Exception(message));
}
@Override public void containerLaunched(String runnableName, int instanceId, String containerId) { super.containerLaunched(runnableName, instanceId, containerId); if (runningPublished.compareAndSet(false, true)) { // The program is marked as running when the first container for the program is launched programStateWriterWithHeartBeat.running(twillRunId.getId()); } }
@Override
protected Map<String, String> getConfigs() {
  // Start from the parent's configs, then layer the serialized program run id on top.
  Map<String, String> result = new HashMap<>();
  result.putAll(super.getConfigs());
  result.put("programRunId", GSON.toJson(programRunId));
  return result;
}
@Override public void completed() { super.completed(); // On normal AM completion, based on the last container failure to publish the state if (lastContainerFailure == null) { programStateWriterWithHeartBeat.completed(); } else { lastContainerFailure.writeError(programStateWriterWithHeartBeat); } }
@Override public void killed() { super.killed(); // The AM is stopped explicitly, always record the state as killed. programStateWriterWithHeartBeat.killed(); }
@Override public void containerStopped(String runnableName, int instanceId, String containerId, int exitStatus) { super.containerStopped(runnableName, instanceId, containerId, exitStatus); // Let the completed() method handle when a container has completed with no error if (exitStatus == 0) { return; } switch(programRunId.getType()) { case WORKFLOW: case SPARK: case MAPREDUCE: // For workflow, MapReduce, and spark, if there is an error, the program state is failure // We defer the actual publish to one of the completion methods (killed, completed, aborted) // as we need to know under what condition the container failed. lastContainerFailure = new ContainerFailure(runnableName, instanceId, containerId, exitStatus); break; default: // For other programs, the container will be re-launched - the program state will continue to be RUNNING // TODO Workers should be configured via runtime args // to support both retrying on failure, or just failing and not retrying. break; } }
// NOTE(review): this method is truncated in this view — only the delegation to the
// superclass is visible; the remainder of the body lies outside this chunk.
@Override public void initialize(EventHandlerContext context) { super.initialize(context);
@Override public TwillSpecification configure() { // It is always present in cdap-default.xml final long noContainerTimeout = cConf.getLong(Constants.CFG_TWILL_NO_CONTAINER_TIMEOUT, Long.MAX_VALUE); TwillSpecification.Builder.RunnableSetter runnableSetter = addMessaging( addDatasetOpExecutor( addLogSaverService( addTransactionService( addMetricsProcessor ( addMetricsService( TwillSpecification.Builder.with().setName(NAME).withRunnable() ) ) ) ) ) ); if (cConf.getBoolean(Constants.Explore.EXPLORE_ENABLED)) { LOG.info("Adding explore runnable."); runnableSetter = addExploreService(runnableSetter); } else { LOG.info("Explore module disabled - will not launch explore runnable."); } return runnableSetter .withOrder() .begin(Constants.Service.MESSAGING_SERVICE, Constants.Service.TRANSACTION, Constants.Service.DATASET_EXECUTOR) .withEventHandler(new AbortOnTimeoutEventHandler(noContainerTimeout)) .build(); }
@Override public void completed() { super.completed(); // On normal AM completion, based on the last container failure to publish the state if (lastContainerFailure == null) { programStateWriterWithHeartBeat.completed(); } else { lastContainerFailure.writeError(programStateWriterWithHeartBeat); } }
@Override public void killed() { super.killed(); // The AM is stopped explicitly, always record the state as killed. programStateWriterWithHeartBeat.killed(); }
@Override public void containerStopped(String runnableName, int instanceId, String containerId, int exitStatus) { super.containerStopped(runnableName, instanceId, containerId, exitStatus); // Let the completed() method handle when a container has completed with no error if (exitStatus == 0) { return; } switch(programRunId.getType()) { case WORKFLOW: case SPARK: case MAPREDUCE: // For workflow, MapReduce, and spark, if there is an error, the program state is failure // We defer the actual publish to one of the completion methods (killed, completed, aborted) // as we need to know under what condition the container failed. lastContainerFailure = new ContainerFailure(runnableName, instanceId, containerId, exitStatus); break; default: // For other programs, the container will be re-launched - the program state will continue to be RUNNING // TODO Workers should be configured via runtime args // to support both retrying on failure, or just failing and not retrying. break; } }
// NOTE(review): this method is truncated in this view — only the delegation to the
// superclass is visible; the remainder of the body lies outside this chunk.
@Override public void initialize(EventHandlerContext context) { super.initialize(context);
@Override
protected Map<String, String> getConfigs() {
  // Start from the parent's configs, then layer the serialized program run id on top.
  Map<String, String> result = new HashMap<>();
  result.putAll(super.getConfigs());
  result.put("programRunId", GSON.toJson(programRunId));
  return result;
}
@Override
public void aborted() {
  super.aborted();
  // No container could ever be provisioned for this run; record it as a program error.
  String message = String.format("No containers for %s. Abort the application", programRunId);
  programStateWriterWithHeartBeat.error(new Exception(message));
}
@Override public void containerLaunched(String runnableName, int instanceId, String containerId) { super.containerLaunched(runnableName, instanceId, containerId); if (runningPublished.compareAndSet(false, true)) { // The program is marked as running when the first container for the program is launched programStateWriterWithHeartBeat.running(twillRunId.getId()); } }