@SuppressFBWarnings("DM_EXIT") public void sendStatusUpdate(ExecutorDriver driver, Protos.TaskID taskID, Protos.TaskState taskState, String message, Logger logger) { logger.info("Sending status update \"{}\" ({})", message, taskState.name()); message = message.substring(0, Math.min(configuration.getMaxTaskMessageLength(), message.length())); try { final Protos.TaskStatus.Builder builder = Protos.TaskStatus.newBuilder() .setTaskId(taskID) .setState(taskState) .setMessage(message); driver.sendStatusUpdate(builder.build()); } catch (Throwable t) { try { logger.error("Exception while sending status updates, exiting", t); } finally { System.exit(4); } } }
/**
 * Records the provided {@code taskStatus} received from Mesos.
 */
public static void record(Protos.TaskStatus taskStatus) {
    // Metric name will be of the form "task_status.task_running"
    // (the lowercased enum name keeps its "task_" prefix).
    METRICS
            .counter(String.format("task_status.%s", taskStatus.getState().name().toLowerCase()))
            .inc();
}
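// A minimal usage sketch for record(...) above; the task id and state are
// illustrative only, and METRICS is assumed to be a Dropwizard MetricRegistry.
Protos.TaskStatus sampleStatus = Protos.TaskStatus.newBuilder()
        .setTaskId(Protos.TaskID.newBuilder().setValue("node-0__executor__1234"))
        .setState(Protos.TaskState.TASK_RUNNING)
        .build();
record(sampleStatus); // increments the "task_status.task_running" counter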
/** {@inheritDoc} */
@Override
public synchronized void statusUpdate(SchedulerDriver schedulerDriver, Protos.TaskStatus taskStatus) {
    final String taskId = taskStatus.getTaskId().getValue();
    log.log(Level.INFO, "Received update event task: {0} is in state: {1}",
            new Object[]{taskId, taskStatus.getState()});
    if (taskStatus.getState().equals(Protos.TaskState.TASK_FAILED)
            || taskStatus.getState().equals(Protos.TaskState.TASK_ERROR)
            || taskStatus.getState().equals(Protos.TaskState.TASK_FINISHED)
            || taskStatus.getState().equals(Protos.TaskState.TASK_KILLED)
            || taskStatus.getState().equals(Protos.TaskState.TASK_LOST)) {
        IgniteTask failedTask = tasks.remove(taskId);
        if (failedTask != null) {
            // Ask Mesos for replacement resources matching the terminated task's
            // memory and CPU so the node can be relaunched.
            List<Protos.Request> requests = new ArrayList<>();
            Protos.Request request = Protos.Request.newBuilder()
                    .addResources(Protos.Resource.newBuilder()
                            .setType(Protos.Value.Type.SCALAR)
                            .setName(MEM)
                            .setScalar(Protos.Value.Scalar.newBuilder().setValue(failedTask.mem())))
                    .addResources(Protos.Resource.newBuilder()
                            .setType(Protos.Value.Type.SCALAR)
                            .setName(CPU)
                            .setScalar(Protos.Value.Scalar.newBuilder().setValue(failedTask.cpuCores())))
                    .build();
            requests.add(request);
            schedulerDriver.requestResources(requests);
        }
    }
}
// Extract the first reported IP address from the task's container status.
// The trailing getter was truncated in the original; .getIpAddress() is the
// assumed completion, yielding the address as a String.
String replaceIp = status.get().getContainerStatus()
        .getNetworkInfos(0)
        .getIpAddresses(0)
        .getIpAddress();
@Override
public void statusUpdate(final SchedulerDriver schedulerDriver, final Protos.TaskStatus taskStatus) {
    String taskId = taskStatus.getTaskId().getValue();
    TaskContext taskContext = TaskContext.from(taskId);
    String jobName = taskContext.getMetaInfo().getJobName();
    log.trace("call statusUpdate task state is: {}, task id is: {}", taskStatus.getState(), taskId);
    jobEventBus.post(new JobStatusTraceEvent(jobName, taskContext.getId(), taskContext.getSlaveId(),
            JobStatusTraceEvent.Source.CLOUD_SCHEDULER, taskContext.getType(),
            String.valueOf(taskContext.getMetaInfo().getShardingItems()),
            JobStatusTraceEvent.State.valueOf(taskStatus.getState().name()), taskStatus.getMessage()));
    switch (taskStatus.getState()) {
        case TASK_RUNNING:
            // Brace placement restored (it was dropped in the original): the kill
            // applies only when the job configuration no longer exists.
            if (!facadeService.load(jobName).isPresent()) {
                schedulerDriver.killTask(Protos.TaskID.newBuilder().setValue(taskId).build());
            }
            if ("BEGIN".equals(taskStatus.getMessage())) {
                facadeService.updateDaemonStatus(taskContext, false);
            } else if ("COMPLETE".equals(taskStatus.getMessage())) {
                facadeService.updateDaemonStatus(taskContext, true);
                statisticManager.taskRunSuccessfully();
            }
            break;
        case TASK_KILLED:
            log.warn("task id is: {}, status is: {}, message is: {}, source is: {}",
                    taskId, taskStatus.getState(), taskStatus.getMessage(), taskStatus.getSource());
            facadeService.removeRunning(taskContext);
            facadeService.addDaemonJobToReadyQueue(jobName);
            break; // break assumed; the original was truncated here
        case TASK_FAILED:
        case TASK_ERROR:
            log.warn("task id is: {}, status is: {}, message is: {}, source is: {}",
                    taskId, taskStatus.getState(), taskStatus.getMessage(), taskStatus.getSource());
            facadeService.removeRunning(taskContext);
            facadeService.recordFailoverTask(taskContext);
            break;
        default:
            break;
    }
}
@Override
public synchronized void statusUpdate(SchedulerDriver schedulerDriver, Protos.TaskStatus taskStatus) {
    LOG.info("Status update of " + taskStatus.getTaskId().getValue()
            + " to " + taskStatus.getState().name()
            + " with message " + taskStatus.getMessage());
    switch (taskStatus.getState()) {
        case TASK_FINISHED:
        case TASK_FAILED:
            // The loop header was elided in the original; scanning a copy of the
            // tracker set (so entries can be removed mid-iteration) is assumed here.
            for (Object tracker : new HashSet<>(mesosTrackers.keySet())) {
                if (mesosTrackers.get(tracker).taskId.equals(taskStatus.getTaskId())) {
                    LOG.info("Removing terminated TaskTracker: " + tracker);
                    mesosTrackers.get(tracker).stop();
                    mesosTrackers.remove(tracker);
                }
            }
            break;
        default:
            LOG.error("Unexpected TaskStatus: " + taskStatus.getState().name());
            break;
    }
    Meter meter = metrics.taskStateMeter.get(taskStatus.getState());
    if (meter != null) {
        meter.mark();
    }
}
@Override
public void statusUpdate(SchedulerDriver driver, TaskStatus status) {
    TaskID taskId = status.getTaskId();
    LOGGER.fine("Status update: task " + taskId + " is in state " + status.getState()
            + (status.hasMessage() ? " with message '" + status.getMessage() + "'" : ""));
    // The guard condition was elided in the original; a lookup against the
    // scheduler's table of known tasks (assumed to be `results`) restores it.
    if (!results.containsKey(taskId)) {
        LOGGER.fine("Ignoring status update " + status.getState() + " for unknown task " + taskId);
        return;
    }
    boolean terminalState = false; // updated by terminal-state cases that were elided in the original
    switch (status.getState()) {
        case TASK_STAGING:
        case TASK_STARTING:
            break;
        // Cases for running and terminal states were elided in the original.
        default:
            throw new IllegalStateException("Invalid State: " + status.getState());
    }
}
// Track tasks that have not yet reached a terminal state; these still need
// to be reconciled with Mesos.
for (TaskStatus status : taskStatuses) {
    if (!TaskUtils.isTerminal(status)) {
        unreconciled.put(status.getTaskId().getValue(), status);
    }
}
// taskStatus and result (an ECS DescribeTasks response) come from the enclosing
// code, which was elided in the original.
final String ecsTaskId = taskMap.get(taskStatus.getTaskId());
if (ecsTaskId == null) {
    for (final Task task : result.getTasks()) {
        if ("STOPPED".equals(task.getLastStatus())) {
            // Rebuild the last known Mesos status for the stopped ECS task and mark it
            // finished. The inner variable is renamed so it does not collide with the
            // outer taskStatus, and .build() is the assumed end of the truncated chain.
            final Protos.TaskStatus finishedStatus = Protos.TaskStatus.newBuilder()
                    .mergeFrom(ecsTaskToStatus.get(task.getTaskArn()))
                    .setState(Protos.TaskState.TASK_FINISHED)
                    .build();
            // Forwarding of finishedStatus was elided in the original.
        }
    }
}
@Override
public void statusUpdate(final SchedulerDriver driver, final Protos.TaskStatus taskStatus) {
    LOG.log(Level.SEVERE, "Task Status Update: {0}", taskStatus.toString()); // placeholder added so the status is actually logged
    // The builder result was discarded in the original; capturing it is assumed
    // here so the cases below can attach diagnostics.
    final ResourceStatusEventImpl.Builder resourceStatus =
            ResourceStatusEventImpl.newBuilder().setIdentifier(taskStatus.getTaskId().getValue());
    switch (taskStatus.getState()) {
        case TASK_STARTING:
            break;
        case TASK_FINISHED:
            if (taskStatus.getData().toStringUtf8().equals("eval_not_run")) {
                // Handling of this sentinel payload was elided in the original.
                return;
            }
            if (taskStatus.hasMessage()) { // protobuf getters never return null; hasMessage() is the intended check
                resourceStatus.setDiagnostics(taskStatus.getMessage());
            }
            break;
        default:
            break;
    }
}
try { LOGGER.info("Received status update for taskId={} state={} message={} protobuf={}", status.getTaskId().getValue(), status.getState().toString(), status.getMessage(), TextFormat.shortDebugString(status)); Metrics.record(status); if (eligibleToKill) { LOGGER.info("Received status update for unknown task, marking task to be killed: {}", status.getTaskId().getValue()); TaskKiller.killTask(status.getTaskId()); } else { status.getTaskId().getValue());
statusMap.put(status.getTaskId(), status);
// taskSpec, info, and markedPermanentlyFailed come from the enclosing loop,
// which was elided in the original.
if (isEligibleForRecovery(taskSpec.get()) && (isRecoveryNeeded(status) || markedPermanentlyFailed)) {
    LOGGER.info("{} needs recovery with state: {}, goal state: {}, marked permanently failed: {}",
            info.getName(), status.getState(), taskSpec.get().getGoal().name(), markedPermanentlyFailed);
    results.add(info);
}
// goalState, status, taskInfo, and targetConfigId come from the enclosing
// method, which was elided in the original; the switch header is assumed.
switch (goalState) {
    case RUNNING: {
        if (!Protos.TaskState.TASK_RUNNING.equals(status.get().getState())) {
            return false;
        }
        // Additional readiness checks for RUNNING goals were elided in the original.
        return true;
    }
    case FINISH: // case label assumed; the original label was elided
        // A finished task only satisfies this goal if it ran on the current target configuration.
        return Protos.TaskState.TASK_FINISHED.equals(status.get().getState())
                && new TaskLabelReader(taskInfo).getTargetConfiguration().equals(targetConfigId);
    case ONCE:
        return Protos.TaskState.TASK_FINISHED.equals(status.get().getState());
    case UNKNOWN:
    default:
        return false; // fall-through behavior assumed
}
private void updateLogviewerState(TaskStatus status) {
    String taskId = status.getTaskId().getValue();
    if (!taskId.contains(MesosCommon.MESOS_COMPONENT_ID_DELIMITER)) {
        LOG.error("updateLogviewerState: taskId for logviewer, {}, isn't formatted correctly so ignoring task update", taskId);
        return; // early return assumed: a malformed id cannot be mapped to a logviewer
    }
    // Derivation of logviewerZKPath from the taskId was elided in the original.
    switch (status.getState()) {
        case TASK_STAGING:
            checkRunningLogviewerState(logviewerZKPath);
            break; // break assumed; handling of further states was elided in the original
        default:
            // Anything unexpected: kill the logviewer task.
            mesosNimbus._driver.killTask(status.getTaskId());
            break;
    }
}
private static Optional<String> getTaskStateString(
        StateStore stateStore,
        String taskName,
        Optional<Protos.TaskStatus> mesosStatus) {
    GoalStateOverride.Status overrideStatus = stateStore.fetchGoalOverrideStatus(taskName);
    if (!mesosStatus.isPresent()) {
        // This task has never been prepared -- even if its goal state is overridden, it doesn't
        // have a run state.
        return Optional.empty();
    } else if (!GoalStateOverride.Status.INACTIVE.equals(overrideStatus)) {
        // This task is affected by an override. Use the override status as applicable.
        switch (overrideStatus.progress) {
            case COMPLETE:
                return Optional.of(overrideStatus.target.getSerializedName());
            case IN_PROGRESS:
            case PENDING:
                return Optional.of(overrideStatus.target.getTransitioningName());
            default:
                LOGGER.error("Unsupported progress state: {}", overrideStatus.progress);
                return Optional.empty();
        }
    }
    String stateString = mesosStatus.get().getState().toString();
    if (stateString.startsWith("TASK_")) { // should always be the case
        // Trim "TASK_" prefix ("TASK_RUNNING" => "RUNNING"):
        stateString = stateString.substring("TASK_".length());
    }
    return Optional.of(stateString);
}
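// A standalone sketch of the "TASK_" trimming performed at the end of
// getTaskStateString(...) above:
String stateString = Protos.TaskState.TASK_RUNNING.toString(); // "TASK_RUNNING"
if (stateString.startsWith("TASK_")) {
    stateString = stateString.substring("TASK_".length()); // now "RUNNING"
}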
/**
 * Returns whether the provided {@link TaskStatus} has reached a terminal state.
 */
public static boolean isTerminal(Protos.TaskStatus taskStatus) {
    switch (taskStatus.getState()) {
        case TASK_DROPPED:
        case TASK_ERROR:
        case TASK_FAILED:
        case TASK_FINISHED:
        case TASK_GONE: // an agent marked as gone should never come back, therefore this is terminal
        case TASK_KILLED:
            return true;
        case TASK_GONE_BY_OPERATOR: // mesos.proto: "might return to RUNNING in the future"
        case TASK_KILLING:
        case TASK_LOST:
        case TASK_RUNNING:
        case TASK_STAGING:
        case TASK_STARTING:
        case TASK_UNKNOWN: // mesos.proto: "may or may not still be running"
        case TASK_UNREACHABLE:
        default:
            return false;
    }
}
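// A minimal sketch using isTerminal(...) above to collect the tasks that still
// need reconciliation; `taskStatuses` is a hypothetical Collection<Protos.TaskStatus>.
Map<String, Protos.TaskStatus> unreconciled = new HashMap<>();
for (Protos.TaskStatus status : taskStatuses) {
    if (!isTerminal(status)) {
        // Non-terminal tasks may still change state behind the scheduler's back.
        unreconciled.put(status.getTaskId().getValue(), status);
    }
}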
@Override
public void statusUpdate(SchedulerDriver driver, TaskStatus status) {
    TaskID taskId = status.getTaskId();
    LOGGER.log(Level.INFO, "Status update: task {0} is in state {1}",
            new Object[]{taskId, status.getState()});
    if (!results.containsKey(taskId)) {
        throw new IllegalStateException("Unknown taskId: " + taskId);
    }
    Result result = results.get(taskId);
    switch (status.getState()) {
        case TASK_STAGING:
        case TASK_STARTING:
            break;
        case TASK_RUNNING:
            result.result.running(result.slave);
            break;
        case TASK_FINISHED:
            result.result.finished(result.slave);
            break;
        case TASK_FAILED:
        case TASK_KILLED:
        case TASK_LOST:
            result.result.failed(result.slave);
            break;
        default:
            throw new IllegalStateException("Invalid State: " + status.getState());
    }
}
/**
 * Maps the provided status to the service that owns its task, then queries that service with the status.
 *
 * <p>This is an optimization which avoids querying services about task statuses that don't relate to them.
 * <p>In addition to reducing unnecessary queries, this also improves isolation between services. They only see
 * task statuses which relate to them.
 */
@Override
public TaskStatusResponse taskStatus(Protos.TaskStatus status) {
    return multiServiceManager
            .getMatchingService(status)
            .map(x -> x.taskStatus(status))
            .orElseGet(() -> multiServiceManager
                    .getServiceSanitized(frameworkName)
                    .map(x -> {
                        LOGGER.info("Forwarding task status to default service: {}", frameworkName);
                        return x.taskStatus(status);
                    })
                    .orElseGet(() -> {
                        // Unrecognized service. Status for an old task?
                        LOGGER.info("Received status for unknown task {}: {}",
                                status.getTaskId().getValue(), TextFormat.shortDebugString(status));
                        return TaskStatusResponse.unknownTask();
                    })
            );
}
@Override
protected void processStatusUpdate(Protos.TaskStatus status) throws Exception {
    // Store status, then pass status to PlanManager => Plan => Steps
    String taskName = StateStoreUtils.fetchTaskInfo(stateStore, status).getName();

    // StateStore updates:
    // - TaskStatus
    // - Override status (if applicable)
    stateStore.storeStatus(taskName, status);

    // Notify plans of status update:
    planCoordinator.getPlanManagers().forEach(planManager -> planManager.update(status));

    // If the TaskStatus contains an IP address, store it as a property in the StateStore.
    // We expect the TaskStatus to contain an IP address in both host and CNI networking.
    // Currently, we are always _missing_ the IP address on TASK_LOST. We always expect it on TASK_RUNNING.
    if (status.hasContainerStatus()
            && status.getContainerStatus().getNetworkInfosCount() > 0
            && status.getContainerStatus().getNetworkInfosList().stream()
                    .anyMatch(networkInfo -> networkInfo.getIpAddressesCount() > 0)) {
        // Map the TaskStatus to a TaskInfo. The map will throw a StateStoreException if no such TaskInfo exists.
        try {
            StateStoreUtils.storeTaskStatusAsProperty(stateStore, taskName, status);
        } catch (StateStoreException e) {
            logger.warn("Unable to store network info for status update: " + status, e);
        }
    }
}
@Test
public void installAndRecover() throws Exception {
    install();
    Collection<? extends Step> incompleteSteps = getIncompleteSteps();
    assertTrue(incompleteSteps.isEmpty());

    // Kill node-0 and verify the terminal status is recorded.
    final CassandraDaemonTask task = cassandraState.getDaemons().get("node-0");
    scheduler.statusUpdate(driver,
            TestUtils.generateStatus(task.getTaskInfo().getTaskId(), Protos.TaskState.TASK_KILLED));
    Set<Protos.TaskStatus> taskStatuses = cassandraState.getTaskStatuses();
    final Optional<Protos.TaskStatus> first = taskStatuses.stream()
            .filter(status -> status.getTaskId().equals(task.getTaskInfo().getTaskId()))
            .findFirst();
    assertEquals(Protos.TaskState.TASK_KILLED, first.get().getState());

    // Offer replacement resources and verify the offer is accepted and launched.
    final CassandraTask templateTask = cassandraState.get("node-0-task-template").get();
    final Protos.Offer offer = TestUtils.generateReplacementOffer(
            frameworkId.getValue(), task.getTaskInfo(), templateTask.getTaskInfo());
    scheduler.resourceOffers(driver, Arrays.asList(offer));
    Collection<QueuedSchedulerDriver.OfferOperations> offerOps = driver.drainAccepted();
    assertEquals(String.format("expected accepted offer: %s", offer), 1, offerOps.size());
    Collection<Protos.Offer.Operation> ops = offerOps.iterator().next().getOperations();
    launchAll(ops, scheduler, driver);

    // After relaunch, node-0 should be running again.
    taskStatuses = cassandraState.getTaskStatuses();
    final Optional<Protos.TaskStatus> node0Status = taskStatuses.stream()
            .filter(status -> {
                try {
                    return org.apache.mesos.offer.TaskUtils.toTaskName(status.getTaskId())
                            .equals(task.getTaskInfo().getName());
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            })
            .findFirst();
    assertEquals(Protos.TaskState.TASK_RUNNING, node0Status.get().getState());
}