/**
 * Attempts to remove the checkpoint with the given id by delegating to
 * {@code checkpointsInZooKeeper.releaseAndTryRemove} with the path derived from the id.
 *
 * @param checkpointId id of the checkpoint that should be removed
 * @return true if the checkpoint could be removed
 * @throws Exception if resolving the path or the removal attempt fails
 */
private boolean tryRemove(long checkpointId) throws Exception {
    final String checkpointPath = checkpointIdToPath(checkpointId);
    return checkpointsInZooKeeper.releaseAndTryRemove(checkpointPath);
}
/**
 * Loads the completed checkpoint referenced by the given state handle.
 *
 * @param stateHandlePath tuple of the retrievable state handle ({@code f0}) and the path it is
 *                        stored under ({@code f1})
 * @return the checkpoint returned by the state handle
 * @throws FlinkException if retrieving the state fails with a {@link ClassNotFoundException}
 *                        or an {@link IOException}; the original exception is kept as cause
 */
private static CompletedCheckpoint retrieveCompletedCheckpoint(Tuple2<RetrievableStateHandle<CompletedCheckpoint>, String> stateHandlePath) throws FlinkException {
    final long id = pathToCheckpointId(stateHandlePath.f1);

    LOG.info("Trying to retrieve checkpoint {}.", id);

    try {
        return stateHandlePath.f0.retrieveState();
    } catch (ClassNotFoundException cnfe) {
        // The serialized checkpoint references a class that cannot be loaded.
        final String message = "Could not retrieve checkpoint " + id + " from state handle under "
            + stateHandlePath.f1
            + ". This indicates that you are trying to recover from state written by an older Flink version which is not compatible. Try cleaning the state handle store.";
        throw new FlinkException(message, cnfe);
    } catch (IOException ioe) {
        // Reading the state behind the handle failed.
        final String message = "Could not retrieve checkpoint " + id + " from state handle under "
            + stateHandlePath.f1
            + ". This indicates that the retrieved state handle is broken. Try cleaning the state handle store.";
        throw new FlinkException(message, ioe);
    }
}
}
/** * Synchronously writes the new checkpoints to ZooKeeper and asynchronously removes older ones. * * @param checkpoint Completed checkpoint to add. */ @Override public void addCheckpoint(final CompletedCheckpoint checkpoint) throws Exception { checkNotNull(checkpoint, "Checkpoint"); final String path = checkpointIdToPath(checkpoint.getCheckpointID()); // Now add the new one. If it fails, we don't want to loose existing data. checkpointsInZooKeeper.addAndLock(path, checkpoint); completedCheckpoints.addLast(checkpoint); // Everything worked, let's remove a previous checkpoint if necessary. while (completedCheckpoints.size() > maxNumberOfCheckpointsToRetain) { try { removeSubsumed(completedCheckpoints.removeFirst()); } catch (Exception e) { LOG.warn("Failed to subsume the old checkpoint", e); } } LOG.debug("Added {} to {}.", checkpoint, path); }
/** * Synchronously writes the new checkpoints to ZooKeeper and asynchronously removes older ones. * * @param checkpoint Completed checkpoint to add. */ @Override public void addCheckpoint(final CompletedCheckpoint checkpoint) throws Exception { checkNotNull(checkpoint, "Checkpoint"); final String path = checkpointIdToPath(checkpoint.getCheckpointID()); // Now add the new one. If it fails, we don't want to loose existing data. checkpointsInZooKeeper.addAndLock(path, checkpoint); completedCheckpoints.addLast(checkpoint); // Everything worked, let's remove a previous checkpoint if necessary. while (completedCheckpoints.size() > maxNumberOfCheckpointsToRetain) { final CompletedCheckpoint completedCheckpoint = completedCheckpoints.removeFirst(); tryRemoveCompletedCheckpoint(completedCheckpoint, CompletedCheckpoint::discardOnSubsume); } LOG.debug("Added {} to {}.", checkpoint, path); }
return new ZooKeeperCompletedCheckpointStore( maxNumberOfCheckpointsToRetain, client,
completedCheckpoint = retrieveCompletedCheckpoint(checkpointStateHandle); if (completedCheckpoint != null) { retrievedCheckpoints.add(completedCheckpoint);
/**
 * Tries to remove the given checkpoint via {@code tryRemove} and, if that reports success,
 * hands the discard callback for the checkpoint to the executor.
 *
 * <p>This method is best effort: any exception raised while removing the checkpoint or while
 * submitting the callback is logged at WARN level and not propagated. Exceptions thrown by
 * the callback itself are logged from within the submitted task.
 *
 * @param completedCheckpoint checkpoint to remove
 * @param discardCallback action that discards the checkpoint's state; only run if the
 *                        checkpoint could be removed
 */
private void tryRemoveCompletedCheckpoint(CompletedCheckpoint completedCheckpoint, ThrowingConsumer<CompletedCheckpoint, Exception> discardCallback) {
    try {
        if (tryRemove(completedCheckpoint.getCheckpointID())) {
            executor.execute(() -> {
                try {
                    discardCallback.accept(completedCheckpoint);
                } catch (Exception e) {
                    LOG.warn("Could not discard completed checkpoint {}.", completedCheckpoint.getCheckpointID(), e);
                }
            });
        }
    } catch (Exception e) {
        // This helper serves both subsumption and shutdown, so the message must not claim
        // a subsumption; it also names the affected checkpoint to make the log actionable.
        LOG.warn("Failed to remove completed checkpoint {}.", completedCheckpoint, e);
    }
}
@Override public void shutdown(JobStatus jobStatus) throws Exception { if (jobStatus.isGloballyTerminalState()) { LOG.info("Shutting down"); for (CompletedCheckpoint checkpoint : completedCheckpoints) { tryRemoveCompletedCheckpoint( checkpoint, completedCheckpoint -> completedCheckpoint.discardOnShutdown(jobStatus)); } completedCheckpoints.clear(); String path = "/" + client.getNamespace(); LOG.info("Removing {} from ZooKeeper", path); ZKPaths.deleteChildren(client.getZookeeperClient().getZooKeeper(), path, true); } else { LOG.info("Suspending"); // Clear the local handles, but don't remove any state completedCheckpoints.clear(); // Release the state handle locks in ZooKeeper such that they can be deleted checkpointsInZooKeeper.releaseAll(); } }
@Override public void shutdown(JobStatus jobStatus) throws Exception { if (jobStatus.isGloballyTerminalState()) { LOG.info("Shutting down"); for (CompletedCheckpoint checkpoint : completedCheckpoints) { try { removeShutdown(checkpoint, jobStatus); } catch (Exception e) { LOG.error("Failed to discard checkpoint.", e); } } completedCheckpoints.clear(); String path = "/" + client.getNamespace(); LOG.info("Removing {} from ZooKeeper", path); ZKPaths.deleteChildren(client.getZookeeperClient().getZooKeeper(), path, true); } else { LOG.info("Suspending"); // Clear the local handles, but don't remove any state completedCheckpoints.clear(); // Release the state handle locks in ZooKeeper such that they can be deleted checkpointsInZooKeeper.releaseAll(); } }
/** * Synchronously writes the new checkpoints to ZooKeeper and asynchronously removes older ones. * * @param checkpoint Completed checkpoint to add. */ @Override public void addCheckpoint(final CompletedCheckpoint checkpoint) throws Exception { checkNotNull(checkpoint, "Checkpoint"); final String path = checkpointIdToPath(checkpoint.getCheckpointID()); // Now add the new one. If it fails, we don't want to loose existing data. checkpointsInZooKeeper.addAndLock(path, checkpoint); completedCheckpoints.addLast(checkpoint); // Everything worked, let's remove a previous checkpoint if necessary. while (completedCheckpoints.size() > maxNumberOfCheckpointsToRetain) { final CompletedCheckpoint completedCheckpoint = completedCheckpoints.removeFirst(); tryRemoveCompletedCheckpoint(completedCheckpoint, CompletedCheckpoint::discardOnSubsume); } LOG.debug("Added {} to {}.", checkpoint, path); }
return new ZooKeeperCompletedCheckpointStore( maxNumberOfCheckpointsToRetain, client,
completedCheckpoint = retrieveCompletedCheckpoint(checkpointStateHandle); if (completedCheckpoint != null) { retrievedCheckpoints.add(completedCheckpoint);
/**
 * Tries to remove the given checkpoint via {@code tryRemove} and, if that reports success,
 * hands the discard callback for the checkpoint to the executor.
 *
 * <p>This method is best effort: any exception raised while removing the checkpoint or while
 * submitting the callback is logged at WARN level and not propagated. Exceptions thrown by
 * the callback itself are logged from within the submitted task.
 *
 * @param completedCheckpoint checkpoint to remove
 * @param discardCallback action that discards the checkpoint's state; only run if the
 *                        checkpoint could be removed
 */
private void tryRemoveCompletedCheckpoint(CompletedCheckpoint completedCheckpoint, ThrowingConsumer<CompletedCheckpoint, Exception> discardCallback) {
    try {
        if (tryRemove(completedCheckpoint.getCheckpointID())) {
            executor.execute(() -> {
                try {
                    discardCallback.accept(completedCheckpoint);
                } catch (Exception e) {
                    LOG.warn("Could not discard completed checkpoint {}.", completedCheckpoint.getCheckpointID(), e);
                }
            });
        }
    } catch (Exception e) {
        // This helper serves both subsumption and shutdown, so the message must not claim
        // a subsumption; it also names the affected checkpoint to make the log actionable.
        LOG.warn("Failed to remove completed checkpoint {}.", completedCheckpoint, e);
    }
}
@Override public void shutdown(JobStatus jobStatus) throws Exception { if (jobStatus.isGloballyTerminalState()) { LOG.info("Shutting down"); for (CompletedCheckpoint checkpoint : completedCheckpoints) { tryRemoveCompletedCheckpoint( checkpoint, completedCheckpoint -> completedCheckpoint.discardOnShutdown(jobStatus)); } completedCheckpoints.clear(); String path = "/" + client.getNamespace(); LOG.info("Removing {} from ZooKeeper", path); ZKPaths.deleteChildren(client.getZookeeperClient().getZooKeeper(), path, true); } else { LOG.info("Suspending"); // Clear the local handles, but don't remove any state completedCheckpoints.clear(); // Release the state handle locks in ZooKeeper such that they can be deleted checkpointsInZooKeeper.releaseAll(); } }
@Override public void shutdown(JobStatus jobStatus) throws Exception { if (jobStatus.isGloballyTerminalState()) { LOG.info("Shutting down"); for (CompletedCheckpoint checkpoint : completedCheckpoints) { try { removeShutdown(checkpoint, jobStatus); } catch (Exception e) { LOG.error("Failed to discard checkpoint.", e); } } completedCheckpoints.clear(); String path = "/" + client.getNamespace(); LOG.info("Removing {} from ZooKeeper", path); ZKPaths.deleteChildren(client.getZookeeperClient().getZooKeeper(), path, true); } else { LOG.info("Suspending"); // Clear the local handles, but don't remove any state completedCheckpoints.clear(); // Release the state handle locks in ZooKeeper such that they can be deleted checkpointsInZooKeeper.releaseAll(); } }
/**
 * Attempts to remove the checkpoint with the given id by delegating to
 * {@code checkpointsInZooKeeper.releaseAndTryRemove} with the path derived from the id.
 *
 * @param checkpointId id of the checkpoint that should be removed
 * @return true if the checkpoint could be removed
 * @throws Exception if resolving the path or the removal attempt fails
 */
private boolean tryRemove(long checkpointId) throws Exception {
    final String checkpointPath = checkpointIdToPath(checkpointId);
    return checkpointsInZooKeeper.releaseAndTryRemove(checkpointPath);
}
/** * Synchronously writes the new checkpoints to ZooKeeper and asynchronously removes older ones. * * @param checkpoint Completed checkpoint to add. */ @Override public void addCheckpoint(final CompletedCheckpoint checkpoint) throws Exception { checkNotNull(checkpoint, "Checkpoint"); final String path = checkpointIdToPath(checkpoint.getCheckpointID()); // Now add the new one. If it fails, we don't want to loose existing data. checkpointsInZooKeeper.addAndLock(path, checkpoint); completedCheckpoints.addLast(checkpoint); // Everything worked, let's remove a previous checkpoint if necessary. while (completedCheckpoints.size() > maxNumberOfCheckpointsToRetain) { try { removeSubsumed(completedCheckpoints.removeFirst()); } catch (Exception e) { LOG.warn("Failed to subsume the old checkpoint", e); } } LOG.debug("Added {} to {}.", checkpoint, path); }
/**
 * Loads the completed checkpoint referenced by the given state handle.
 *
 * @param stateHandlePath tuple of the retrievable state handle ({@code f0}) and the path it is
 *                        stored under ({@code f1})
 * @return the checkpoint returned by the state handle
 * @throws FlinkException if retrieving the state fails with a {@link ClassNotFoundException}
 *                        or an {@link IOException}; the original exception is kept as cause
 */
private static CompletedCheckpoint retrieveCompletedCheckpoint(Tuple2<RetrievableStateHandle<CompletedCheckpoint>, String> stateHandlePath) throws FlinkException {
    final long id = pathToCheckpointId(stateHandlePath.f1);

    LOG.info("Trying to retrieve checkpoint {}.", id);

    try {
        return stateHandlePath.f0.retrieveState();
    } catch (ClassNotFoundException cnfe) {
        // The serialized checkpoint references a class that cannot be loaded.
        final String message = "Could not retrieve checkpoint " + id + " from state handle under "
            + stateHandlePath.f1
            + ". This indicates that you are trying to recover from state written by an older Flink version which is not compatible. Try cleaning the state handle store.";
        throw new FlinkException(message, cnfe);
    } catch (IOException ioe) {
        // Reading the state behind the handle failed.
        final String message = "Could not retrieve checkpoint " + id + " from state handle under "
            + stateHandlePath.f1
            + ". This indicates that the retrieved state handle is broken. Try cleaning the state handle store.";
        throw new FlinkException(message, ioe);
    }
}
}
return new ZooKeeperCompletedCheckpointStore( maxNumberOfCheckpointsToRetain, client,
completedCheckpoint = retrieveCompletedCheckpoint(checkpointStateHandle); if (completedCheckpoint != null) { retrievedCheckpoints.add(completedCheckpoint);