@Override
public void continueOrPropagate(RetryableException ex) {
    ExceptionRetryBehaviour retryBehaviour = ExceptionRetryBehaviour.getRetryBehaviourForException(ex);
    synchronized (this) {
        // Only fail over if this failure was to the current server.
        // This means that no one on another thread has failed us over already.
        if (mostRecentServerIndex.get() != null && mostRecentServerIndex.get() == failoverCount.get()) {
            long failures = failuresSinceLastSwitch.incrementAndGet();
            if (shouldSwitchNode(retryBehaviour, failures)) {
                failoverToNextNode(retryBehaviour);
            } else if (retryBehaviour.shouldRetryInfinitelyManyTimes()) {
                failuresSinceLastSwitch.set(0);
            }
        }
    }

    checkAndHandleFailure(ex);
    if (retryBehaviour.shouldBackoffAndTryOtherNodes()) {
        int numFailovers = failoverCount.get();
        if (numFailovers > 0 && numFailovers % servers.size() == 0) {
            // We add jitter around the expected value of BACKOFF_BEFORE_ROUND_ROBIN_RETRY_MILLIS.
            // Even though this is not exponential backoff, it should be enough to avoid a
            // thundering herd problem.
            long pauseTimeWithJitter = ThreadLocalRandom.current()
                    .nextLong(BACKOFF_BEFORE_ROUND_ROBIN_RETRY_MILLIS / 2,
                            (BACKOFF_BEFORE_ROUND_ROBIN_RETRY_MILLIS * 3) / 2);
            pauseForBackoff(ex, pauseTimeWithJitter);
        }
    } else {
        pauseForBackoff(ex);
    }
}
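// A minimal sketch of the failover state that continueOrPropagate manipulates. The real
// declarations live elsewhere in the class, so the types below are assumptions inferred
// from the calls above (checkAndHandleFailure and pauseForBackoff are likewise not shown).
private final List<String> servers;        // nodes we round-robin across; assigned in the constructor (not shown)
private final int failuresBeforeSwitching; // threshold consumed by shouldSwitchNode below

private final AtomicInteger failoverCount = new AtomicInteger();     // bumped on every node switch
private final AtomicLong failuresSinceLastSwitch = new AtomicLong(); // consecutive failures on the current node
private final AtomicInteger numSwitches = new AtomicInteger();       // switches since the last successful call
private final AtomicLong startTimeOfFastFailover = new AtomicLong(); // wall-clock start of a fast-failover round, 0 if none

// Tracks the failoverCount value this thread last routed a request with, so that a thread
// only triggers a failover for a failure against the *current* node.
private final ThreadLocal<Integer> mostRecentServerIndex = new ThreadLocal<>();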
private static boolean isCausedByBlockingTimeout(RetryableException retryableException) {
    return retryableException.getCause() instanceof AtlasDbRemoteException
            && getCausingErrorName(retryableException).equals(BlockingTimeoutException.class.getName());
}
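// Hypothetical sketch of getCausingErrorName, which isCausedByBlockingTimeout relies on;
// the real helper is not shown in this section. This assumes AtlasDbRemoteException exposes
// the error name serialized by the remote server.
private static String getCausingErrorName(RetryableException retryableException) {
    return ((AtlasDbRemoteException) retryableException.getCause()).getErrorName();
}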
public static ExceptionRetryBehaviour getRetryBehaviourForException(RetryableException retryableException) {
    if (isCausedByBlockingTimeout(retryableException)) {
        // The request failed because it blocked too long on a lock. The node is still the
        // leader, so we want to try again on the same node.
        return RETRY_INDEFINITELY_ON_SAME_NODE;
    }
    if (retryableException.retryAfter() != null) {
        // The server returned a 503. It does this when it wants fast failover, because it
        // is not the leader or is shutting down.
        return RETRY_ON_OTHER_NODE;
    }
    // We failed due to networking or another IOException.
    return RETRY_ON_SAME_NODE;
}
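// A sketch of the ExceptionRetryBehaviour enum implied by the factory above (which would sit
// inside it, given how the tests call it). The actual declaration is not part of this section,
// but the two flag values per constant are pinned down by the unit tests below.
public enum ExceptionRetryBehaviour {
    RETRY_ON_OTHER_NODE(true, true),
    RETRY_INDEFINITELY_ON_SAME_NODE(false, true),
    RETRY_ON_SAME_NODE(false, false);

    private final boolean shouldBackoffAndTryOtherNodes;
    private final boolean shouldRetryInfinitelyManyTimes;

    ExceptionRetryBehaviour(boolean shouldBackoffAndTryOtherNodes, boolean shouldRetryInfinitelyManyTimes) {
        this.shouldBackoffAndTryOtherNodes = shouldBackoffAndTryOtherNodes;
        this.shouldRetryInfinitelyManyTimes = shouldRetryInfinitelyManyTimes;
    }

    public boolean shouldBackoffAndTryOtherNodes() {
        return shouldBackoffAndTryOtherNodes;
    }

    public boolean shouldRetryInfinitelyManyTimes() {
        return shouldRetryInfinitelyManyTimes;
    }
}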
private boolean shouldSwitchNode(ExceptionRetryBehaviour retryBehaviour, long failures) {
    return retryBehaviour.shouldBackoffAndTryOtherNodes()
            || (!retryBehaviour.shouldRetryInfinitelyManyTimes() && failures >= failuresBeforeSwitching);
}
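// Spelled out, shouldSwitchNode resolves each behaviour as follows (derived from the
// enum flags above):
//   RETRY_ON_OTHER_NODE             -> always switch (shouldBackoffAndTryOtherNodes() is true)
//   RETRY_INDEFINITELY_ON_SAME_NODE -> never switch (retries infinitely on the current node)
//   RETRY_ON_SAME_NODE              -> switch once failures reaches failuresBeforeSwitching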
@Test
public void shouldRetryIndefinitelyOnSameNodeOnBlockingTimeoutExceptionWithoutRetryAfter() {
    RetryableException exception =
            createRetryableExceptionWithGenericMessage(REMOTE_BLOCKING_TIMEOUT_EXCEPTION, null);
    assertThat(ExceptionRetryBehaviour.getRetryBehaviourForException(exception))
            .isEqualTo(ExceptionRetryBehaviour.RETRY_INDEFINITELY_ON_SAME_NODE);
}
private void failoverToNextNode(ExceptionRetryBehaviour retryBehaviour) {
    if (retryBehaviour.shouldBackoffAndTryOtherNodes()) {
        // We did talk to a node successfully; it was shutting down, but other nodes are
        // available, so we shouldn't keep growing the backoff.
        numSwitches.set(0);
        startTimeOfFastFailover.compareAndSet(0, System.currentTimeMillis());
    } else {
        numSwitches.incrementAndGet();
        startTimeOfFastFailover.set(0);
    }
    failuresSinceLastSwitch.set(0);
    failoverCount.incrementAndGet();
}
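// Hypothetical sketch of the pauseForBackoff overloads that continueOrPropagate calls; the
// real implementation is not in this section. One plausible shape: exponential backoff driven
// by numSwitches, with assumed BASE_BACKOFF_MILLIS and MAX_BACKOFF_MILLIS constants.
private void pauseForBackoff(RetryableException ex) {
    long pauseMillis = Math.min(MAX_BACKOFF_MILLIS,
            BASE_BACKOFF_MILLIS * (1L << Math.min(numSwitches.get(), 10)));
    pauseForBackoff(ex, pauseMillis);
}

private void pauseForBackoff(RetryableException ex, long pauseMillis) {
    try {
        Thread.sleep(pauseMillis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw ex; // stop retrying if we are interrupted mid-backoff
    }
}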
@Test
public void retryIndefinitelyOnSameNodeShouldRetryInfinitelyManyTimes() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_INDEFINITELY_ON_SAME_NODE;
    assertThat(behaviour.shouldRetryInfinitelyManyTimes()).isTrue();
}
@Test
public void shouldRetryOnSameNodeOnServiceNotAvailableExceptionWithoutRetryAfter() {
    RetryableException exception =
            createRetryableExceptionWithGenericMessage(SERVICE_NOT_AVAILABLE_EXCEPTION, null);
    assertThat(ExceptionRetryBehaviour.getRetryBehaviourForException(exception))
            .isEqualTo(ExceptionRetryBehaviour.RETRY_ON_SAME_NODE);
}

@Test
public void retryOnOtherNodesShouldRetryOnOtherNodes() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_ON_OTHER_NODE;
    assertThat(behaviour.shouldBackoffAndTryOtherNodes()).isTrue();
}

@Test
public void retryOnOtherNodesShouldRetryInfinitelyManyTimes() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_ON_OTHER_NODE;
    assertThat(behaviour.shouldRetryInfinitelyManyTimes()).isTrue();
}
@Test
public void shouldRetryOnOtherNodesOnServiceNotAvailableExceptionWithRetryAfter() {
    RetryableException exception =
            createRetryableExceptionWithGenericMessage(SERVICE_NOT_AVAILABLE_EXCEPTION, DATE);
    assertThat(ExceptionRetryBehaviour.getRetryBehaviourForException(exception))
            .isEqualTo(ExceptionRetryBehaviour.RETRY_ON_OTHER_NODE);
}

@Test
public void retryOnSameNodeShouldRetryOnSameNode() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_ON_SAME_NODE;
    assertThat(behaviour.shouldBackoffAndTryOtherNodes()).isFalse();
}

@Test
public void retryOnSameNodeShouldRetryFinitelyManyTimes() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_ON_SAME_NODE;
    assertThat(behaviour.shouldRetryInfinitelyManyTimes()).isFalse();
}
@Test
public void shouldRetryIndefinitelyOnSameNodeOnBlockingTimeoutExceptionWithRetryAfter() {
    RetryableException exception =
            createRetryableExceptionWithGenericMessage(REMOTE_BLOCKING_TIMEOUT_EXCEPTION, DATE);
    assertThat(ExceptionRetryBehaviour.getRetryBehaviourForException(exception))
            .isEqualTo(ExceptionRetryBehaviour.RETRY_INDEFINITELY_ON_SAME_NODE);
}

@Test
public void retryIndefinitelyOnSameNodeShouldRetryOnSameNode() {
    ExceptionRetryBehaviour behaviour = ExceptionRetryBehaviour.RETRY_INDEFINITELY_ON_SAME_NODE;
    assertThat(behaviour.shouldBackoffAndTryOtherNodes()).isFalse();
}

@Test
public void shouldRetryOnSameNodeOnUnknownCauseRetryableExceptionWithoutRetryAfter() {
    RetryableException exception = createRetryableExceptionWithGenericMessage(null, null);
    assertThat(ExceptionRetryBehaviour.getRetryBehaviourForException(exception))
            .isEqualTo(ExceptionRetryBehaviour.RETRY_ON_SAME_NODE);
}
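// A sketch of the helper the tests above rely on; its real definition, and the
// REMOTE_BLOCKING_TIMEOUT_EXCEPTION, SERVICE_NOT_AVAILABLE_EXCEPTION and DATE fixtures, are
// not part of this section. This assumes Feign's RetryableException(String, Throwable, Date)
// constructor and a placeholder message string.
private static RetryableException createRetryableExceptionWithGenericMessage(
        Exception cause, Date retryAfter) {
    return new RetryableException("generic message", cause, retryAfter);
}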