Retry if hit gemini rate limit. Return friendly message if retries fail

Although we had handling in place to retry with Gemini's suggested backoff
after hitting rate limits, the actual rate limit exception was being caught
to render the friendly message, so the retry wasn't actually getting
triggered.

This change enables both:
- Retry on hitting 429 rate limit exceptions
- Return friendly message if rate limit triggered retry eventually fails

Related:
- Changes to retry with gemini suggested backoff time in 0f953f9
This commit is contained in:
Debanjum
2025-08-14 17:57:41 -07:00
parent 4274f58dbd
commit 3eb8cce984

View File

@@ -72,10 +72,52 @@ SAFETY_SETTINGS = [
]
class GeminiRetryableClientError(Exception):
    """Wrapper for retryable Gemini client errors that should surface a friendly message if retries exhaust.

    Keeps a reference to the wrapped exception and a fallback `response_text`
    that the retry-exhaustion callback returns to the user.
    """

    def __init__(self, original: gerrors.ClientError, response_text: str):
        # Reuse the wrapped error's string form as this exception's message.
        super().__init__(str(original))
        self.original = original
        self.response_text = response_text
        # Mirror the wrapped error's HTTP status code (if any) so existing
        # retry predicate logic can still inspect `.code` on this wrapper.
        self.code = getattr(original, "code", None)
def _gemini_retry_error_callback(retry_state: RetryCallState):
    """Build a graceful fallback ResponseWithThought once all retry attempts fail.

    Tenacity invokes this callback when its stop condition is reached and
    reraise=False. If the final failure is our GeminiRetryableClientError
    wrapper, return a ResponseWithThought carrying the stored friendly message;
    any other exception is propagated to the caller unchanged.
    """
    outcome = retry_state.outcome
    exc = outcome.exception() if outcome else None
    if not isinstance(exc, GeminiRetryableClientError):
        # Not our wrapped rate-limit error: re-raise for the caller to handle.
        raise exc
    # Inspect the original call's keyword arguments to optionally record a trace.
    kwargs = retry_state.kwargs or {}
    messages = kwargs.get("messages")
    tracer = kwargs.get("tracer", {})
    if tracer is not None:
        tracer["chat_model"] = kwargs.get("model_name")
        tracer["temperature"] = kwargs.get("temperature")
    if messages and is_promptrace_enabled():
        try:
            commit_conversation_trace(messages, exc.response_text, tracer or {})
        except Exception:
            # Tracing is best-effort; never let it mask the fallback response.
            logger.debug("Failed to commit conversation trace on retry exhaustion", exc_info=True)
    return ResponseWithThought(text=exc.response_text, thought=None, raw_content=[])
def _is_retryable_error(exception: BaseException) -> bool:
"""Check if the exception is a retryable error"""
# server errors
if isinstance(exception, (gerrors.APIError, gerrors.ClientError)):
if isinstance(exception, (gerrors.APIError, gerrors.ClientError, GeminiRetryableClientError)):
return exception.code in [429, 502, 503, 504]
# client errors
if (
@@ -134,7 +176,8 @@ def _wait_with_gemini_delay(min_wait=4, max_wait=120, multiplier=1, fallback_wai
wait=_wait_with_gemini_delay(min_wait=1, max_wait=10, fallback_wait=wait_random_exponential(min=1, max=10)),
stop=stop_after_attempt(2),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
reraise=False,
retry_error_callback=_gemini_retry_error_callback,
)
def gemini_completion_with_backoff(
messages: list[ChatMessage],
@@ -210,15 +253,17 @@ def gemini_completion_with_backoff(
)
except gerrors.ClientError as e:
response = None
# Handle 429 rate limit errors directly
# For 429 rate-limit errors, raise wrapped exception so tenacity can retry.
if e.code == 429:
# Prepare friendly message for eventual exhaustion
response_text = "My brain is exhausted. Can you please try again in a bit?"
# Log the full error details for debugging
logger.error(f"Gemini ClientError: {e.code} {e.status}. Details: {e.details}")
# Handle other errors
logger.warning(f"Retryable Gemini ClientError: {e.code} {e.status}. Details: {e.details}")
# Raise wrapped so our retry callback can produce final ResponseWithThought
raise GeminiRetryableClientError(e, response_text)
# Handle non-retryable client errors
else:
# Respond with reason for stopping
response_text, _ = handle_gemini_response(e.args)
# Respond with reason for stopping
logger.warning(
f"LLM Response Prevented for {model_name}: {response_text}.\n"
+ f"Last Message by {messages[-1].role}: {messages[-1].content}"
@@ -240,7 +285,9 @@ def gemini_completion_with_backoff(
# Validate the response. If empty, raise an error to retry.
if is_none_or_empty(response_text):
logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
logger.warning(
f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}. Retry."
)
raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
# Save conversation trace
@@ -373,14 +420,14 @@ def handle_gemini_response(
elif isinstance(candidates[0], str):
message = candidates[0]
stopped = True
# Check if the response was blocked due to safety concerns with the generated content
elif candidates[0].finish_reason == gtypes.FinishReason.SAFETY:
message = generate_safety_response(candidates[0].safety_ratings)
stopped = True
# Check if finish reason is empty, therefore generation is in progress
elif not candidates[0].finish_reason:
message = None
stopped = False
# Check if the response was blocked due to safety concerns with the generated content
elif candidates[0].finish_reason == gtypes.FinishReason.SAFETY:
message = generate_safety_response(candidates[0].safety_ratings)
stopped = True
# Check if the response was stopped due to reaching maximum token limit or other reasons
elif candidates[0].finish_reason != gtypes.FinishReason.STOP:
message = f"\nI can't talk further about that because of **{candidates[0].finish_reason.name} issue.**"