From 3eb8cce984c547f699f4cb5f29fdc8124abe3287 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Thu, 14 Aug 2025 17:57:41 -0700
Subject: [PATCH] Retry if hit gemini rate limit. Return friendly message if
 retries fail

Although we had handling in place to retry after the Gemini-suggested
backoff on hitting rate limits, the actual rate limit exception was
being caught to render a friendly message, so the retry was never
actually triggered.

This change allows both:
- Retrying on 429 rate limit exceptions
- Returning a friendly message if the rate-limit-triggered retry eventually fails

Related:
- Changes to retry with gemini suggested backoff time in 0f953f9
---
 .../processor/conversation/google/utils.py    | 71 +++++++++++++++----
 1 file changed, 59 insertions(+), 12 deletions(-)

diff --git a/src/khoj/processor/conversation/google/utils.py b/src/khoj/processor/conversation/google/utils.py
index 92f8cce7..d3776319 100644
--- a/src/khoj/processor/conversation/google/utils.py
+++ b/src/khoj/processor/conversation/google/utils.py
@@ -72,10 +72,52 @@ SAFETY_SETTINGS = [
 ]
 
 
+class GeminiRetryableClientError(Exception):
+    """Wrap a retryable Gemini client error so a friendly message can be returned once retries are exhausted.
+
+    Carries the original exception plus the fallback `response_text` to return after the final failed attempt.
+    """
+
+    def __init__(self, original: gerrors.ClientError, response_text: str):
+        super().__init__(str(original))
+        self.original = original
+        self.response_text = response_text
+        # Mirror the original's `code` (None if absent) so `_is_retryable_error` can check `exception.code`
+        self.code = getattr(original, "code", None)
+
+
+def _gemini_retry_error_callback(retry_state: RetryCallState):
+    """Build the graceful fallback ResponseWithThought returned after all retry attempts fail.
+
+    Wired in as `retry_error_callback`, with reraise=False, on `gemini_completion_with_backoff`'s retry settings.
+    Returns the friendly message carried by a GeminiRetryableClientError; raises any other exception as is.
+    """
+    exc = retry_state.outcome.exception() if retry_state.outcome else None
+    if isinstance(exc, GeminiRetryableClientError):
+        # Only keyword arguments of the original call are inspected (not positional ones) to record a trace
+        kwargs = retry_state.kwargs or {}
+        messages = kwargs.get("messages")
+        tracer = kwargs.get("tracer", {})
+        model_name = kwargs.get("model_name")
+        temperature = kwargs.get("temperature")
+        if tracer is not None:  # None only when the caller explicitly passed tracer=None
+            tracer["chat_model"] = model_name
+            tracer["temperature"] = temperature
+        if messages and is_promptrace_enabled():
+            try:
+                commit_conversation_trace(messages, exc.response_text, tracer or {})
+            except Exception:
+                logger.debug("Failed to commit conversation trace on retry exhaustion", exc_info=True)
+        return ResponseWithThought(text=exc.response_text, thought=None, raw_content=[])
+    else:
+        # Any other failure is raised from this callback so it reaches the caller of the decorated function
+        raise exc  # NOTE(review): exc is None if outcome was unset; `raise None` is a TypeError
+
+
 def _is_retryable_error(exception: BaseException) -> bool:
     """Check if the exception is a retryable error"""
     # server errors
-    if isinstance(exception, (gerrors.APIError, gerrors.ClientError)):
+    if isinstance(exception, (gerrors.APIError, gerrors.ClientError, GeminiRetryableClientError)):
         return exception.code in [429, 502, 503, 504]
     # client errors
     if (
@@ -134,7 +176,8 @@ def _wait_with_gemini_delay(min_wait=4, max_wait=120, multiplier=1, fallback_wai
     wait=_wait_with_gemini_delay(min_wait=1, max_wait=10, fallback_wait=wait_random_exponential(min=1, max=10)),
     stop=stop_after_attempt(2),
     before_sleep=before_sleep_log(logger, logging.DEBUG),
-    reraise=True,
+    reraise=False,
+    retry_error_callback=_gemini_retry_error_callback,
 )
 def gemini_completion_with_backoff(
     messages: list[ChatMessage],
@@ -210,15 +253,17 @@ def gemini_completion_with_backoff(
         )
     except gerrors.ClientError as e:
         response = None
-        # Handle 429 rate limit errors directly
+        # For 429 rate-limit errors, raise wrapped exception so tenacity can retry.
         if e.code == 429:
+            # Prepare friendly message for eventual exhaustion
             response_text = "My brain is exhausted. Can you please try again in a bit?"
-            # Log the full error details for debugging
-            logger.error(f"Gemini ClientError: {e.code} {e.status}. Details: {e.details}")
-        # Handle other errors
+            logger.warning(f"Retryable Gemini ClientError: {e.code} {e.status}. Details: {e.details}")
+            # Raise wrapped so our retry callback can produce final ResponseWithThought
+            raise GeminiRetryableClientError(e, response_text)
+        # Handle non-retryable client errors
         else:
+            # Respond with reason for stopping
             response_text, _ = handle_gemini_response(e.args)
-        # Respond with reason for stopping
         logger.warning(
             f"LLM Response Prevented for {model_name}: {response_text}.\n"
             + f"Last Message by {messages[-1].role}: {messages[-1].content}"
@@ -240,7 +285,9 @@ def gemini_completion_with_backoff(
 
     # Validate the response. If empty, raise an error to retry.
     if is_none_or_empty(response_text):
-        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        logger.warning(
+            f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}. Retry."
+        )
         raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
 
     # Save conversation trace
@@ -373,14 +420,14 @@ def handle_gemini_response(
     elif isinstance(candidates[0], str):
         message = candidates[0]
         stopped = True
-    # Check if the response was blocked due to safety concerns with the generated content
-    elif candidates[0].finish_reason == gtypes.FinishReason.SAFETY:
-        message = generate_safety_response(candidates[0].safety_ratings)
-        stopped = True
     # Check if finish reason is empty, therefore generation is in progress
     elif not candidates[0].finish_reason:
         message = None
         stopped = False
+    # Check if the response was blocked due to safety concerns with the generated content
+    elif candidates[0].finish_reason == gtypes.FinishReason.SAFETY:
+        message = generate_safety_response(candidates[0].safety_ratings)
+        stopped = True
     # Check if the response was stopped due to reaching maximum token limit or other reasons
     elif candidates[0].finish_reason != gtypes.FinishReason.STOP:
         message = f"\nI can't talk further about that because of **{candidates[0].finish_reason.name} issue.**"