From 20f08ca564e48c71b1ab0fa4e98bb44a64ff50b5 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 17 May 2025 16:24:05 -0700 Subject: [PATCH] Increase timeouts on calling local and online llms via openai api - Use much larger read, connect timeout if llm served over local url - Use larger timeout duration than default (5s) for online llms too This matches the timeout duration increase in calls to the gemini api --- .../processor/conversation/openai/utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 3a1b8947..7b1c11db 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -5,6 +5,7 @@ from time import perf_counter from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union from urllib.parse import urlparse +import httpx import openai from openai.lib.streaming.chat import ( ChatCompletionStream, @@ -102,6 +103,7 @@ def completion_with_backoff( if not deepthought and len(formatted_messages) > 0: formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think" + read_timeout = 300 if is_local_api(api_base_url) else 60 model_kwargs["stream_options"] = {"include_usage": True} if os.getenv("KHOJ_LLM_SEED"): model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -111,7 +113,7 @@ def completion_with_backoff( messages=formatted_messages, # type: ignore model=model_name, temperature=temperature, - timeout=20, + timeout=httpx.Timeout(30, read=read_timeout), **model_kwargs, ) as chat: for chunk in stream_processor(chat): @@ -217,6 +219,7 @@ async def chat_completion_with_backoff( formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think" stream = True + read_timeout = 300 if is_local_api(api_base_url) else 60 model_kwargs["stream_options"] = {"include_usage": True} if os.getenv("KHOJ_LLM_SEED"): 
model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -229,7 +232,7 @@ async def chat_completion_with_backoff( model=model_name, stream=stream, temperature=temperature, - timeout=20, + timeout=httpx.Timeout(30, read=read_timeout), **model_kwargs, ) async for chunk in stream_processor(chat_stream): @@ -313,6 +316,17 @@ def is_qwen_reasoning_model(model_name: str, api_base_url: str = None) -> bool: return "qwen3" in model_name.lower() and api_base_url is not None +def is_local_api(api_base_url: str) -> bool: + """ + Check if the API base URL is a local API + """ + if not api_base_url: + return False + + host = urlparse(api_base_url).hostname + return host == "localhost" or host == "127.0.0.1" + + class ThoughtDeltaEvent(ContentDeltaEvent): """ Chat completion chunk with thoughts, reasoning support.