Reduce timeouts on calling local and online llms via openai api

- Use much larger read, connect timeout if llm served over local url - Use larger timeout duration than default (5s) for online llms too This matches timeout duration increase calls to gemini api
2026-04-28 00:19:25 +00:00 · 2025-05-17 16:24:05 -07:00
parent e0352cd8e1
commit 20f08ca564
1 changed files with 16 additions and 2 deletions
@@ -5,6 +5,7 @@ from time import perf_counter
 from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union
 from urllib.parse import urlparse

+import httpx
 import openai
 from openai.lib.streaming.chat import (
    ChatCompletionStream,
@@ -102,6 +103,7 @@ def completion_with_backoff(
        if not deepthought and len(formatted_messages) > 0:
            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"

+    read_timeout = 300 if is_local_api(api_base_url) else 60
    model_kwargs["stream_options"] = {"include_usage": True}
    if os.getenv("KHOJ_LLM_SEED"):
        model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
@@ -111,7 +113,7 @@ def completion_with_backoff(
        messages=formatted_messages,  # type: ignore
        model=model_name,
        temperature=temperature,
-        timeout=20,
+        timeout=httpx.Timeout(30, read=read_timeout),
        **model_kwargs,
    ) as chat:
        for chunk in stream_processor(chat):
@@ -217,6 +219,7 @@ async def chat_completion_with_backoff(
                formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"

        stream = True
+        read_timeout = 300 if is_local_api(api_base_url) else 60
        model_kwargs["stream_options"] = {"include_usage": True}
        if os.getenv("KHOJ_LLM_SEED"):
            model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
@@ -229,7 +232,7 @@ async def chat_completion_with_backoff(
            model=model_name,
            stream=stream,
            temperature=temperature,
-            timeout=20,
+            timeout=httpx.Timeout(30, read=read_timeout),
            **model_kwargs,
        )
        async for chunk in stream_processor(chat_stream):
@@ -313,6 +316,17 @@ def is_qwen_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
    return "qwen3" in model_name.lower() and api_base_url is not None


+def is_local_api(api_base_url: str) -> bool:
+    """
+    Check if the API base URL is a local API
+    """
+    if not api_base_url:
+        return False
+
+    host = urlparse(api_base_url).hostname
+    return host == "localhost" or host == "127.0.0.1"
+
+
 class ThoughtDeltaEvent(ContentDeltaEvent):
    """
    Chat completion chunk with thoughts, reasoning support.