From 20f08ca564e48c71b1ab0fa4e98bb44a64ff50b5 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 17 May 2025 16:24:05 -0700 Subject: [PATCH] Increase timeouts on calling local and online llms via openai api - Use much larger read, connect timeout if llm served over local url - Use larger timeout duration than default (5s) for online llms too This matches the timeout duration increase in calls to the gemini api --- .../processor/conversation/openai/utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 3a1b8947..7b1c11db 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -5,6 +5,7 @@ from time import perf_counter from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union from urllib.parse import urlparse +import httpx import openai from openai.lib.streaming.chat import ( ChatCompletionStream, @@ -102,6 +103,7 @@ def completion_with_backoff( if not deepthought and len(formatted_messages) > 0: formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think" + read_timeout = 300 if is_local_api(api_base_url) else 60 model_kwargs["stream_options"] = {"include_usage": True} if os.getenv("KHOJ_LLM_SEED"): model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -111,7 +113,7 @@ def completion_with_backoff( messages=formatted_messages, # type: ignore model=model_name, temperature=temperature, - timeout=20, + timeout=httpx.Timeout(30, read=read_timeout), **model_kwargs, ) as chat: for chunk in stream_processor(chat): @@ -217,6 +219,7 @@ async def chat_completion_with_backoff( formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think" stream = True + read_timeout = 300 if is_local_api(api_base_url) else 60 model_kwargs["stream_options"] = {"include_usage": True} if os.getenv("KHOJ_LLM_SEED"): 
model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -229,7 +232,7 @@ async def chat_completion_with_backoff( model=model_name, stream=stream, temperature=temperature, - timeout=20, + timeout=httpx.Timeout(30, read=read_timeout), **model_kwargs, ) async for chunk in stream_processor(chat_stream): @@ -313,6 +316,17 @@ def is_qwen_reasoning_model(model_name: str, api_base_url: str = None) -> bool: return "qwen3" in model_name.lower() and api_base_url is not None +def is_local_api(api_base_url: str) -> bool: + """ + Check if the API base URL is a local API + """ + if not api_base_url: + return False + + host = urlparse(api_base_url).hostname + return host == "localhost" or host == "127.0.0.1" + + class ThoughtDeltaEvent(ContentDeltaEvent): """ Chat completion chunk with thoughts, reasoning support.