Reduce timeout errors when calling local and online LLMs via the OpenAI API

- Use a much larger read and connect timeout when the LLM is served from a local URL
- Use a larger timeout duration than the default (5s) for online LLMs too.
  This matches the timeout duration increase for calls to the Gemini API; see the sketch below.
Debanjum
2025-05-17 16:24:05 -07:00
parent e0352cd8e1
commit 20f08ca564
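
In effect, the change reduces to the minimal sketch below. serving_locally is a hypothetical stand-in for the is_local_api(api_base_url) check added in the diff; the 300s/60s/30s values are the ones this commit introduces, replacing the previous flat 20s timeout. The OpenAI Python client accepts an httpx.Timeout object directly via its timeout parameter, which is what the diff relies on.

import httpx

# Hypothetical flag standing in for is_local_api(api_base_url) from the diff below.
serving_locally = True

# Local models can take minutes to produce a response, so allow a much longer
# read timeout; online APIs get 60s, still well above the previous flat 20s.
read_timeout = 300 if serving_locally else 60

# The positional 30s applies to the connect, write and pool phases;
# read= overrides only the read timeout.
timeout = httpx.Timeout(30, read=read_timeout)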


@@ -5,6 +5,7 @@ from time import perf_counter
 from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union
 from urllib.parse import urlparse

+import httpx
 import openai
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
@@ -102,6 +103,7 @@ def completion_with_backoff(
     if not deepthought and len(formatted_messages) > 0:
         formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"

+    read_timeout = 300 if is_local_api(api_base_url) else 60
     model_kwargs["stream_options"] = {"include_usage": True}
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
@@ -111,7 +113,7 @@ def completion_with_backoff(
         messages=formatted_messages,  # type: ignore
         model=model_name,
         temperature=temperature,
-        timeout=20,
+        timeout=httpx.Timeout(30, read=read_timeout),
         **model_kwargs,
     ) as chat:
         for chunk in stream_processor(chat):
@@ -217,6 +219,7 @@ async def chat_completion_with_backoff(
         formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"

     stream = True
+    read_timeout = 300 if is_local_api(api_base_url) else 60
     model_kwargs["stream_options"] = {"include_usage": True}
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
@@ -229,7 +232,7 @@ async def chat_completion_with_backoff(
         model=model_name,
         stream=stream,
         temperature=temperature,
-        timeout=20,
+        timeout=httpx.Timeout(30, read=read_timeout),
         **model_kwargs,
     )
     async for chunk in stream_processor(chat_stream):
@@ -313,6 +316,17 @@ def is_qwen_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     return "qwen3" in model_name.lower() and api_base_url is not None


+def is_local_api(api_base_url: str) -> bool:
+    """
+    Check if the API base URL is a local API
+    """
+    if not api_base_url:
+        return False
+
+    host = urlparse(api_base_url).hostname
+    return host == "localhost" or host == "127.0.0.1"
+
+
 class ThoughtDeltaEvent(ContentDeltaEvent):
     """
     Chat completion chunk with thoughts, reasoning support.
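
A quick check of the new is_local_api helper: it only treats explicit localhost or 127.0.0.1 hosts as local, so other loopback spellings such as 0.0.0.0 or ::1 still fall back to the shorter online read timeout. Below is a self-contained repetition of the helper from the diff with a few illustrative checks (the example URLs are illustrative, not taken from this commit):

from urllib.parse import urlparse


def is_local_api(api_base_url: str) -> bool:
    """Check if the API base URL points at a locally hosted API."""
    if not api_base_url:
        return False
    host = urlparse(api_base_url).hostname
    return host == "localhost" or host == "127.0.0.1"


assert is_local_api("http://localhost:11434/v1")      # e.g. Ollama's default port
assert is_local_api("http://127.0.0.1:8080/v1")       # e.g. a llama.cpp server
assert not is_local_api("https://api.openai.com/v1")  # hosted API
assert not is_local_api("")                           # no base URL configured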