From bdf9afa726487fe34b72a99e3bb2ab17179e53e9 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 4 Dec 2025 21:53:10 -0800 Subject: [PATCH] Set OpenAI API output tokens high by default to avoid hitting length limits Explicitly set the completion token limit high to avoid premature termination, especially when generating structured responses. --- src/khoj/processor/conversation/openai/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 4f4bd358..3ebb2e62 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -55,6 +55,10 @@ logger = logging.getLogger(__name__) openai_clients: Dict[str, openai.OpenAI] = {} openai_async_clients: Dict[str, openai.AsyncOpenAI] = {} +# Default completion tokens +# Reduce premature termination, especially when streaming structured responses +MAX_COMPLETION_TOKENS = 16000 + def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str: """Extract plain text from a message content suitable for Responses API instructions.""" @@ -111,6 +115,7 @@ def completion_with_backoff( model_kwargs["temperature"] = temperature model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95) + model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS) formatted_messages = format_message_for_api(messages, model_name, api_base_url) @@ -303,6 +308,7 @@ async def chat_completion_with_backoff( model_kwargs.pop("stream_options", None) model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95) + model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS) formatted_messages = format_message_for_api(messages, model_name, api_base_url)