From bdf9afa726487fe34b72a99e3bb2ab17179e53e9 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Thu, 4 Dec 2025 21:53:10 -0800
Subject: [PATCH] Set openai api output tokens high by default to not hit
 length limits

Explicitly default max_completion_tokens to a high value (16000),
unless the caller already sets it, to avoid early termination issues,
especially when trying to generate structured responses.
---
 src/khoj/processor/conversation/openai/utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py
index 4f4bd358..3ebb2e62 100644
--- a/src/khoj/processor/conversation/openai/utils.py
+++ b/src/khoj/processor/conversation/openai/utils.py
@@ -55,6 +55,10 @@ logger = logging.getLogger(__name__)
 openai_clients: Dict[str, openai.OpenAI] = {}
 openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
 
+# Default upper bound on completion (output) tokens, used when the caller does not set max_completion_tokens.
+# Kept high to reduce premature termination, especially when streaming structured responses
+MAX_COMPLETION_TOKENS = 16000
+
 
 def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str:
     """Extract plain text from a message content suitable for Responses API instructions."""
@@ -111,6 +115,7 @@ def completion_with_backoff(
 
     model_kwargs["temperature"] = temperature
     model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
+    model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS)
 
     formatted_messages = format_message_for_api(messages, model_name, api_base_url)
 
@@ -303,6 +308,7 @@ async def chat_completion_with_backoff(
         model_kwargs.pop("stream_options", None)
 
     model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
+    model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS)
 
     formatted_messages = format_message_for_api(messages, model_name, api_base_url)