mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Set OpenAI API output tokens high by default to avoid hitting length limits
Explicitly set completion tokens high to avoid early termination issues, especially when trying to generate structured responses.
This commit is contained in:
@@ -55,6 +55,10 @@ logger = logging.getLogger(__name__)
|
||||
openai_clients: Dict[str, openai.OpenAI] = {}
|
||||
openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
|
||||
|
||||
# Default completion tokens
|
||||
# Reduce premature termination, especially when streaming structured responses
|
||||
MAX_COMPLETION_TOKENS = 16000
|
||||
|
||||
|
||||
def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str:
|
||||
"""Extract plain text from a message content suitable for Responses API instructions."""
|
||||
@@ -111,6 +115,7 @@ def completion_with_backoff(
|
||||
|
||||
model_kwargs["temperature"] = temperature
|
||||
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
||||
model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS)
|
||||
|
||||
formatted_messages = format_message_for_api(messages, model_name, api_base_url)
|
||||
|
||||
@@ -303,6 +308,7 @@ async def chat_completion_with_backoff(
|
||||
model_kwargs.pop("stream_options", None)
|
||||
|
||||
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
||||
model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS)
|
||||
|
||||
formatted_messages = format_message_for_api(messages, model_name, api_base_url)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user