diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 4f4bd358..3ebb2e62 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -55,6 +55,10 @@ logger = logging.getLogger(__name__) openai_clients: Dict[str, openai.OpenAI] = {} openai_async_clients: Dict[str, openai.AsyncOpenAI] = {} +# Default cap on completion tokens, used when the caller does not set max_completion_tokens +# Kept high to reduce premature termination, especially when streaming structured responses +MAX_COMPLETION_TOKENS = 16000 + def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str: """Extract plain text from a message content suitable for Responses API instructions.""" @@ -111,6 +115,7 @@ def completion_with_backoff( model_kwargs["temperature"] = temperature model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95) + model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS) formatted_messages = format_message_for_api(messages, model_name, api_base_url) @@ -303,6 +308,7 @@ async def chat_completion_with_backoff( model_kwargs.pop("stream_options", None) model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95) + model_kwargs["max_completion_tokens"] = model_kwargs.get("max_completion_tokens", MAX_COMPLETION_TOKENS) formatted_messages = format_message_for_api(messages, model_name, api_base_url)