From 65644f78b0a5b8f371c47fe451da77c4838ecb35 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 11 Jun 2025 10:04:14 -0700 Subject: [PATCH] Set lower max output tokens for non reasoning Gemini models While reasoning models support longer output tokens. Non reasoning models do not. Use a lower max output tokens for them --- .../processor/conversation/google/utils.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/conversation/google/utils.py b/src/khoj/processor/conversation/google/utils.py index f44ed3d4..92760e65 100644 --- a/src/khoj/processor/conversation/google/utils.py +++ b/src/khoj/processor/conversation/google/utils.py @@ -39,7 +39,8 @@ gemini_clients: Dict[str, genai.Client] = {} # Output tokens should be more than reasoning tokens. # This avoids premature response termination. -MAX_OUTPUT_TOKENS_GEMINI = 20000 +MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI = 20000 +MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI = 8000 MAX_REASONING_TOKENS_GEMINI = 10000 SAFETY_SETTINGS = [ @@ -111,15 +112,19 @@ def gemini_completion_with_backoff( response_schema = clean_response_schema(model_kwargs["response_schema"]) thinking_config = None - if deepthought and model_name.startswith("gemini-2.5"): + if deepthought and is_reasoning_model(model_name): thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI) + max_output_tokens = MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI + if is_reasoning_model(model_name): + max_output_tokens = MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI + seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None config = gtypes.GenerateContentConfig( system_instruction=system_instruction, temperature=temperature, thinking_config=thinking_config, - max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI, + max_output_tokens=max_output_tokens, safety_settings=SAFETY_SETTINGS, response_mime_type=model_kwargs.get("response_mime_type", "text/plain") if model_kwargs else "text/plain", response_schema=response_schema, @@ -188,15 +193,19 @@ async def gemini_chat_completion_with_backoff( formatted_messages, system_instruction = format_messages_for_gemini(messages, system_prompt) thinking_config = None - if deepthought and model_name.startswith("gemini-2.5"): + if deepthought and is_reasoning_model(model_name): thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI, include_thoughts=True) + max_output_tokens = MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI + if is_reasoning_model(model_name): + max_output_tokens = MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI + seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None config = gtypes.GenerateContentConfig( system_instruction=system_instruction, temperature=temperature, thinking_config=thinking_config, - max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI, + max_output_tokens=max_output_tokens, stop_sequences=["Notes:\n["], safety_settings=SAFETY_SETTINGS, seed=seed, @@ -385,3 +394,10 @@ def clean_response_schema(response_schema: BaseModel) -> dict: # Generate content in the order in which the schema properties were defined response_schema_dict["property_ordering"] = field_names return response_schema_dict + + +def is_reasoning_model(model_name: str) -> bool: + """ + Check if the model is a reasoning model. + """ + return model_name.startswith("gemini-2.5")