mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Set lower max output tokens for non-reasoning Gemini models
Reasoning models support longer outputs, but non-reasoning models do not, so use a lower max output token limit for them.
This commit is contained in:
@@ -39,7 +39,8 @@ gemini_clients: Dict[str, genai.Client] = {}
|
|||||||
|
|
||||||
# Output tokens should be more than reasoning tokens.
|
# Output tokens should be more than reasoning tokens.
|
||||||
# This avoids premature response termination.
|
# This avoids premature response termination.
|
||||||
MAX_OUTPUT_TOKENS_GEMINI = 20000
|
MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI = 20000
|
||||||
|
MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI = 8000
|
||||||
MAX_REASONING_TOKENS_GEMINI = 10000
|
MAX_REASONING_TOKENS_GEMINI = 10000
|
||||||
|
|
||||||
SAFETY_SETTINGS = [
|
SAFETY_SETTINGS = [
|
||||||
@@ -111,15 +112,19 @@ def gemini_completion_with_backoff(
|
|||||||
response_schema = clean_response_schema(model_kwargs["response_schema"])
|
response_schema = clean_response_schema(model_kwargs["response_schema"])
|
||||||
|
|
||||||
thinking_config = None
|
thinking_config = None
|
||||||
if deepthought and model_name.startswith("gemini-2.5"):
|
if deepthought and is_reasoning_model(model_name):
|
||||||
thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
|
thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
|
||||||
|
|
||||||
|
max_output_tokens = MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI
|
||||||
|
if is_reasoning_model(model_name):
|
||||||
|
max_output_tokens = MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI
|
||||||
|
|
||||||
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
||||||
config = gtypes.GenerateContentConfig(
|
config = gtypes.GenerateContentConfig(
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
thinking_config=thinking_config,
|
thinking_config=thinking_config,
|
||||||
max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
|
max_output_tokens=max_output_tokens,
|
||||||
safety_settings=SAFETY_SETTINGS,
|
safety_settings=SAFETY_SETTINGS,
|
||||||
response_mime_type=model_kwargs.get("response_mime_type", "text/plain") if model_kwargs else "text/plain",
|
response_mime_type=model_kwargs.get("response_mime_type", "text/plain") if model_kwargs else "text/plain",
|
||||||
response_schema=response_schema,
|
response_schema=response_schema,
|
||||||
@@ -188,15 +193,19 @@ async def gemini_chat_completion_with_backoff(
|
|||||||
formatted_messages, system_instruction = format_messages_for_gemini(messages, system_prompt)
|
formatted_messages, system_instruction = format_messages_for_gemini(messages, system_prompt)
|
||||||
|
|
||||||
thinking_config = None
|
thinking_config = None
|
||||||
if deepthought and model_name.startswith("gemini-2.5"):
|
if deepthought and is_reasoning_model(model_name):
|
||||||
thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI, include_thoughts=True)
|
thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI, include_thoughts=True)
|
||||||
|
|
||||||
|
max_output_tokens = MAX_OUTPUT_TOKENS_FOR_STANDARD_GEMINI
|
||||||
|
if is_reasoning_model(model_name):
|
||||||
|
max_output_tokens = MAX_OUTPUT_TOKENS_FOR_REASONING_GEMINI
|
||||||
|
|
||||||
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
||||||
config = gtypes.GenerateContentConfig(
|
config = gtypes.GenerateContentConfig(
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
thinking_config=thinking_config,
|
thinking_config=thinking_config,
|
||||||
max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
|
max_output_tokens=max_output_tokens,
|
||||||
stop_sequences=["Notes:\n["],
|
stop_sequences=["Notes:\n["],
|
||||||
safety_settings=SAFETY_SETTINGS,
|
safety_settings=SAFETY_SETTINGS,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
@@ -385,3 +394,10 @@ def clean_response_schema(response_schema: BaseModel) -> dict:
|
|||||||
# Generate content in the order in which the schema properties were defined
|
# Generate content in the order in which the schema properties were defined
|
||||||
response_schema_dict["property_ordering"] = field_names
|
response_schema_dict["property_ordering"] = field_names
|
||||||
return response_schema_dict
|
return response_schema_dict
|
||||||
|
|
||||||
|
|
||||||
|
def is_reasoning_model(model_name: str) -> bool:
    """
    Check if the given Gemini model is a reasoning model.

    Models in the Gemini 2.5 family are treated as reasoning models. The
    check is a prefix match on the model id, so dated or preview variants
    (e.g. "gemini-2.5-flash-preview") match as well. Both the bare id
    ("gemini-2.5-pro") and the fully-qualified resource form
    ("models/gemini-2.5-pro") are recognised, so a qualified name is not
    silently classified as a standard (non-reasoning) model.

    Args:
        model_name: Name of the Gemini model to classify.

    Returns:
        True if the model name denotes a Gemini 2.5 model, else False.
    """
    # str.startswith accepts a tuple of prefixes: one call covers both forms.
    return model_name.startswith(("gemini-2.5", "models/gemini-2.5"))
|
||||||
|
|||||||
Reference in New Issue
Block a user