mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Use Groq with service tier auto to fall back to flex on rate limit
Merge gpt-oss config with the OpenAI reasoning config, as they need similar tuning. Add pricing for the gpt-oss-20b model
This commit is contained in:
@@ -145,11 +145,8 @@ def completion_with_backoff(
|
|||||||
# See https://qwenlm.github.io/blog/qwen3/#advanced-usages
|
# See https://qwenlm.github.io/blog/qwen3/#advanced-usages
|
||||||
if not deepthought:
|
if not deepthought:
|
||||||
add_qwen_no_think_tag(formatted_messages)
|
add_qwen_no_think_tag(formatted_messages)
|
||||||
elif "gpt-oss" in model_name.lower():
|
elif is_groq_api(api_base_url):
|
||||||
model_kwargs["temperature"] = 1
|
model_kwargs["service_tier"] = "auto"
|
||||||
reasoning_effort = "medium" if deepthought else "low"
|
|
||||||
model_kwargs["reasoning_effort"] = reasoning_effort
|
|
||||||
model_kwargs["top_p"] = 1.0
|
|
||||||
|
|
||||||
read_timeout = 300 if is_local_api(api_base_url) else 60
|
read_timeout = 300 if is_local_api(api_base_url) else 60
|
||||||
if os.getenv("KHOJ_LLM_SEED"):
|
if os.getenv("KHOJ_LLM_SEED"):
|
||||||
@@ -355,11 +352,8 @@ async def chat_completion_with_backoff(
|
|||||||
# See https://qwenlm.github.io/blog/qwen3/#advanced-usages
|
# See https://qwenlm.github.io/blog/qwen3/#advanced-usages
|
||||||
if not deepthought:
|
if not deepthought:
|
||||||
add_qwen_no_think_tag(formatted_messages)
|
add_qwen_no_think_tag(formatted_messages)
|
||||||
elif "gpt-oss" in model_name.lower():
|
elif is_groq_api(api_base_url):
|
||||||
temperature = 1
|
model_kwargs["service_tier"] = "auto"
|
||||||
reasoning_effort = "medium" if deepthought else "low"
|
|
||||||
model_kwargs["reasoning_effort"] = reasoning_effort
|
|
||||||
model_kwargs["top_p"] = 1.0
|
|
||||||
|
|
||||||
read_timeout = 300 if is_local_api(api_base_url) else 60
|
read_timeout = 300 if is_local_api(api_base_url) else 60
|
||||||
if os.getenv("KHOJ_LLM_SEED"):
|
if os.getenv("KHOJ_LLM_SEED"):
|
||||||
@@ -854,8 +848,10 @@ def is_openai_reasoning_model(model_name: str, api_base_url: str = None) -> bool
|
|||||||
"""
|
"""
|
||||||
Check if the model is an OpenAI reasoning model
|
Check if the model is an OpenAI reasoning model
|
||||||
"""
|
"""
|
||||||
return is_openai_api(api_base_url) and (
|
return (
|
||||||
model_name.lower().startswith("o") or model_name.lower().startswith("gpt-5")
|
is_openai_api(api_base_url)
|
||||||
|
and (model_name.lower().startswith("o") or model_name.lower().startswith("gpt-5"))
|
||||||
|
or model_name.lower().startswith("gpt-oss")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -879,6 +875,13 @@ def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> boo
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_groq_api(api_base_url: str = None) -> bool:
|
||||||
|
"""
|
||||||
|
Check if the model is served over the Groq API
|
||||||
|
"""
|
||||||
|
return api_base_url is not None and api_base_url.startswith("https://api.groq.com")
|
||||||
|
|
||||||
|
|
||||||
def is_qwen_style_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
|
def is_qwen_style_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if the model is a Qwen style reasoning model
|
Check if the model is a Qwen style reasoning model
|
||||||
|
|||||||
@@ -75,4 +75,5 @@ model_to_cost: Dict[str, Dict[str, float]] = {
|
|||||||
# Groq pricing
|
# Groq pricing
|
||||||
"moonshotai/kimi-k2-instruct": {"input": 1.00, "output": 3.00},
|
"moonshotai/kimi-k2-instruct": {"input": 1.00, "output": 3.00},
|
||||||
"openai/gpt-oss-120b": {"input": 0.15, "output": 0.75},
|
"openai/gpt-oss-120b": {"input": 0.15, "output": 0.75},
|
||||||
|
"openai/gpt-oss-20b": {"input": 0.10, "output": 0.50},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user