mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Use prompt cache key to improve cache hits with OpenAI Responses API
Using a prompt cache key enables sticky routing to OpenAI servers. This increases the probability of a chat actor hitting the same server and reusing cached prompts. We use a stable hash of the first N characters of the instructions to uniquely identify a chat actor prompt.
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -457,6 +458,11 @@ def responses_completion_with_backoff(
|
|||||||
|
|
||||||
model_kwargs = deepcopy(model_kwargs)
|
model_kwargs = deepcopy(model_kwargs)
|
||||||
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
||||||
|
|
||||||
|
# Use prompt cache key to increase probability of cache hits
|
||||||
|
if instructions:
|
||||||
|
model_kwargs["prompt_cache_key"] = f"{hashlib.md5(instructions[:500].encode()).hexdigest()}"
|
||||||
|
|
||||||
# Configure thinking for openai reasoning models
|
# Configure thinking for openai reasoning models
|
||||||
if is_openai_reasoning_model(model_name, api_base_url):
|
if is_openai_reasoning_model(model_name, api_base_url):
|
||||||
temperature = 1
|
temperature = 1
|
||||||
|
|||||||
Reference in New Issue
Block a user