mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Use prompt cache key to improve cache hits with OpenAI Responses API
Using a prompt cache key enables sticky routing to OpenAI servers. This increases the probability of a chat actor hitting the same server and reusing cached prompts. We use a stable hash of the first N characters of the instructions to uniquely identify a chat actor prompt.
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -457,6 +458,11 @@ def responses_completion_with_backoff(
|
|||||||
|
|
||||||
model_kwargs = deepcopy(model_kwargs)
|
model_kwargs = deepcopy(model_kwargs)
|
||||||
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
|
||||||
|
|
||||||
|
# Use prompt cache key to increase probability of cache hits
|
||||||
|
if instructions:
|
||||||
|
model_kwargs["prompt_cache_key"] = f"{hashlib.md5(instructions[:500].encode()).hexdigest()}"
|
||||||
|
|
||||||
# Configure thinking for openai reasoning models
|
# Configure thinking for openai reasoning models
|
||||||
if is_openai_reasoning_model(model_name, api_base_url):
|
if is_openai_reasoning_model(model_name, api_base_url):
|
||||||
temperature = 1
|
temperature = 1
|
||||||
|
|||||||
Reference in New Issue
Block a user