Count cached and reasoning tokens when estimating cost for models served over the OpenAI API

Count cached tokens and reasoning tokens for better cost estimates for models served over an OpenAI-compatible API. Previously, we didn't include cached tokens or reasoning tokens in cost calculations.
2026-04-19 17:14:35 +00:00 · 2025-11-10 17:44:04 -08:00
parent ce6d75e5a2
commit b14e6eb069
1 changed files with 16 additions and 8 deletions
--- a/src/khoj/processor/conversation/openai/utils.py
+++ b/src/khoj/processor/conversation/openai/utils.py
@@ -237,14 +237,18 @@ def completion_with_backoff(
        chunk = chunk.chunk

    # Calculate cost of chat
-    input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-    output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-    cost = (
-        chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
-    )  # Estimated costs returned by DeepInfra API
+    input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
+    if hasattr(chunk, "usage") and chunk.usage:
+        input_tokens = chunk.usage.prompt_tokens
+        output_tokens = chunk.usage.completion_tokens
+        if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
+            cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
+        if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
+            output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
+        cost = chunk.usage.model_extra.get("estimated_cost", 0)  # Estimated costs returned by DeepInfra API

    tracer["usage"] = get_chat_usage_metrics(
-        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+        model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
    )

    # Validate the response. If empty, raise an error to retry.
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
                yield response_chunk

    # Calculate cost of chat after stream finishes
-    input_tokens, output_tokens, cost = 0, 0, 0
+    input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
    if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
        input_tokens = final_chunk.usage.prompt_tokens
        output_tokens = final_chunk.usage.completion_tokens
+        if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
+            cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
+        if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
+            output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
        # Estimated costs returned by DeepInfra API
        if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
            cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
    tracer["usage"] = get_chat_usage_metrics(
-        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+        model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
    )

    # Validate the response. If empty, raise an error to retry.