From b14e6eb069d28f9d6791e5ac8b01a4a92bc84e39 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Mon, 10 Nov 2025 17:44:04 -0800
Subject: [PATCH] Count cache, reasoning tokens to estimate cost for models
 served over OpenAI API

Count cached tokens and reasoning tokens for better cost estimates for
models served over an OpenAI-compatible API. Previously we didn't
include cached or reasoning tokens in costing.
---
 .../processor/conversation/openai/utils.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py
index 46e275ff..f0048baf 100644
--- a/src/khoj/processor/conversation/openai/utils.py
+++ b/src/khoj/processor/conversation/openai/utils.py
@@ -237,14 +237,18 @@ def completion_with_backoff(
 
             chunk = chunk.chunk
 
         # Calculate cost of chat
-        input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        cost = (
-            chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
-        )  # Estimated costs returned by DeepInfra API
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
+        if hasattr(chunk, "usage") and chunk.usage:
+            input_tokens = chunk.usage.prompt_tokens
+            output_tokens = chunk.usage.completion_tokens
+            if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
+                cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
+                output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
+            cost = chunk.usage.model_extra.get("estimated_cost", 0)  # Estimated costs returned by DeepInfra API
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
 
         # Validate the response. If empty, raise an error to retry.
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
                 yield response_chunk
 
         # Calculate cost of chat after stream finishes
-        input_tokens, output_tokens, cost = 0, 0, 0
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
         if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
             input_tokens = final_chunk.usage.prompt_tokens
             output_tokens = final_chunk.usage.completion_tokens
+            if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
+                cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
+                output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
             # Estimated costs returned by DeepInfra API
             if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
                 cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
 
         # Validate the response. If empty, raise an error to retry.
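
Note on the cost arithmetic: below is a minimal sketch of how an
aggregator like get_chat_usage_metrics could fold cache_read_tokens into
a dollar estimate. The price table, cache discount, and estimate_cost
helper are illustrative assumptions, not khoj's actual implementation.
Like the patch, it assumes reasoning tokens were already folded into
output_tokens and that a provider-reported estimated_cost, when present,
beats our own arithmetic.

    from typing import Optional

    # Hypothetical per-million-token prices; real rates vary by model
    # and provider.
    MODEL_PRICES = {
        "example-model": {"input": 0.15, "cached_input": 0.075, "output": 0.60},
    }

    def estimate_cost(
        model_name: str,
        input_tokens: int,
        output_tokens: int,
        cache_read_tokens: int = 0,
        provider_cost: Optional[float] = None,
    ) -> float:
        """Estimate chat cost, preferring a provider-reported figure."""
        # Some providers (e.g. DeepInfra) return an estimated cost directly.
        if provider_cost:
            return provider_cost
        prices = MODEL_PRICES.get(model_name)
        if prices is None:
            return 0.0
        # Cached prompt tokens are typically billed at a discount, so
        # split them out from the uncached remainder of the prompt.
        uncached_input_tokens = max(input_tokens - cache_read_tokens, 0)
        return (
            uncached_input_tokens * prices["input"]
            + cache_read_tokens * prices["cached_input"]
            + output_tokens * prices["output"]
        ) / 1e6

    # 10k prompt tokens (8k served from cache), 1.2k output tokens:
    # (2000 * 0.15 + 8000 * 0.075 + 1200 * 0.60) / 1e6 = $0.00162
    print(estimate_cost("example-model", 10_000, 1_200, cache_read_tokens=8_000))

Without the cache discount the same call would be priced at
(10000 * 0.15 + 1200 * 0.60) / 1e6 = $0.00222, which is the kind of
overestimate this patch avoids.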