From b14e6eb069d28f9d6791e5ac8b01a4a92bc84e39 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Mon, 10 Nov 2025 17:44:04 -0800
Subject: [PATCH] Count cache, reasoning tokens to estimate cost for models
 served over OpenAI API

Count cached tokens and reasoning tokens for better cost estimates for
models served over an OpenAI-compatible API. Previously we didn't
include cached or reasoning tokens in costing.
---
 .../processor/conversation/openai/utils.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py
index 46e275ff..f0048baf 100644
--- a/src/khoj/processor/conversation/openai/utils.py
+++ b/src/khoj/processor/conversation/openai/utils.py
@@ -237,14 +237,18 @@ def completion_with_backoff(
 
             chunk = chunk.chunk
 
         # Calculate cost of chat
-        input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        cost = (
-            chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
-        )  # Estimated costs returned by DeepInfra API
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
+        if hasattr(chunk, "usage") and chunk.usage:
+            input_tokens = chunk.usage.prompt_tokens
+            output_tokens = chunk.usage.completion_tokens
+            if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
+                cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
+                output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
+            cost = chunk.usage.model_extra.get("estimated_cost", 0)  # Estimated costs returned by DeepInfra API
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
 
         # Validate the response. If empty, raise an error to retry.
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
                 yield response_chunk
 
         # Calculate cost of chat after stream finishes
-        input_tokens, output_tokens, cost = 0, 0, 0
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
         if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
             input_tokens = final_chunk.usage.prompt_tokens
             output_tokens = final_chunk.usage.completion_tokens
+            if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
+                cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
+                output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
             # Estimated costs returned by DeepInfra API
             if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
                 cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
 
         # Validate the response. If empty, raise an error to retry.
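
Note on the cost arithmetic: below is a minimal sketch of how an
aggregator like get_chat_usage_metrics could fold cache_read_tokens into
a dollar estimate. The price table, cache discount, and estimate_cost
helper are illustrative assumptions, not khoj's actual implementation.
Like the patch, it assumes reasoning tokens were already folded into
output_tokens and that a provider-reported estimated_cost, when present,
beats our own arithmetic.

    from typing import Optional

    # Hypothetical per-million-token prices; real rates vary by model
    # and provider.
    MODEL_PRICES = {
        "example-model": {"input": 0.15, "cached_input": 0.075, "output": 0.60},
    }

    def estimate_cost(
        model_name: str,
        input_tokens: int,
        output_tokens: int,
        cache_read_tokens: int = 0,
        provider_cost: Optional[float] = None,
    ) -> float:
        """Estimate chat cost, preferring a provider-reported figure."""
        # Some providers (e.g. DeepInfra) return an estimated cost directly.
        if provider_cost:
            return provider_cost
        prices = MODEL_PRICES.get(model_name)
        if prices is None:
            return 0.0
        # Cached prompt tokens are typically billed at a discount, so
        # split them out from the uncached remainder of the prompt.
        uncached_input_tokens = max(input_tokens - cache_read_tokens, 0)
        return (
            uncached_input_tokens * prices["input"]
            + cache_read_tokens * prices["cached_input"]
            + output_tokens * prices["output"]
        ) / 1e6

    # 10k prompt tokens (8k served from cache), 1.2k output tokens:
    # (2000 * 0.15 + 8000 * 0.075 + 1200 * 0.60) / 1e6 = $0.00162
    print(estimate_cost("example-model", 10_000, 1_200, cache_read_tokens=8_000))

Without the cache discount the same call would be priced at
(10000 * 0.15 + 1200 * 0.60) / 1e6 = $0.00222, which is the kind of
overestimate this patch avoids.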