Count cache, reasoning tokens to estimate cost for models served over OpenAI API

Count cached tokens and reasoning tokens for better cost estimates for
models served over an OpenAI-compatible API. Previously we didn't
include cached tokens or reasoning tokens in costing.
This commit is contained in:
Debanjum
2025-11-10 17:44:04 -08:00
parent ce6d75e5a2
commit b14e6eb069


@@ -237,14 +237,18 @@ def completion_with_backoff(
                 chunk = chunk.chunk
         # Calculate cost of chat
-        input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-        cost = (
-            chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
-        )  # Estimated costs returned by DeepInfra API
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
+        if hasattr(chunk, "usage") and chunk.usage:
+            input_tokens = chunk.usage.prompt_tokens
+            output_tokens = chunk.usage.completion_tokens
+            if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
+                cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
+                output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
+            cost = chunk.usage.model_extra.get("estimated_cost", 0)  # Estimated costs returned by DeepInfra API
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
     # Validate the response. If empty, raise an error to retry.
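
For context, the usage payload parsed above follows the OpenAI chat completions shape: prompt_tokens_details.cached_tokens counts prompt tokens served from the provider's prompt cache, and completion_tokens_details.reasoning_tokens counts hidden reasoning tokens billed as output. A minimal sketch of the same extraction against a plain dict (whether reasoning tokens arrive on top of completion_tokens, as the hunk assumes, varies by provider):

    # Example usage payload in the OpenAI chat completions shape.
    usage = {
        "prompt_tokens": 12_000,
        "completion_tokens": 800,
        "prompt_tokens_details": {"cached_tokens": 9_000},
        "completion_tokens_details": {"reasoning_tokens": 2_500},
    }

    input_tokens = usage.get("prompt_tokens", 0)
    output_tokens = usage.get("completion_tokens", 0)
    # Cached tokens are a subset of prompt_tokens, usually billed at a discount.
    cache_read_tokens = (usage.get("prompt_tokens_details") or {}).get("cached_tokens", 0)
    # Count reasoning tokens as extra output, mirroring the hunk above.
    output_tokens += (usage.get("completion_tokens_details") or {}).get("reasoning_tokens", 0)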
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
             yield response_chunk
         # Calculate cost of chat after stream finishes
-        input_tokens, output_tokens, cost = 0, 0, 0
+        input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
         if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
             input_tokens = final_chunk.usage.prompt_tokens
             output_tokens = final_chunk.usage.completion_tokens
+            if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
+                cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
+            if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
+                output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
             # Estimated costs returned by DeepInfra API
             if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
                 cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
         tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+            model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
         )
     # Validate the response. If empty, raise an error to retry.
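
get_chat_usage_metrics is the project's own accumulator and its pricing logic isn't shown in this diff. As a hedged sketch of why the new cache_read_tokens argument matters, here is one way a cache-aware estimate could be derived from these counts, with illustrative per-token prices (the helper's real rates and formula may differ):

    def estimate_cost(input_tokens: int, output_tokens: int, cache_read_tokens: int) -> float:
        """Hypothetical cost estimate; prices are illustrative, not the project's."""
        input_price = 2.50 / 1e6    # $/token for uncached prompt tokens (assumed)
        cached_price = 1.25 / 1e6   # $/token for cache reads, often ~50% off (assumed)
        output_price = 10.00 / 1e6  # $/token for output, reasoning included (assumed)
        uncached_tokens = input_tokens - cache_read_tokens  # cached tokens are a subset of the prompt
        return uncached_tokens * input_price + cache_read_tokens * cached_price + output_tokens * output_price

    # 12k prompt tokens (9k cached) and 3.3k output tokens (reasoning included):
    # 3_000 * 2.50/1e6 + 9_000 * 1.25/1e6 + 3_300 * 10.00/1e6 ≈ $0.052

Pricing the same request without these counts (all 12k prompt tokens at the full rate, only the 800 visible output tokens) gives ≈ $0.038, so ignoring cache reads and reasoning tokens can skew the estimate in both directions.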