mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Count cached tokens and reasoning tokens to estimate cost for models served over the OpenAI API
Count cached tokens and reasoning tokens for better cost estimates for models served over an OpenAI-compatible API. Previously we didn't include cached tokens or reasoning tokens in costing.
This commit is contained in:
@@ -237,14 +237,18 @@ def completion_with_backoff(
|
|||||||
chunk = chunk.chunk
|
chunk = chunk.chunk
|
||||||
|
|
||||||
# Calculate cost of chat
|
# Calculate cost of chat
|
||||||
input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
|
input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
|
||||||
output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
|
if hasattr(chunk, "usage") and chunk.usage:
|
||||||
cost = (
|
input_tokens = chunk.usage.prompt_tokens
|
||||||
chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
|
output_tokens = chunk.usage.completion_tokens
|
||||||
) # Estimated costs returned by DeepInfra API
|
if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
|
||||||
|
cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
|
||||||
|
if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
|
||||||
|
output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
|
||||||
|
cost = chunk.usage.model_extra.get("estimated_cost", 0) # Estimated costs returned by DeepInfra API
|
||||||
|
|
||||||
tracer["usage"] = get_chat_usage_metrics(
|
tracer["usage"] = get_chat_usage_metrics(
|
||||||
model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
|
model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate the response. If empty, raise an error to retry.
|
# Validate the response. If empty, raise an error to retry.
|
||||||
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
|
|||||||
yield response_chunk
|
yield response_chunk
|
||||||
|
|
||||||
# Calculate cost of chat after stream finishes
|
# Calculate cost of chat after stream finishes
|
||||||
input_tokens, output_tokens, cost = 0, 0, 0
|
input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
|
||||||
if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
|
if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
|
||||||
input_tokens = final_chunk.usage.prompt_tokens
|
input_tokens = final_chunk.usage.prompt_tokens
|
||||||
output_tokens = final_chunk.usage.completion_tokens
|
output_tokens = final_chunk.usage.completion_tokens
|
||||||
|
if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
|
||||||
|
cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
|
||||||
|
if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
|
||||||
|
output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
|
||||||
# Estimated costs returned by DeepInfra API
|
# Estimated costs returned by DeepInfra API
|
||||||
if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
|
if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
|
||||||
cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
|
cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
|
||||||
tracer["usage"] = get_chat_usage_metrics(
|
tracer["usage"] = get_chat_usage_metrics(
|
||||||
model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
|
model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate the response. If empty, raise an error to retry.
|
# Validate the response. If empty, raise an error to retry.
|
||||||
|
|||||||
Reference in New Issue
Block a user