Add costs of ai prompt cache read, write. Use for calls to Anthropic

This commit is contained in:
Debanjum
2025-03-24 11:14:30 +05:30
parent d4b0ef5e93
commit b4929905b2
5 changed files with 57 additions and 19 deletions

View File

@@ -104,7 +104,11 @@ def anthropic_completion_with_backoff(
    # Calculate cost of chat
    input_tokens = final_message.usage.input_tokens
    output_tokens = final_message.usage.output_tokens
    cache_read_tokens = final_message.usage.cache_read_input_tokens
    cache_write_tokens = final_message.usage.cache_creation_input_tokens
    tracer["usage"] = get_chat_usage_metrics(
        model_name, input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, tracer.get("usage")
    )

    # Save conversation trace
    tracer["chat_model"] = model_name
@@ -207,7 +211,11 @@ def anthropic_llm_thread(
        # Calculate cost of chat
        input_tokens = final_message.usage.input_tokens
        output_tokens = final_message.usage.output_tokens
        cache_read_tokens = final_message.usage.cache_read_input_tokens
        cache_write_tokens = final_message.usage.cache_creation_input_tokens
        tracer["usage"] = get_chat_usage_metrics(
            model_name, input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, tracer.get("usage")
        )

        # Save conversation trace
        tracer["chat_model"] = model_name

View File

@@ -109,7 +109,7 @@ def gemini_completion_with_backoff(
    # Aggregate cost of chat
    input_tokens = response.usage_metadata.prompt_token_count if response else 0
    output_tokens = response.usage_metadata.candidates_token_count if response else 0
    tracer["usage"] = get_chat_usage_metrics(model_name, input_tokens, output_tokens, usage=tracer.get("usage"))

    # Save conversation trace
    tracer["chat_model"] = model_name
@@ -191,7 +191,7 @@ def gemini_llm_thread(
        # Calculate cost of chat
        input_tokens = chunk.usage_metadata.prompt_token_count
        output_tokens = chunk.usage_metadata.candidates_token_count
        tracer["usage"] = get_chat_usage_metrics(model_name, input_tokens, output_tokens, usage=tracer.get("usage"))

        # Save conversation trace
        tracer["chat_model"] = model_name

View File

@@ -93,7 +93,9 @@ def completion_with_backoff(
        chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
    )  # Estimated costs returned by DeepInfra API
    tracer["usage"] = get_chat_usage_metrics(
        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
    )

    # Save conversation trace
    tracer["chat_model"] = model_name
@@ -226,7 +228,9 @@ def llm_thread(
        cost = (
            chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
        )  # Estimated costs returned by DeepInfra API
        tracer["usage"] = get_chat_usage_metrics(
            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
        )

        # Save conversation trace
        tracer["chat_model"] = model_name

View File

@@ -47,12 +47,12 @@ model_to_cost: Dict[str, Dict[str, float]] = {
    "gemini-1.5-pro": {"input": 1.25, "output": 5.00},
    "gemini-1.5-pro-002": {"input": 1.25, "output": 5.00},
    "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
    # Anthropic Pricing: https://www.anthropic.com/pricing#anthropic-api
    "claude-3-5-haiku-20241022": {"input": 1.0, "output": 5.0, "cache_read": 0.08, "cache_write": 1.0},
    "claude-3-5-haiku@20241022": {"input": 1.0, "output": 5.0, "cache_read": 0.08, "cache_write": 1.0},
    "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0, "cache_read": 0.3, "cache_write": 3.75},
    "claude-3-5-sonnet-latest": {"input": 3.0, "output": 15.0, "cache_read": 0.3, "cache_write": 3.75},
    "claude-3-7-sonnet-20250219": {"input": 3.0, "output": 15.0, "cache_read": 0.3, "cache_write": 3.75},
    "claude-3-7-sonnet@20250219": {"input": 3.0, "output": 15.0, "cache_read": 0.3, "cache_write": 3.75},
    "claude-3-7-sonnet-latest": {"input": 3.0, "output": 15.0, "cache_read": 0.3, "cache_write": 3.75},
}

View File

@@ -596,7 +596,14 @@ def get_country_name_from_timezone(tz: str) -> str:
    return country_names.get(get_country_code_from_timezone(tz), "United States")


def get_cost_of_chat_message(
    model_name: str,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cache_read_tokens: int = 0,
    cache_write_tokens: int = 0,
    prev_cost: float = 0.0,
):
    """
    Calculate cost of chat message based on input and output tokens
    """
@@ -604,21 +611,40 @@ def get_cost_of_chat_message(model_name: str, input_tokens: int = 0, output_toke
    # Calculate cost of input and output tokens. Costs are per million tokens
    input_cost = constants.model_to_cost.get(model_name, {}).get("input", 0) * (input_tokens / 1e6)
    output_cost = constants.model_to_cost.get(model_name, {}).get("output", 0) * (output_tokens / 1e6)
    cache_read_cost = constants.model_to_cost.get(model_name, {}).get("cache_read", 0) * (cache_read_tokens / 1e6)
    cache_write_cost = constants.model_to_cost.get(model_name, {}).get("cache_write", 0) * (cache_write_tokens / 1e6)

    return input_cost + output_cost + cache_read_cost + cache_write_cost + prev_cost


def get_chat_usage_metrics(
    model_name: str,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cache_read_tokens: int = 0,
    cache_write_tokens: int = 0,
    usage: dict = {},
    cost: float = None,
):
    """
    Get usage metrics for chat message based on input and output tokens and cost
    """
    prev_usage = usage or {
        "input_tokens": 0,
        "output_tokens": 0,
        "cache_read_tokens": 0,
        "cache_write_tokens": 0,
        "cost": 0.0,
    }
    return {
        "input_tokens": prev_usage["input_tokens"] + input_tokens,
        "output_tokens": prev_usage["output_tokens"] + output_tokens,
        "cache_read_tokens": prev_usage.get("cache_read_tokens", 0) + cache_read_tokens,
        "cache_write_tokens": prev_usage.get("cache_write_tokens", 0) + cache_write_tokens,
        "cost": cost
        or get_cost_of_chat_message(
            model_name, input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, prev_cost=prev_usage["cost"]
        ),
    }