diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py
index 46e275ff..f0048baf 100644
--- a/src/khoj/processor/conversation/openai/utils.py
+++ b/src/khoj/processor/conversation/openai/utils.py
@@ -237,14 +237,18 @@ def completion_with_backoff(
                 chunk = chunk.chunk

             # Calculate cost of chat
-            input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-            output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
-            cost = (
-                chunk.usage.model_extra.get("estimated_cost", 0) if hasattr(chunk, "usage") and chunk.usage else 0
-            )  # Estimated costs returned by DeepInfra API
+            input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
+            if hasattr(chunk, "usage") and chunk.usage:
+                input_tokens = chunk.usage.prompt_tokens
+                output_tokens = chunk.usage.completion_tokens
+                if hasattr(chunk.usage, "prompt_tokens_details") and chunk.usage.prompt_tokens_details:
+                    cache_read_tokens = chunk.usage.prompt_tokens_details.cached_tokens
+                if hasattr(chunk.usage, "completion_tokens_details") and chunk.usage.completion_tokens_details:
+                    output_tokens += chunk.usage.completion_tokens_details.reasoning_tokens
+                cost = chunk.usage.model_extra.get("estimated_cost", 0)  # Estimated costs returned by DeepInfra API

             tracer["usage"] = get_chat_usage_metrics(
-                model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+                model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
             )

     # Validate the response. If empty, raise an error to retry.
@@ -392,15 +396,19 @@ async def chat_completion_with_backoff(
                 yield response_chunk

     # Calculate cost of chat after stream finishes
-    input_tokens, output_tokens, cost = 0, 0, 0
+    input_tokens, output_tokens, cache_read_tokens, cost = 0, 0, 0, 0
     if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
         input_tokens = final_chunk.usage.prompt_tokens
         output_tokens = final_chunk.usage.completion_tokens
+        if hasattr(final_chunk.usage, "prompt_tokens_details") and final_chunk.usage.prompt_tokens_details:
+            cache_read_tokens = final_chunk.usage.prompt_tokens_details.cached_tokens
+        if hasattr(final_chunk.usage, "completion_tokens_details") and final_chunk.usage.completion_tokens_details:
+            output_tokens += final_chunk.usage.completion_tokens_details.reasoning_tokens
         # Estimated costs returned by DeepInfra API
         if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
             cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
     tracer["usage"] = get_chat_usage_metrics(
-        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+        model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
     )

     # Validate the response. If empty, raise an error to retry.
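
For context, both call sites above thread the new cache_read_tokens count into get_chat_usage_metrics. Below is a minimal sketch of what such an accumulator could look like, assuming a dict-shaped usage object, a hypothetical per-model price table, and a discounted rate for cached prompt reads; the actual helper in khoj may differ.

from typing import Optional

def get_chat_usage_metrics(
    model_name: str,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cache_read_tokens: int = 0,
    usage: Optional[dict] = None,
    cost: float = 0.0,
) -> dict:
    """Accumulate token counts across calls and estimate cost in USD."""
    # Hypothetical USD prices per million tokens; cached prompt reads are
    # typically billed at a steep discount relative to uncached input.
    prices = {"gpt-4o-mini": {"input": 0.15, "cached": 0.075, "output": 0.60}}
    usage = usage or {}

    # Uncached input is the total prompt minus the cache-served portion.
    uncached_input_tokens = max(input_tokens - cache_read_tokens, 0)
    # Prefer a provider-reported cost (e.g. DeepInfra's estimated_cost);
    # otherwise fall back to the illustrative price table above.
    if not cost and model_name in prices:
        price = prices[model_name]
        cost = (
            uncached_input_tokens * price["input"]
            + cache_read_tokens * price["cached"]
            + output_tokens * price["output"]
        ) / 1e6  # prices are per 1M tokens

    usage["input_tokens"] = usage.get("input_tokens", 0) + input_tokens
    usage["output_tokens"] = usage.get("output_tokens", 0) + output_tokens
    usage["cache_read_tokens"] = usage.get("cache_read_tokens", 0) + cache_read_tokens
    usage["cost"] = usage.get("cost", 0.0) + cost
    return usage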