From e125e299a783c7c0b876851f53e2cadbad356eb7 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Tue, 13 May 2025 12:14:19 -0600 Subject: [PATCH] Ensure time to first token logged only once per chat response Time to first token Log lines were shown multiple times if new chunk being streamed was empty for some reason. This change makes the logic robust to empty chunks being received. --- src/khoj/processor/conversation/anthropic/utils.py | 4 +++- src/khoj/processor/conversation/google/utils.py | 4 +++- src/khoj/processor/conversation/openai/utils.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/conversation/anthropic/utils.py b/src/khoj/processor/conversation/anthropic/utils.py index e436ecda..c2db6a72 100644 --- a/src/khoj/processor/conversation/anthropic/utils.py +++ b/src/khoj/processor/conversation/anthropic/utils.py @@ -144,6 +144,7 @@ async def anthropic_chat_completion_with_backoff( formatted_messages, system_prompt = format_messages_for_anthropic(messages, system_prompt) aggregated_response = "" + response_started = False final_message = None start_time = perf_counter() async with client.messages.stream( @@ -157,7 +158,8 @@ ) as stream: async for chunk in stream: # Log the time taken to start response - if aggregated_response == "": + if not response_started: + response_started = True logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds") # Skip empty chunks if chunk.type != "content_block_delta": diff --git a/src/khoj/processor/conversation/google/utils.py b/src/khoj/processor/conversation/google/utils.py index ed37a0b3..d465cbda 100644 --- a/src/khoj/processor/conversation/google/utils.py +++ b/src/khoj/processor/conversation/google/utils.py @@ -195,13 +195,15 @@ async def gemini_chat_completion_with_backoff( aggregated_response = "" final_chunk = None + response_started = False start_time = perf_counter() chat_stream: 
AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream( model=model_name, config=config, contents=formatted_messages ) async for chunk in chat_stream: # Log the time taken to start response - if final_chunk is None: + if not response_started: + response_started = True logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds") # Keep track of the last chunk for usage data final_chunk = chunk diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 7b1c11db..77dee0c4 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -226,6 +226,7 @@ async def chat_completion_with_backoff( aggregated_response = "" final_chunk = None + response_started = False start_time = perf_counter() chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create( messages=formatted_messages, # type: ignore @@ -237,7 +238,8 @@ async def chat_completion_with_backoff( ) async for chunk in stream_processor(chat_stream): # Log the time taken to start response - if final_chunk is None: + if not response_started: + response_started = True logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds") # Keep track of the last chunk for usage data final_chunk = chunk