Ensure time to first token logged only once per chat response

Time to first token log lines were shown multiple times if a new chunk
being streamed was empty for some reason.

This change makes the logic robust to empty chunks being received.
This commit is contained in:
Debanjum
2025-05-13 12:14:19 -06:00
parent 2694734d22
commit e125e299a7
3 changed files with 9 additions and 3 deletions

View File

@@ -144,6 +144,7 @@ async def anthropic_chat_completion_with_backoff(
formatted_messages, system_prompt = format_messages_for_anthropic(messages, system_prompt)
aggregated_response = ""
response_started = False
final_message = None
start_time = perf_counter()
async with client.messages.stream(
@@ -157,7 +158,8 @@ async def anthropic_chat_completion_with_backoff(
) as stream:
async for chunk in stream:
# Log the time taken to start response
if aggregated_response == "":
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Skip empty chunks
if chunk.type != "content_block_delta":

View File

@@ -195,13 +195,15 @@ async def gemini_chat_completion_with_backoff(
aggregated_response = ""
final_chunk = None
response_started = False
start_time = perf_counter()
chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
model=model_name, config=config, contents=formatted_messages
)
async for chunk in chat_stream:
# Log the time taken to start response
if final_chunk is None:
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Keep track of the last chunk for usage data
final_chunk = chunk

View File

@@ -226,6 +226,7 @@ async def chat_completion_with_backoff(
aggregated_response = ""
final_chunk = None
response_started = False
start_time = perf_counter()
chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
messages=formatted_messages, # type: ignore
@@ -237,7 +238,8 @@ async def chat_completion_with_backoff(
)
async for chunk in stream_processor(chat_stream):
# Log the time taken to start response
if final_chunk is None:
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Keep track of the last chunk for usage data
final_chunk = chunk