Ensure time to first token logged only once per chat response

Time to first token log lines were shown multiple times if a new chunk
being streamed was empty for some reason.

This change makes the logic robust to empty chunks being received.
This commit is contained in:
Debanjum
2025-05-13 12:14:19 -06:00
parent 2694734d22
commit e125e299a7
3 changed files with 9 additions and 3 deletions

View File

@@ -144,6 +144,7 @@ async def anthropic_chat_completion_with_backoff(
formatted_messages, system_prompt = format_messages_for_anthropic(messages, system_prompt)
aggregated_response = ""
response_started = False
final_message = None
start_time = perf_counter()
async with client.messages.stream(
@@ -157,7 +158,8 @@ async def anthropic_chat_completion_with_backoff(
) as stream:
async for chunk in stream:
# Log the time taken to start response
if aggregated_response == "":
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Skip empty chunks
if chunk.type != "content_block_delta":

View File

@@ -195,13 +195,15 @@ async def gemini_chat_completion_with_backoff(
aggregated_response = ""
final_chunk = None
response_started = False
start_time = perf_counter()
chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
model=model_name, config=config, contents=formatted_messages
)
async for chunk in chat_stream:
# Log the time taken to start response
if final_chunk is None:
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Keep track of the last chunk for usage data
final_chunk = chunk

View File

@@ -226,6 +226,7 @@ async def chat_completion_with_backoff(
aggregated_response = ""
final_chunk = None
response_started = False
start_time = perf_counter()
chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
messages=formatted_messages, # type: ignore
@@ -237,7 +238,8 @@ async def chat_completion_with_backoff(
)
async for chunk in stream_processor(chat_stream):
# Log the time taken to start response
if final_chunk is None:
if not response_started:
response_started = True
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
# Keep track of the last chunk for usage data
final_chunk = chunk