mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Ensure time to first token is logged only once per chat response
Time to first token log lines were shown multiple times if a new chunk being streamed was empty for some reason. This change makes the logic robust to empty chunks being received.
This commit is contained in:
@@ -144,6 +144,7 @@ async def anthropic_chat_completion_with_backoff(
|
||||
formatted_messages, system_prompt = format_messages_for_anthropic(messages, system_prompt)
|
||||
|
||||
aggregated_response = ""
|
||||
response_started = False
|
||||
final_message = None
|
||||
start_time = perf_counter()
|
||||
async with client.messages.stream(
|
||||
@@ -157,7 +158,8 @@ async def anthropic_chat_completion_with_backoff(
|
||||
) as stream:
|
||||
async for chunk in stream:
|
||||
# Log the time taken to start response
|
||||
if aggregated_response == "":
|
||||
if not response_started:
|
||||
response_started = True
|
||||
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
|
||||
# Skip empty chunks
|
||||
if chunk.type != "content_block_delta":
|
||||
|
||||
@@ -195,13 +195,15 @@ async def gemini_chat_completion_with_backoff(
|
||||
|
||||
aggregated_response = ""
|
||||
final_chunk = None
|
||||
response_started = False
|
||||
start_time = perf_counter()
|
||||
chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
|
||||
model=model_name, config=config, contents=formatted_messages
|
||||
)
|
||||
async for chunk in chat_stream:
|
||||
# Log the time taken to start response
|
||||
if final_chunk is None:
|
||||
if not response_started:
|
||||
response_started = True
|
||||
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
|
||||
# Keep track of the last chunk for usage data
|
||||
final_chunk = chunk
|
||||
|
||||
@@ -226,6 +226,7 @@ async def chat_completion_with_backoff(
|
||||
|
||||
aggregated_response = ""
|
||||
final_chunk = None
|
||||
response_started = False
|
||||
start_time = perf_counter()
|
||||
chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
|
||||
messages=formatted_messages, # type: ignore
|
||||
@@ -237,7 +238,8 @@ async def chat_completion_with_backoff(
|
||||
)
|
||||
async for chunk in stream_processor(chat_stream):
|
||||
# Log the time taken to start response
|
||||
if final_chunk is None:
|
||||
if not response_started:
|
||||
response_started = True
|
||||
logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
|
||||
# Keep track of the last chunk for usage data
|
||||
final_chunk = chunk
|
||||
|
||||
Reference in New Issue
Block a user