diff --git a/tests/eval_frames.py b/tests/eval_frames.py
index de5e7d83..ce0b9b23 100644
--- a/tests/eval_frames.py
+++ b/tests/eval_frames.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import json
 import os
 import time
@@ -90,6 +91,43 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dic
         return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}
 
 
+def process_batch(batch, start_index, results, dataset_length):
+    # Number examples from this batch's global offset so indices stay correct across threads
+    for counter, (prompt, answer, reasoning_type) in enumerate(batch, start=start_index + 1):
+        print(f"Processing example: {counter}/{dataset_length}")
+
+        # Trigger research mode if enabled
+        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
+
+        # Get agent response
+        agent_response = get_agent_response(prompt)
+
+        # Evaluate response
+        evaluation = evaluate_response(prompt, agent_response, answer)
+
+        # Store results (list.append is atomic under CPython's GIL)
+        results.append(
+            {
+                "index": counter,
+                "prompt": prompt,
+                "ground_truth": answer,
+                "agent_response": agent_response,
+                "evaluation_decision": evaluation["decision"],
+                "evaluation_explanation": evaluation["explanation"],
+                "reasoning_type": reasoning_type,
+            }
+        )
+
+        # Color the decision based on its value
+        decision_color = "green" if evaluation["decision"] is True else "red"
+        colored_decision = color_text(evaluation["decision"], decision_color)
+        print(
+            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
+        )
+
+        time.sleep(SLEEP_SECONDS)  # Rate limit requests within each batch
+
+
 def color_text(text, color):
     colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
     return f"{colors[color]}{text}{colors['reset']}"
@@ -109,49 +147,21 @@ def main():
     # Initialize variables
-    counter = 0
     results = []
+    dataset_length = len(dataset["Prompt"])
 
     # Process examples in batches
-    for i in range(0, len(dataset), BATCH_SIZE):
-        batch = zip(
-            dataset["Prompt"][i : i + BATCH_SIZE],
-            dataset["Answer"][i : i + BATCH_SIZE],
-            dataset["reasoning_types"][i : i + BATCH_SIZE],
-        )
-
-        for prompt, answer, reasoning_type in batch:
-            counter += 1
-            print(f'Processing example: {counter}/{len(dataset["Prompt"])}')
-
-            # Trigger research mode if enabled
-            prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
-
-            # Get agent response
-            agent_response = get_agent_response(prompt)
-
-            # Evaluate response
-            evaluation = evaluate_response(agent_response, answer)
-
-            # Store results
-            results.append(
-                {
-                    "index": i,
-                    "prompt": prompt,
-                    "ground_truth": answer,
-                    "agent_response": agent_response,
-                    "evaluation_decision": evaluation["decision"],
-                    "evaluation_explanation": evaluation["explanation"],
-                    "reasoning_type": reasoning_type,
-                }
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, dataset_length, BATCH_SIZE):
+            batch = zip(
+                dataset["Prompt"][i : i + BATCH_SIZE],
+                dataset["Answer"][i : i + BATCH_SIZE],
+                dataset["reasoning_types"][i : i + BATCH_SIZE],
             )
+            # Pass the batch's start offset so each thread numbers its own examples
+            futures.append(executor.submit(process_batch, batch, i, results, dataset_length))
 
-            # Color the decision based on its value
-            decision_color = "green" if evaluation["decision"] == True else "red"
-            colored_decision = color_text(evaluation["decision"], decision_color)
-            print(
-                f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
-            )
-
-            time.sleep(SLEEP_SECONDS)  # Rate limiting
+        # Wait for all futures to complete
+        concurrent.futures.wait(futures)
 
     # Calculate metrics
     df = pd.DataFrame(results)
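
Note on the threading pattern: `process_batch` mutates a shared `results` list, which is safe under CPython's GIL since `list.append` is atomic, but the same fan-out can be written without shared state by returning each batch's results from its future. Below is a minimal runnable sketch of that variant; `evaluate_one`, `run`, and the toy prompts are hypothetical stand-ins for the real `get_agent_response`/`evaluate_response` pipeline, not part of this patch.

```python
import concurrent.futures


def evaluate_one(prompt: str) -> dict:
    # Hypothetical stand-in for the real agent call + LLM evaluation.
    return {"prompt": prompt, "decision": True}


def process_batch(batch: list, start_index: int, total: int) -> list:
    # Number examples from the batch's global offset, not from zero.
    batch_results = []
    for counter, prompt in enumerate(batch, start=start_index + 1):
        print(f"Processing example: {counter}/{total}")
        batch_results.append({"index": counter, **evaluate_one(prompt)})
    return batch_results


def run(prompts: list, batch_size: int = 10) -> list:
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_batch, prompts[i : i + batch_size], i, len(prompts))
            for i in range(0, len(prompts), batch_size)
        ]
        # Collect each batch's results as its future finishes.
        for future in concurrent.futures.as_completed(futures):
            results.extend(future.result())
    return sorted(results, key=lambda r: r["index"])


if __name__ == "__main__":
    print(run([f"q{n}" for n in range(7)], batch_size=3))
```

Collecting via `as_completed` and `future.result()` also surfaces exceptions raised inside worker threads, which `concurrent.futures.wait` alone leaves stored on the futures unseen.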