From 52b1928023e4f7430a42087d136dde61dab61133 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 31 Aug 2025 14:05:16 -0700 Subject: [PATCH] Make gpqa answer evaluator more versatile at extracting mcq answers --- tests/evals/eval.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/evals/eval.py b/tests/evals/eval.py index a012d364..2e74994f 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -436,14 +436,25 @@ def evaluate_response_with_mcq_match( ) -> tuple[bool | None, str, float]: """Evaluate Khoj response against benchmark ground truth using string matching""" try: - # Extract answer from agent response - answer_pattern_multichoice = r"(?i)Answer\s*:\s*([A-D])" - match = re.search(answer_pattern_multichoice, agent_response) - extracted_answer = match.group(1) if match else None + # Extract answer from agent response using multiple patterns + answer_patterns = [ + r"(?i)Answer\s*:\s*([A-D])", # Answer: D + r"(?i)(?:final\s+)?answer\s+is\s+([A-D])", # answer is D / final answer is D + r"\$\\boxed\{([A-D])\}\$", # $\boxed{D}$ + r"\\boxed\{([A-D])\}", # \boxed{D} + r"\b([A-D])\b(?=\s*$)", # Just the letter at end of response + ] + + extracted_answer = None + for pattern in answer_patterns: + match = re.search(pattern, agent_response) + if match: + extracted_answer = match.group(1).upper() + break # Check if extracted answer matches ground truth decision = extracted_answer == ground_truth - explanation = f"Agent response {'matches' if decision else 'does not match'} ground truth {ground_truth}" + explanation = f'Agent response "{extracted_answer}" {"matches" if decision else "does not match"} ground truth {ground_truth}.' # Return decision, explanation and cost in structured form return float(decision), explanation, 0.0