From 52b1928023e4f7430a42087d136dde61dab61133 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Sun, 31 Aug 2025 14:05:16 -0700
Subject: [PATCH] Make GPQA answer evaluator more versatile at extracting MCQ
 answers

---
 tests/evals/eval.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/evals/eval.py b/tests/evals/eval.py
index a012d364..2e74994f 100644
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@@ -436,14 +436,25 @@ def evaluate_response_with_mcq_match(
 ) -> tuple[bool | None, str, float]:
     """Evaluate Khoj response against benchmark ground truth using string matching"""
     try:
-        # Extract answer from agent response
-        answer_pattern_multichoice = r"(?i)Answer\s*:\s*([A-D])"
-        match = re.search(answer_pattern_multichoice, agent_response)
-        extracted_answer = match.group(1) if match else None
+        # Extract answer from agent response using multiple patterns
+        answer_patterns = [
+            r"(?i)Answer\s*:\s*([A-D])",  # Answer: D
+            r"(?i)(?:final\s+)?answer\s+is\s+([A-D])",  # answer is D / final answer is D
+            r"\$\\boxed\{([A-D])\}\$",  # $\boxed{D}$
+            r"\\boxed\{([A-D])\}",  # \boxed{D}
+            r"\b([A-D])\b(?=\s*$)",  # Just the letter at end of response
+        ]
+
+        extracted_answer = None
+        for pattern in answer_patterns:
+            match = re.search(pattern, agent_response)
+            if match:
+                extracted_answer = match.group(1).upper()
+                break
 
         # Check if extracted answer matches ground truth
         decision = extracted_answer == ground_truth
-        explanation = f"Agent response {'matches' if decision else 'does not match'} ground truth {ground_truth}"
+        explanation = f'Agent response "{extracted_answer}" {"matches" if decision else "does not match"} ground truth {ground_truth}.'
 
         # Return decision, explanation and cost in structured form
         return float(decision), explanation, 0.0