Make GPQA answer evaluator more versatile at extracting MCQ answers

This commit is contained in:
Debanjum
2025-08-31 14:05:16 -07:00
parent 703e189979
commit 52b1928023

View File

@@ -436,14 +436,25 @@ def evaluate_response_with_mcq_match(
) -> tuple[bool | None, str, float]:
"""Evaluate Khoj response against benchmark ground truth using string matching"""
try:
# Extract answer from agent response
answer_pattern_multichoice = r"(?i)Answer\s*:\s*([A-D])"
match = re.search(answer_pattern_multichoice, agent_response)
extracted_answer = match.group(1) if match else None
# Extract answer from agent response using multiple patterns
answer_patterns = [
r"(?i)Answer\s*:\s*([A-D])", # Answer: D
r"(?i)(?:final\s+)?answer\s+is\s+([A-D])", # answer is D / final answer is D
r"\$\\boxed\{([A-D])\}\$", # $\boxed{D}$
r"\\boxed\{([A-D])\}", # \boxed{D}
r"\b([A-D])\b(?=\s*$)", # Just the letter at end of response
]
extracted_answer = None
for pattern in answer_patterns:
match = re.search(pattern, agent_response)
if match:
extracted_answer = match.group(1).upper()
break
# Check if extracted answer matches ground truth
decision = extracted_answer == ground_truth
explanation = f"Agent response {'matches' if decision else 'does not match'} ground truth {ground_truth}"
explanation = f'Agent response "{extracted_answer}" {"matches" if decision else "does not match"} ground truth {ground_truth}.'
# Return decision, explanation and cost in structured form
return float(decision), explanation, 0.0