From d9d58849581883675edb9bbe8815705d6468ea04 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Thu, 14 Nov 2024 15:55:00 -0800
Subject: [PATCH] Enable evaluating Khoj on the OpenAI SimpleQA bench using eval script

- Load the raw csv from the OpenAI public bucket. Normalize it into FRAMES format
- Improve docstring for the frames dataset loader as well
- Log the dataset load perf timer at info level
---
 tests/eval_frames.py | 64 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/tests/eval_frames.py b/tests/eval_frames.py
index 7788322d..8716e4d1 100644
--- a/tests/eval_frames.py
+++ b/tests/eval_frames.py
@@ -5,6 +5,7 @@ import logging
 import os
 import time
 from datetime import datetime
+from io import StringIO
 from typing import Any, Dict
 
 import pandas as pd
@@ -36,11 +37,21 @@ SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting
 
 
 def load_frames_dataset():
-    """Load the FRAMES benchmark dataset from HuggingFace"""
+    """
+    Load the Google FRAMES benchmark dataset from HuggingFace.
+
+    FRAMES is a benchmark dataset to evaluate the retrieval and answering capabilities of agents.
+    It contains ~800 questions requiring multi-hop retrieval and reasoning across various topics.
+
+    ### Data Fields
+    - Prompt: The question to be answered
+    - Answer: The ground truth answer
+    - reasoning_types: The type of reasoning required to answer the question
+    """
     try:
         dataset = load_dataset("google/frames-benchmark")
-        dataset = dataset.shuffle() if RANDOMIZE else dataset
         # Use test split for evaluation. Sample and shuffle dataset if configured
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
         return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
 
     except Exception as e:
@@ -48,6 +59,49 @@
         return None
 
 
+def load_simpleqa_dataset():
+    """
+    Load the OpenAI SimpleQA benchmark dataset from their public bucket.
+
+    SimpleQA is a dataset of moderately difficult questions for 2024-era models to answer across various topics.
+    It contains ~4000 human-vetted questions and answers with additional metadata.
+    Its usage can be seen in the openai/simple-evals GitHub repository as well.
+
+    ### Data Fields
+    - problem: The question to be answered
+    - answer: The ground truth answer
+    - metadata: Additional metadata including topic information
+    """
+
+    try:
+        # Load SimpleQA benchmark from OpenAI public bucket
+        raw_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
+        response = requests.get(raw_url)
+        response.raise_for_status()
+
+        # Parse benchmark from raw CSV response
+        csv_data = pd.read_csv(StringIO(response.text))
+        # Normalize it into FRAMES format
+        formatted_data = [
+            {
+                "Prompt": d["problem"],
+                "Answer": d["answer"],
+                "reasoning_types": json.loads(d["metadata"].replace("'", '"'))["topic"],
+            }
+            for d in csv_data.to_dict("records")
+        ]
+
+        # Convert benchmark to HF Dataset
+        dataset = Dataset.from_list(formatted_data)
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
+        dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
+
+        return dataset
+    except Exception as e:
+        logger.error(f"Error loading simpleqa dataset: {e}")
+        return None
+
+
 def get_agent_response(prompt: str) -> str:
     """Get response from the Khoj API"""
     try:
@@ -176,7 +230,7 @@ def parse_args():
         "--dataset",
         "-d",
         default="frames",
-        choices=["frames"],
+        choices=["frames", "simpleqa"],
         help="Dataset to use for evaluation (default: frames)",
     )
     return parser.parse_args()
@@ -188,9 +242,11 @@ def main():
     dataset = None
 
     # Load dataset
-    with timer(f"Loaded {args.dataset} dataset in", logger):
+    with timer(f"Loaded {args.dataset} dataset in", logger, log_level=logging.INFO):
         if args.dataset == "frames":
             dataset = load_frames_dataset()
+        elif args.dataset == "simpleqa":
+            dataset = load_simpleqa_dataset()
 
         if dataset is None:
             return
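For reviewers who want to sanity-check the CSV-to-FRAMES normalization outside the eval harness, a minimal standalone sketch follows. It mirrors the logic added in load_simpleqa_dataset() above (URL and column names taken from the patch); the single-to-double quote replacement is the same shortcut the patch uses and assumes the metadata values contain no embedded apostrophes.

# Standalone sketch: reproduce the SimpleQA -> FRAMES normalization done by
# load_simpleqa_dataset() for a quick manual check. Requires network access to
# the public OpenAI bucket referenced in the patch.
import json
from io import StringIO

import pandas as pd
import requests

url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
response = requests.get(url)
response.raise_for_status()

rows = pd.read_csv(StringIO(response.text)).to_dict("records")
frames_style = [
    {
        "Prompt": row["problem"],
        "Answer": row["answer"],
        # metadata is stored as a Python-literal-style string, e.g. "{'topic': 'Science', ...}"
        "reasoning_types": json.loads(row["metadata"].replace("'", '"'))["topic"],
    }
    for row in rows
]
print(len(frames_style), frames_style[0]["reasoning_types"])

With the patch applied, the same data is pulled through the eval script itself via the new flag, e.g. python tests/eval_frames.py --dataset simpleqa.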