From d9d58849581883675edb9bbe8815705d6468ea04 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Thu, 14 Nov 2024 15:55:00 -0800
Subject: [PATCH] Enable evaluating Khoj on the OpenAI SimpleQA bench using eval script

- Load the raw csv from the OpenAI public bucket. Normalize it into FRAMES format
- Improve docstring for the frames dataset loader as well
- Log the dataset load perf timer at info level
---
 tests/eval_frames.py | 64 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/tests/eval_frames.py b/tests/eval_frames.py
index 7788322d..8716e4d1 100644
--- a/tests/eval_frames.py
+++ b/tests/eval_frames.py
@@ -5,6 +5,7 @@ import logging
 import os
 import time
 from datetime import datetime
+from io import StringIO
 from typing import Any, Dict
 
 import pandas as pd
@@ -36,11 +37,21 @@ SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting
 
 
 def load_frames_dataset():
-    """Load the FRAMES benchmark dataset from HuggingFace"""
+    """
+    Load the Google FRAMES benchmark dataset from HuggingFace.
+
+    FRAMES is a benchmark dataset to evaluate the retrieval and answering capabilities of agents.
+    It contains ~800 questions requiring multi-hop retrieval and reasoning across various topics.
+
+    ### Data Fields
+    - Prompt: The question to be answered
+    - Answer: The ground truth answer
+    - reasoning_types: The type of reasoning required to answer the question
+    """
     try:
         dataset = load_dataset("google/frames-benchmark")
-        dataset = dataset.shuffle() if RANDOMIZE else dataset
         # Use test split for evaluation. Sample and shuffle dataset if configured
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
         return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
 
     except Exception as e:
@@ -48,6 +59,49 @@
         return None
 
 
+def load_simpleqa_dataset():
+    """
+    Load the OpenAI SimpleQA benchmark dataset from their public bucket.
+
+    SimpleQA is a dataset of moderately difficult questions for 2024-era models to answer across various topics.
+    It contains ~4000 human-vetted questions and answers with additional metadata.
+    Its usage can be seen in the openai/simple-evals GitHub repository as well.
+
+    ### Data Fields
+    - problem: The question to be answered
+    - answer: The ground truth answer
+    - metadata: Additional metadata including topic information
+    """
+
+    try:
+        # Load SimpleQA benchmark from OpenAI public bucket
+        raw_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
+        response = requests.get(raw_url)
+        response.raise_for_status()
+
+        # Parse benchmark from raw CSV response
+        csv_data = pd.read_csv(StringIO(response.text))
+        # Normalize it into FRAMES format
+        formatted_data = [
+            {
+                "Prompt": d["problem"],
+                "Answer": d["answer"],
+                "reasoning_types": json.loads(d["metadata"].replace("'", '"'))["topic"],
+            }
+            for d in csv_data.to_dict("records")
+        ]
+
+        # Convert benchmark to HF Dataset
+        dataset = Dataset.from_list(formatted_data)
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
+        dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
+
+        return dataset
+    except Exception as e:
+        logger.error(f"Error loading simpleqa dataset: {e}")
+        return None
+
+
 def get_agent_response(prompt: str) -> str:
     """Get response from the Khoj API"""
     try:
@@ -176,7 +230,7 @@ def parse_args():
         "--dataset",
         "-d",
         default="frames",
-        choices=["frames"],
+        choices=["frames", "simpleqa"],
         help="Dataset to use for evaluation (default: frames)",
     )
     return parser.parse_args()
@@ -188,9 +242,11 @@ def main():
     dataset = None
 
     # Load dataset
-    with timer(f"Loaded {args.dataset} dataset in", logger):
+    with timer(f"Loaded {args.dataset} dataset in", logger, log_level=logging.INFO):
         if args.dataset == "frames":
             dataset = load_frames_dataset()
+        elif args.dataset == "simpleqa":
+            dataset = load_simpleqa_dataset()
 
         if dataset is None:
             return
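For reviewers who want to sanity-check the CSV-to-FRAMES normalization outside the eval harness, a minimal standalone sketch follows. It mirrors the logic added in load_simpleqa_dataset() above (URL and column names taken from the patch); the single-to-double quote replacement is the same shortcut the patch uses and assumes the metadata values contain no embedded apostrophes.

# Standalone sketch: reproduce the SimpleQA -> FRAMES normalization done by
# load_simpleqa_dataset() for a quick manual check. Requires network access to
# the public OpenAI bucket referenced in the patch.
import json
from io import StringIO

import pandas as pd
import requests

url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
response = requests.get(url)
response.raise_for_status()

rows = pd.read_csv(StringIO(response.text)).to_dict("records")
frames_style = [
    {
        "Prompt": row["problem"],
        "Answer": row["answer"],
        # metadata is stored as a Python-literal-style string, e.g. "{'topic': 'Science', ...}"
        "reasoning_types": json.loads(row["metadata"].replace("'", '"'))["topic"],
    }
    for row in rows
]
print(len(frames_style), frames_style[0]["reasoning_types"])

With the patch applied, the same data is pulled through the eval script itself via the new flag, e.g. python tests/eval_frames.py --dataset simpleqa.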