diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml
index f7578253..fde3489f 100644
--- a/.github/workflows/run_evals.yml
+++ b/.github/workflows/run_evals.yml
@@ -25,6 +25,7 @@ on:
         options:
           - frames
           - simpleqa
+          - gpqa
       sample_size:
         description: 'Number of samples to evaluate'
         required: false
@@ -97,6 +98,7 @@ jobs:
       GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }}
       OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
       KHOJ_ADMIN_EMAIL: khoj
       KHOJ_ADMIN_PASSWORD: khoj
       POSTGRES_HOST: localhost
diff --git a/tests/evals/eval.py b/tests/evals/eval.py
index 30c842b2..0e715d61 100644
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@@ -3,6 +3,7 @@ import concurrent.futures
 import json
 import logging
 import os
+import re
 import time
 from datetime import datetime
 from io import StringIO
@@ -24,13 +25,10 @@ logger = logging.getLogger(__name__)
 KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
 KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
 KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
-KHOJ_MODE = os.getenv("KHOJ_MODE", "default")  # E.g research, general, notes etc.
+KHOJ_MODE = os.getenv("KHOJ_MODE", "default").lower()  # E.g research, general, notes etc.

 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
-GEMINI_API_URL = (
-    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_EVAL_MODEL}:generateContent?key={GEMINI_API_KEY}"
-)

 SAMPLE_SIZE = os.getenv("SAMPLE_SIZE")  # Number of examples to evaluate
 RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true"  # Randomize examples
@@ -128,6 +126,75 @@ def load_simpleqa_dataset():
         return None


+def load_gpqa_dataset():
+    """
+    Load the GPQA benchmark dataset from HuggingFace
+
+    GPQA (Graduate-Level Google-Proof Q&A) is a benchmark to evaluate the reasoning capabilities of agents.
+    Its diamond subset contains ~200 expert-written multiple-choice questions in biology, physics and chemistry.
+
+    ### Data Fields
+    - Prompt: The question to be answered
+    - Answer: The ground truth answer
+    - reasoning_types: The type of reasoning required to answer the question
+    """
+    import random
+
+    def format_multiple_choice_question(row: Dict) -> tuple[str, str]:
+        """
+        Create GPQA multi-choice prompt from shuffled answer choices and question.
+        Refer: https://github.com/openai/simple-evals/blob/a8e85cc8a5dea497d915f870895250e07f9cc737/common.py#L12
+
+        Returns formatted prompt and correct answer letter.
+        """
+        # Gather choices
+        choices = [
+            row["Incorrect Answer 1"],
+            row["Incorrect Answer 2"],
+            row["Incorrect Answer 3"],
+            row["Correct Answer"],
+        ]
+        # Shuffle choices
+        random.shuffle(choices)
+
+        # Get correct answer letter
+        correct_index = choices.index(row["Correct Answer"])
+        correct_letter = "ABCD"[correct_index]
+
+        prompt = f"""
+Answer the following multiple choice question. Answer should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{row["Question"]}
+
+A) {choices[0]}
+B) {choices[1]}
+C) {choices[2]}
+D) {choices[3]}
+        """.strip()
+
+        return prompt, correct_letter
+
+    try:
+        dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
+
+        # Create multi-choice q&a prompt from choices and correct answer
+        prompts_and_answers = [format_multiple_choice_question(row) for row in dataset]
+
+        # Normalize dataset to FRAMES format
+        dataset = dataset.rename_columns({"Subdomain": "reasoning_types"})
+        dataset = dataset.add_column("Prompt", [p[0] for p in prompts_and_answers])
+        dataset = dataset.add_column("Answer", [p[1] for p in prompts_and_answers])
+
+        # Sample and shuffle dataset if configured
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
+        dataset = dataset[: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset
+
+        return dataset
+    except Exception as e:
+        logger.error(f"Error loading dataset: {e}")
+        return None
+
+
 def get_agent_response(prompt: str) -> Dict[str, Any]:
     """Get response from the Khoj API"""
     # Set headers
@@ -152,7 +219,30 @@ def get_agent_response(prompt: str) -> Dict[str, Any]:
         return {"response": "", "usage": {}}


-def evaluate_response(query: str, agent_response: str, ground_truth: str) -> tuple[bool | None, str, float]:
+def evaluate_response_with_mcq_match(
+    query: str, agent_response: str, ground_truth: str
+) -> tuple[bool | None, str, float]:
+    """Evaluate Khoj response against benchmark ground truth using string matching"""
+    try:
+        # Extract answer from agent response
+        answer_pattern_multichoice = r"(?i)Answer\s*:\s*([A-D])"
+        match = re.search(answer_pattern_multichoice, agent_response)
+        extracted_answer = match.group(1) if match else None
+
+        # Check if extracted answer matches ground truth
+        decision = extracted_answer == ground_truth
+        explanation = f"Agent response {'matches' if decision else 'does not match'} ground truth {ground_truth}"
+
+        # Return decision, explanation and cost in structured form
+        return decision, explanation, 0.0
+    except Exception as e:
+        logger.error(f"Error in evaluation: {e}")
+        return None, f"Evaluation failed: {str(e)}", 0.0
+
+
+def evaluate_response_with_gemini(
+    query: str, agent_response: str, ground_truth: str, eval_model=GEMINI_EVAL_MODEL
+) -> tuple[bool | None, str, float]:
     """Evaluate Khoj response against benchmark ground truth using Gemini"""
     evaluation_prompt = f"""
     Compare the following agent response with the ground truth answer.
@@ -166,10 +256,13 @@
     Provide your evaluation in the following json format:
     {"explanation:" "[How you made the decision?)", "decision:" "(TRUE if response contains key information, FALSE otherwise)"}
     """

+    gemini_api_url = (
+        f"https://generativelanguage.googleapis.com/v1beta/models/{eval_model}:generateContent?key={GEMINI_API_KEY}"
+    )
     try:
         response = requests.post(
-            GEMINI_API_URL,
+            gemini_api_url,
             headers={"Content-Type": "application/json"},
             json={
                 "contents": [{"parts": [{"text": evaluation_prompt}]}],
@@ -182,7 +275,7 @@
         # Update cost of evaluation
         input_tokens = response_json["usageMetadata"]["promptTokenCount"]
         ouput_tokens = response_json["usageMetadata"]["candidatesTokenCount"]
-        cost = get_cost_of_chat_message(GEMINI_EVAL_MODEL, input_tokens, ouput_tokens)
+        cost = get_cost_of_chat_message(eval_model, input_tokens, ouput_tokens)

         # Parse evaluation response
         eval_response: dict[str, str] = json.loads(
@@ -200,7 +293,7 @@
         return None, f"Evaluation failed: {str(e)}", 0.0


-def process_batch(batch, batch_start, results, dataset_length):
+def process_batch(batch, batch_start, results, dataset_length, response_evaluator):
     global running_cost
     for idx, (prompt, answer, reasoning_type) in enumerate(batch):
         current_index = batch_start + idx
@@ -219,7 +312,7 @@
             decision = None
             explanation = "Agent response is empty. This maybe due to a service error."
         else:
-            decision, explanation, eval_cost = evaluate_response(prompt, agent_response, answer)
+            decision, explanation, eval_cost = response_evaluator(prompt, agent_response, answer)

         # Store results
         results.append(
@@ -292,7 +385,7 @@
         "--dataset",
         "-d",
         default="frames",
-        choices=["frames", "simpleqa"],
+        choices=["frames", "simpleqa", "gpqa"],
         help="Dataset to use for evaluation (default: frames)",
     )
     return parser.parse_args()
@@ -309,12 +402,18 @@
         dataset = load_frames_dataset()
     elif args.dataset == "simpleqa":
         dataset = load_simpleqa_dataset()
+    elif args.dataset == "gpqa":
+        dataset = load_gpqa_dataset()
     if dataset is None:
         return

     # Initialize variables
     results = []
     dataset_length = len(dataset["Prompt"])
+    if args.dataset == "gpqa":
+        response_evaluator = evaluate_response_with_mcq_match
+    else:
+        response_evaluator = evaluate_response_with_gemini

     # Process examples in batches
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -326,7 +425,9 @@
                 dataset["Answer"][i : i + BATCH_SIZE],
                 dataset["reasoning_types"][i : i + BATCH_SIZE],
             )
-            futures.append(executor.submit(process_batch, batch, batch_start, results, dataset_length))
+            futures.append(
+                executor.submit(process_batch, batch, batch_start, results, dataset_length, response_evaluator)
+            )

         # Wait for all futures to complete
         concurrent.futures.wait(futures)
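For reference, a minimal standalone sketch of how the new multiple-choice grader in `evaluate_response_with_mcq_match` extracts and matches an answer letter. The regex is the one added in `tests/evals/eval.py`; the `grade_mcq` helper name and the sample agent response are hypothetical, for illustration only.

```python
import re

# Answer-extraction pattern used by evaluate_response_with_mcq_match in tests/evals/eval.py
ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])"


def grade_mcq(agent_response: str, ground_truth: str) -> bool:
    """Return True if the response contains 'Answer: <LETTER>' matching the ground truth letter."""
    match = re.search(ANSWER_PATTERN_MULTICHOICE, agent_response)
    extracted_answer = match.group(1) if match else None
    return extracted_answer == ground_truth


# Hypothetical agent output, for illustration only
sample_response = "The decay is mediated by the weak interaction.\n\nAnswer: C"
print(grade_mcq(sample_response, "C"))  # True
print(grade_mcq(sample_response, "B"))  # False
```

Note that the pattern is case-insensitive while the ground-truth letters are uppercase, so a response like `answer: c` is extracted as `c` and counted as a mismatch by the exact string comparison.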