From 911e1bf981b41ba30f7393d62f68b07e2a882274 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 4 Apr 2025 00:08:48 +0530 Subject: [PATCH] Use Gemini 2.0 Flash as evaluator. Set seed for it to reduce eval variance. The Gemini 2.0 Flash model is cheaper and better than Gemini 1.5 Pro. --- tests/evals/eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/evals/eval.py b/tests/evals/eval.py index e9d56f03..f968c0a3 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -37,8 +37,9 @@ KHOJ_API_KEY = os.getenv("KHOJ_API_KEY") KHOJ_MODE = os.getenv("KHOJ_MODE", "default").lower() # E.g research, general, notes etc. GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") -GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002") +GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-2.0-flash-001") +LLM_SEED = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples BATCH_SIZE = int( @@ -469,7 +470,7 @@ def evaluate_response_with_gemini( headers={"Content-Type": "application/json"}, json={ "contents": [{"parts": [{"text": evaluation_prompt}]}], - "generationConfig": {"response_mime_type": "application/json"}, + "generationConfig": {"response_mime_type": "application/json", "seed": LLM_SEED}, }, ) response.raise_for_status()