Add GitHub workflow to quiz Khoj across modes and specified evals (#982)

- Evaluate khoj on random 200 questions from each of google frames and openai simpleqa benchmarks across *general*, *default* and *research* modes - Run eval with Gemini 1.5 Flash as test giver and Gemini 1.5 Pro as test evaluator models - Trigger eval workflow on release or manually - Make dataset, khoj mode and sample size configurable when triggered via manual workflow - Enable Web search, webpage read tools during evaluation
2026-03-02 13:18:18 +00:00 · 2024-11-18 02:19:30 -08:00
parent f75085dc7a
commit 7c0fd71bfd
2 changed files with 127 additions and 3 deletions
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
 KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
 KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
 KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
-KHOJ_MODE = os.getenv("KHOJ_MODE")  # E.g research, general, notes etc.
+KHOJ_MODE = os.getenv("KHOJ_MODE", "default")  # E.g research, general, notes etc.

 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
@@ -32,8 +32,10 @@ GEMINI_API_URL = (

 SAMPLE_SIZE = os.getenv("SAMPLE_SIZE")  # Number of examples to evaluate
 RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true"  # Randomize examples
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10))  # Number of examples to evaluate in parallel
-SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting
+BATCH_SIZE = int(
+    os.getenv("BATCH_SIZE", int(SAMPLE_SIZE) / 10 if SAMPLE_SIZE else 10)
+)  # Examples to evaluate in each batch
+SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1  # Sleep between API calls to avoid rate limiting


 def load_frames_dataset():