Add GitHub workflow to quiz Khoj across modes and specified evals (#982)

- Evaluate khoj on random 200 questions from each of google frames and openai simpleqa benchmarks across *general*, *default* and *research* modes
- Run eval with Gemini 1.5 Flash as test giver and Gemini 1.5 Pro as test evaluator models
- Trigger eval workflow on release or manually
- Make dataset, khoj mode and sample size configurable when triggered via manual workflow
- Enable Web search, webpage read tools during evaluation
This commit is contained in:
Debanjum
2024-11-18 02:19:30 -08:00
committed by GitHub
parent f75085dc7a
commit 7c0fd71bfd
2 changed files with 127 additions and 3 deletions

View File

@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
KHOJ_MODE = os.getenv("KHOJ_MODE") # E.g research, general, notes etc.
KHOJ_MODE = os.getenv("KHOJ_MODE", "default") # E.g research, general, notes etc.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
@@ -32,8 +32,10 @@ GEMINI_API_URL = (
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10)) # Number of examples to evaluate in parallel
SLEEP_SECONDS = 1 # Delay between API calls to avoid rate limiting
BATCH_SIZE = int(
os.getenv("BATCH_SIZE", int(SAMPLE_SIZE) / 10 if SAMPLE_SIZE else 10)
) # Examples to evaluate in each batch
SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1 # Sleep between API calls to avoid rate limiting
def load_frames_dataset():