From 6cc5a10b0972ec69c319d3efdd3cb13c1c7d759a Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 22 Mar 2025 08:05:12 +0530 Subject: [PATCH] Disable SimpleQA eval on release as it is saturated & low signal for our use-case We reach >94% in research mode on SimpleQA. When answers can be researched online, the eval becomes too easy. The FRAMES eval does a more thorough job of evaluating that use-case anyway. --- .github/workflows/run_evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index e39a2d9a..2d075215 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -58,7 +58,7 @@ jobs: matrix: # Use input from manual trigger if available, else run all combinations khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }} - dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa", "gpqa"]') }} + dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "gpqa"]') }} services: postgres: