Deterministically shuffle dataset for consistent data in an eval run

Previously, eval runs across modes would use different dataset shuffles. This change enables a strict apples-to-apples perf comparison of the different Khoj modes across the same (random) subset of questions by using a dataset seed per workflow run to sample questions.
2026-03-02 21:19:12 +00:00 · 2025-08-31 13:51:43 -07:00
parent edf9ea6312
commit 703e189979
2 changed files with 17 additions and 7 deletions
--- a/.github/workflows/run_evals.yml
+++ b/.github/workflows/run_evals.yml
@@ -76,6 +76,11 @@ on:
        options:
          - 'false'
          - 'true'
+      dataset_seed:
+        description: 'Seed to deterministically shuffle questions'
+        required: false
+        default: ''
+        type: string

 jobs:
  eval:
@@ -149,8 +154,9 @@ jobs:
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
-          KHOJ_URL: "http://localhost:42110"
+          DATASET_SEED: ${{ github.event_name == 'workflow_dispatch' && inputs.dataset_seed || github.run_id }}
          KHOJ_LLM_SEED: "42"
+          KHOJ_URL: "http://localhost:42110"
          KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.5-flash' }}
          KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
          KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}