mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Deterministically shuffle dataset for consistent data in an eval run
Previously, eval runs across modes would use different dataset shuffles. This change enables a strict apples-to-apples performance comparison of the different khoj modes across the same (random) subset of questions by using a dataset seed per workflow run to sample questions.
This commit is contained in:
8
.github/workflows/run_evals.yml
vendored
8
.github/workflows/run_evals.yml
vendored
@@ -76,6 +76,11 @@ on:
|
|||||||
options:
|
options:
|
||||||
- 'false'
|
- 'false'
|
||||||
- 'true'
|
- 'true'
|
||||||
|
dataset_seed:
|
||||||
|
description: 'Seed to deterministically shuffle questions'
|
||||||
|
required: false
|
||||||
|
default: ''
|
||||||
|
type: string
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
eval:
|
eval:
|
||||||
@@ -149,8 +154,9 @@ jobs:
|
|||||||
SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
|
SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
|
||||||
BATCH_SIZE: "20"
|
BATCH_SIZE: "20"
|
||||||
RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
|
RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
|
||||||
KHOJ_URL: "http://localhost:42110"
|
DATASET_SEED: ${{ github.event_name == 'workflow_dispatch' && inputs.dataset_seed || github.run_id }}
|
||||||
KHOJ_LLM_SEED: "42"
|
KHOJ_LLM_SEED: "42"
|
||||||
|
KHOJ_URL: "http://localhost:42110"
|
||||||
KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.5-flash' }}
|
KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.5-flash' }}
|
||||||
KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
|
KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
|
||||||
KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
|
KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
|||||||
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-2.5-flash")
|
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-2.5-flash")
|
||||||
|
|
||||||
LLM_SEED = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
LLM_SEED = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
||||||
|
DATASET_SEED = int(os.getenv("DATASET_SEED")) if os.getenv("DATASET_SEED") else None
|
||||||
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
|
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
|
||||||
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
|
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
|
||||||
BATCH_SIZE = int(
|
BATCH_SIZE = int(
|
||||||
@@ -196,7 +197,7 @@ def load_frames_dataset():
|
|||||||
try:
|
try:
|
||||||
dataset = load_dataset("google/frames-benchmark")
|
dataset = load_dataset("google/frames-benchmark")
|
||||||
# Use test split for evaluation. Sample and shuffle dataset if configured
|
# Use test split for evaluation. Sample and shuffle dataset if configured
|
||||||
dataset = dataset.shuffle() if RANDOMIZE else dataset
|
dataset = dataset.shuffle(seed=DATASET_SEED) if RANDOMIZE else dataset
|
||||||
return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
|
return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -238,7 +239,7 @@ def load_simpleqa_dataset():
|
|||||||
|
|
||||||
# Convert benchmark to HF Dataset
|
# Convert benchmark to HF Dataset
|
||||||
dataset = Dataset.from_list(formatted_data)
|
dataset = Dataset.from_list(formatted_data)
|
||||||
dataset = dataset.shuffle() if RANDOMIZE else dataset
|
dataset = dataset.shuffle(seed=DATASET_SEED) if RANDOMIZE else dataset
|
||||||
dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
|
dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
@@ -275,8 +276,11 @@ def load_gpqa_dataset():
|
|||||||
row["Incorrect Answer 3"],
|
row["Incorrect Answer 3"],
|
||||||
row["Correct Answer"],
|
row["Correct Answer"],
|
||||||
]
|
]
|
||||||
# Shuffle choices
|
# Shuffle choices with deterministic seed if provided
|
||||||
random.shuffle(choices)
|
if DATASET_SEED is not None:
|
||||||
|
random.Random(DATASET_SEED).shuffle(choices)
|
||||||
|
else:
|
||||||
|
random.shuffle(choices)
|
||||||
|
|
||||||
# Get correct answer letter
|
# Get correct answer letter
|
||||||
correct_index = choices.index(row["Correct Answer"])
|
correct_index = choices.index(row["Correct Answer"])
|
||||||
@@ -307,7 +311,7 @@ D) {choices[3]}
|
|||||||
dataset = dataset.add_column("Answer", [p[1] for p in prompts_and_answers])
|
dataset = dataset.add_column("Answer", [p[1] for p in prompts_and_answers])
|
||||||
|
|
||||||
# Sample and shuffle dataset if configured
|
# Sample and shuffle dataset if configured
|
||||||
dataset = dataset.shuffle() if RANDOMIZE else dataset
|
dataset = dataset.shuffle(seed=DATASET_SEED) if RANDOMIZE else dataset
|
||||||
dataset = dataset[: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset
|
dataset = dataset[: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
@@ -331,7 +335,7 @@ def load_math500_dataset():
|
|||||||
# Load the MATH500 dataset from HuggingFace
|
# Load the MATH500 dataset from HuggingFace
|
||||||
dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")
|
dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")
|
||||||
dataset = dataset.rename_columns({"problem": "Prompt", "answer": "Answer", "subject": "reasoning_types"})
|
dataset = dataset.rename_columns({"problem": "Prompt", "answer": "Answer", "subject": "reasoning_types"})
|
||||||
dataset = dataset.shuffle() if RANDOMIZE else dataset
|
dataset = dataset.shuffle(seed=DATASET_SEED) if RANDOMIZE else dataset
|
||||||
dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
|
dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
|
|||||||
Reference in New Issue
Block a user