Files
khoj/.github/workflows/run_evals.yml
Debanjum 22cd638add Fix handling unset openai_base_url to run eval with openai chat models
The github run_eval workflow sets OPENAI_BASE_URL to empty string.

The ai model api created during initialization for openai models gets
set to empty string rather than None or the actual openai base url

This tries to call llm at to empty string base url instead of the
default openai api base url, which obviously fails.

Fix is to map empty base url's to the actual openai api base url.
2025-05-19 16:19:43 -07:00

218 lines
7.7 KiB
YAML

name: eval
on:
# Run on every release
push:
tags:
- "*"
# Allow manual triggers from GitHub UI
workflow_dispatch:
inputs:
khoj_mode:
description: 'Khoj Mode (general/default/research)'
required: true
default: 'default'
type: choice
options:
- general
- default
- research
dataset:
description: 'Dataset to evaluate (frames/simpleqa)'
required: true
default: 'frames'
type: choice
options:
- frames
- simpleqa
- gpqa
- math500
sample_size:
description: 'Number of samples to evaluate'
required: false
default: 200
type: number
sandbox:
description: 'Code sandbox to use'
required: false
default: 'terrarium'
type: choice
options:
- terrarium
- e2b
chat_model:
description: 'Chat model to use'
required: false
default: 'gemini-2.0-flash'
type: string
max_research_iterations:
description: 'Maximum number of iterations in research mode'
required: false
default: 5
type: number
openai_api_key:
description: 'OpenAI API key'
required: false
default: ''
type: string
openai_base_url:
description: 'Base URL of OpenAI compatible API'
required: false
default: 'https://api.openai.com/v1'
type: string
auto_read_webpage:
description: 'Auto read webpage on online search'
required: false
default: 'false'
type: choice
options:
- 'false'
- 'true'
randomize:
description: 'Randomize the sample of questions'
required: false
default: 'true'
type: choice
options:
- 'false'
- 'true'
jobs:
eval:
runs-on: ubuntu-latest
strategy:
matrix:
# Use input from manual trigger if available, else run all combinations
khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "gpqa"]') }}
services:
postgres:
image: ankane/pgvector
env:
POSTGRES_PASSWORD: postgres
POSTGRES_USER: postgres
POSTGRES_DB: postgres
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Get App Version
id: hatch
run: |
# Mask relevant workflow inputs as secret early
OPENAI_API_KEY=$(jq -r '.inputs.openai_api_key' $GITHUB_EVENT_PATH)
echo ::add-mask::$OPENAI_API_KEY
echo OPENAI_API_KEY="$OPENAI_API_KEY" >> $GITHUB_ENV
# Get app version from hatch
echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT
- name: ⏬️ Install Dependencies
env:
DEBIAN_FRONTEND: noninteractive
run: |
# install dependencies
sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
# upgrade pip
python -m ensurepip --upgrade && python -m pip install --upgrade pip
# install terrarium for code sandbox
git clone https://github.com/khoj-ai/terrarium.git && cd terrarium && npm install --legacy-peer-deps && mkdir pyodide_cache
- name: ⬇️ Install Application
run: |
sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
pip install --upgrade .[dev]
- name: 📝 Run Eval
env:
KHOJ_MODE: ${{ matrix.khoj_mode }}
SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
BATCH_SIZE: "20"
RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
KHOJ_URL: "http://localhost:42110"
KHOJ_LLM_SEED: "42"
KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.0-flash' }}
KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
OPENAI_BASE_URL: ${{ github.event_name == 'workflow_dispatch' && inputs.openai_base_url || 'https://api.openai.com/v1' }}
SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || ''}}
FIRECRAWL_API_KEY: ${{ matrix.dataset != 'math500' && secrets.FIRECRAWL_API_KEY || '' }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
E2B_API_KEY: ${{ inputs.sandbox == 'e2b' && secrets.E2B_API_KEY || '' }}
E2B_TEMPLATE: ${{ vars.E2B_TEMPLATE }}
KHOJ_ADMIN_EMAIL: khoj
KHOJ_ADMIN_PASSWORD: khoj
POSTGRES_HOST: localhost
POSTGRES_PORT: 5432
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
USE_EMBEDDED_DB: "true"
KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
run: |
# Start Khoj server in background
khoj --anonymous-mode --non-interactive &
# Start code sandbox
npm install -g pm2
NODE_ENV=production npm run ci --prefix terrarium
# Wait for server to be ready
timeout=120
while ! curl -s http://localhost:42110/api/health > /dev/null; do
if [ $timeout -le 0 ]; then
echo "Timed out waiting for Khoj server"
exit 1
fi
echo "Waiting for Khoj server..."
sleep 2
timeout=$((timeout-2))
done
# Run evals
python tests/evals/eval.py -d ${{ matrix.dataset }}
- name: Upload Results
if: always() # Upload results even if tests fail
uses: actions/upload-artifact@v4
with:
name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
path: |
*_evaluation_results_*.csv
*_evaluation_summary_*.txt
- name: Display Results
if: always()
run: |
# Read and display summary
echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY
echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
echo "- Chat Model: ${{ inputs.chat_model || 'gemini-2.0-flash' }}" >> $GITHUB_STEP_SUMMARY
echo "- Code Sandbox: ${{ inputs.sandbox || 'terrarium' }}" >> $GITHUB_STEP_SUMMARY
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
# Display in logs too
echo "===== EVALUATION RESULTS ====="
cat *_evaluation_summary_*.txt