# khoj/.github/workflows/run_evals.yml

# Workflow display name.
name: eval

# Triggers: every pushed tag, plus manual runs with tunable inputs.
on:
  # Run on every release
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number
      sandbox:
        description: 'Code sandbox to use'
        required: false
        default: 'terrarium'
        type: choice
        options:
          - terrarium
          - e2b
      chat_model:
        description: 'Chat model to use'
        required: false
        default: 'gemini-2.0-flash'
        type: string
      max_research_iterations:
        description: 'Maximum number of iterations in research mode'
        required: false
        default: 5
        type: number
      # Plain string input, not a repository secret; the job masks it in logs
      # in its first scripted step.
      openai_api_key:
        description: 'OpenAI API key'
        required: false
        default: ''
        type: string
      openai_base_url:
        description: 'Base URL of OpenAI compatible API'
        required: false
        default: 'https://api.openai.com/v1'
        type: string
      # The two flags below are string choices rather than booleans: the job
      # reads them as `inputs.<name> || <default>`, where a boolean false
      # would fall through to the default.
      auto_read_webpage:
        description: 'Auto read webpage on online search'
        required: false
        default: 'false'
        type: choice
        options:
          - 'false'
          - 'true'
      randomize:
        description: 'Randomize the sample of questions'
        required: false
        default: 'true'
        type: choice
        options:
          - 'false'
          - 'true'

jobs:
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Manual runs evaluate only the selected mode and dataset; any other
        # trigger fans out over the fallback lists after `||`.
        # The folded scalars below parse to the same single-line expressions.
        khoj_mode: >-
          ${{ github.event_name == 'workflow_dispatch'
          && fromJSON(format('["{0}"]', inputs.khoj_mode))
          || fromJSON('["general", "default", "research"]') }}
        dataset: >-
          ${{ github.event_name == 'workflow_dispatch'
          && fromJSON(format('["{0}"]', inputs.dataset))
          || fromJSON('["frames", "gpqa"]') }}

    # Database service container started alongside the job.
    services:
      postgres:
        image: ankane/pgvector
        ports:
          - "5432:5432"
        env:
          POSTGRES_DB: postgres
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        # Health check: run pg_isready every 10s, 5s timeout, up to 5 retries.
        options: "--health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5"

    steps:
      # Check out the repository with full history (fetch-depth 0 disables
      # shallow cloning).
      # NOTE(review): presumably needed so the version lookup can see git
      # tags — confirm.
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0

      # Version is quoted so YAML keeps it as the string "3.10", not 3.1.
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      # Masks the optional OpenAI key input, exports it for later steps, and
      # publishes the app version as the `version` step output.
      - name: Get App Version
        id: hatch
        run: |
          # Mask relevant workflow inputs as secret early.
          # `// empty` makes jq print nothing instead of the literal "null"
          # when the event has no inputs (e.g. tag pushes), so a bogus
          # OPENAI_API_KEY=null is never masked or exported.
          OPENAI_API_KEY=$(jq -r '.inputs.openai_api_key // empty' "$GITHUB_EVENT_PATH")
          if [ -n "$OPENAI_API_KEY" ]; then
            echo "::add-mask::$OPENAI_API_KEY"
            echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> "$GITHUB_ENV"
          fi

          # Get app version from hatch
          echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

      # Installs system packages, upgrades pip, and fetches the terrarium
      # code sandbox into ./terrarium.
      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # One command per line with errexit on: inside an `a && b` chain a
          # failure of `a` does not abort the script, so a failed
          # `apt update` or `ensurepip` used to be silently skipped.
          set -e

          # install dependencies
          sudo apt update
          sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6

          # upgrade pip
          python -m ensurepip --upgrade
          python -m pip install --upgrade pip

          # install terrarium for code sandbox
          git clone https://github.com/khoj-ai/terrarium.git
          cd terrarium
          npm install --legacy-peer-deps
          mkdir pyodide_cache

      # Installs the app with its dev extras after pinning a static version.
      - name: ⬇️ Install Application
        run: |
          # Swap the dynamic version declaration in pyproject.toml for the
          # version resolved by the `hatch` step.
          sed -i \
            's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' \
            pyproject.toml
          # Install the project together with its dev extras.
          pip install --upgrade .[dev]

      # Starts the Khoj server and the terrarium sandbox, waits for the
      # server health endpoint, then runs the eval script for this matrix
      # dataset.
      - name: 📝 Run Eval
        env:
          # Manual runs take most settings from the dispatch inputs; any other
          # trigger uses the fallback value after `||`.
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.randomize || 'true' }}
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.0-flash' }}
          # NOTE(review): fallback here is 10 while the dispatch input
          # defaults to 5 — confirm the difference is intended.
          KHOJ_RESEARCH_ITERATIONS: ${{ github.event_name == 'workflow_dispatch' && inputs.max_research_iterations || 10 }}
          KHOJ_AUTO_READ_WEBPAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.auto_read_webpage || 'false' }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          OPENAI_BASE_URL: ${{ github.event_name == 'workflow_dispatch' && inputs.openai_base_url || 'https://api.openai.com/v1' }}
          # The next three API keys are blanked when the dataset is math500.
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || ''}}
          FIRECRAWL_API_KEY: ${{ matrix.dataset != 'math500' && secrets.FIRECRAWL_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # The E2B key is only passed when the e2b sandbox input is selected.
          E2B_API_KEY: ${{ inputs.sandbox == 'e2b' && secrets.E2B_API_KEY || '' }}
          E2B_TEMPLATE: ${{ vars.E2B_TEMPLATE }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          # Connection settings matching the postgres service container.
          POSTGRES_HOST: localhost
          POSTGRES_PORT: 5432
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          USE_EMBEDDED_DB: "true"
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &

          # Start code sandbox
          npm install -g pm2
          NODE_ENV=production npm run ci --prefix terrarium

          # Wait for server to be ready
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done

          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      # Runs even when an earlier step failed, so partial results are kept.
      - name: Upload Results
        uses: actions/upload-artifact@v4
        if: ${{ always() }}
        with:
          # Per-sample CSV results plus the text summary.
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt
          # One artifact per version / mode / dataset combination.
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}

      # Publishes the evaluation summary to the job summary page and the
      # step log. Runs even if the eval step failed.
      - name: Display Results
        if: always()
        env:
          # Context values go through the environment rather than being
          # interpolated into the script, so free-form inputs such as
          # chat_model cannot inject shell code.
          DATASET: ${{ matrix.dataset }}
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          KHOJ_VERSION: ${{ steps.hatch.outputs.version }}
          CHAT_MODEL: ${{ inputs.chat_model || 'gemini-2.0-flash' }}
          CODE_SANDBOX: ${{ inputs.sandbox || 'terrarium' }}
        run: |
          # The eval may have failed before writing a summary; say so
          # instead of failing on an unmatched glob.
          shopt -s nullglob
          summaries=(*_evaluation_summary_*.txt)
          if [ "${#summaries[@]}" -eq 0 ]; then
            echo "No evaluation summary found for $DATASET in $KHOJ_MODE mode" | tee -a "$GITHUB_STEP_SUMMARY"
            exit 0
          fi

          # Read and display summary
          {
            echo "## Evaluation Summary of Khoj on $DATASET in $KHOJ_MODE mode"
            echo "**$(head -n 1 "${summaries[@]}")**"
            echo "- Khoj Version: $KHOJ_VERSION"
            echo "- Chat Model: $CHAT_MODEL"
            echo "- Code Sandbox: $CODE_SANDBOX"
            echo '```'
            tail -n +2 "${summaries[@]}"
            echo ""
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat "${summaries[@]}"