From f13bdc5135d9b5990f2a11a92b7c649ddaa3aa15 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 13 Feb 2025 17:15:30 +0530 Subject: [PATCH 01/10] Log eval run progress percentage for orientation --- tests/evals/eval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/evals/eval.py b/tests/evals/eval.py index 20a6051e..a1749435 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -1,13 +1,11 @@ import argparse import base64 import concurrent.futures -import hashlib import json import logging import os import re import time -import uuid from datetime import datetime from functools import partial from io import StringIO @@ -553,6 +551,7 @@ def process_batch(batch, batch_start, results, dataset_length, response_evaluato --------- Decision: {colored_decision} Accuracy: {running_accuracy:.2%} +Progress: {running_total_count.get()/dataset_length:.2%} Question: {prompt} Expected Answer: {answer} Agent Answer: {agent_response} From 4a28714a08c19ef50e40116bff5220ca747d86a0 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Tue, 28 Jan 2025 18:46:04 -0800 Subject: [PATCH 02/10] Add retry logic to code execution and add health check to sandbox container --- docker-compose.yml | 5 +++++ src/khoj/processor/tools/run_code.py | 26 +++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 22371182..ea0b603b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,11 @@ services: retries: 5 sandbox: image: ghcr.io/khoj-ai/terrarium:latest + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8080/health"] + interval: 30s + timeout: 10s + retries: 2 search: image: docker.io/searxng/searxng:latest volumes: diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index 6c1eb125..5a38e0f4 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -1,8 +1,10 @@ +import asyncio import base64 
import datetime import logging import mimetypes import os +from functools import wraps from pathlib import Path from typing import Any, Callable, List, NamedTuple, Optional @@ -144,6 +146,28 @@ async def generate_python_code( return GeneratedCode(code, input_files, input_links) +def async_retry_with_backoff(retries=3, backoff_in_seconds=1): + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + retry_count = 0 + while retry_count < retries: + try: + return await func(*args, **kwargs) + except (aiohttp.ClientError, asyncio.TimeoutError) as e: + retry_count += 1 + if retry_count == retries: + raise e + wait_time = backoff_in_seconds * (2 ** (retry_count - 1)) # exponential backoff + await asyncio.sleep(wait_time) + return await func(*args, **kwargs) + + return wrapper + + return decorator + + +@async_retry_with_backoff(retries=3, backoff_in_seconds=1) async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_url: str = SANDBOX_URL) -> dict[str, Any]: """ Takes code to run as a string and calls the terrarium API to execute it. 
@@ -157,7 +181,7 @@ async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_ur data = {"code": cleaned_code, "files": input_data} async with aiohttp.ClientSession() as session: - async with session.post(sandbox_url, json=data, headers=headers) as response: + async with session.post(sandbox_url, json=data, headers=headers, timeout=30) as response: if response.status == 200: result: dict[str, Any] = await response.json() result["code"] = cleaned_code From ecc2f795717e39fe02b44cf4aa82b72488fc3fe0 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 12 Feb 2025 23:09:11 +0530 Subject: [PATCH 03/10] Use tenacity retry decorator to retry executing code in sandbox --- src/khoj/processor/tools/run_code.py | 46 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index 5a38e0f4..ac1463ab 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -4,11 +4,18 @@ import datetime import logging import mimetypes import os -from functools import wraps from pathlib import Path from typing import Any, Callable, List, NamedTuple, Optional import aiohttp +from httpx import RemoteProtocolError +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_random_exponential, +) from khoj.database.adapters import FileObjectAdapters from khoj.database.models import Agent, FileObject, KhojUser @@ -92,6 +99,8 @@ async def run_code( cleaned_result = truncate_code_context({"cleaned": {"results": result}})["cleaned"]["results"] logger.info(f"Executed Code\n----\n{code}\n----\nResult\n----\n{cleaned_result}\n----") yield {query: {"code": code, "results": result}} + except asyncio.TimeoutError as e: + raise ValueError(f"Failed to run code for {query} with Timeout error: {e}") except Exception as e: raise ValueError(f"Failed to run code for {query} with error: {e}") @@ -146,28 
+155,19 @@ async def generate_python_code( return GeneratedCode(code, input_files, input_links) -def async_retry_with_backoff(retries=3, backoff_in_seconds=1): - def decorator(func): - @wraps(func) - async def wrapper(*args, **kwargs): - retry_count = 0 - while retry_count < retries: - try: - return await func(*args, **kwargs) - except (aiohttp.ClientError, asyncio.TimeoutError) as e: - retry_count += 1 - if retry_count == retries: - raise e - wait_time = backoff_in_seconds * (2 ** (retry_count - 1)) # exponential backoff - await asyncio.sleep(wait_time) - return await func(*args, **kwargs) - - return wrapper - - return decorator - - -@async_retry_with_backoff(retries=3, backoff_in_seconds=1) +@retry( + retry=( + retry_if_exception_type(aiohttp.ClientError) + | retry_if_exception_type(aiohttp.ClientTimeout) + | retry_if_exception_type(asyncio.TimeoutError) + | retry_if_exception_type(ConnectionError) + | retry_if_exception_type(RemoteProtocolError) + ), + wait=wait_random_exponential(min=1, max=5), + stop=stop_after_attempt(3), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, +) async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_url: str = SANDBOX_URL) -> dict[str, Any]: """ Takes code to run as a string and calls the terrarium API to execute it. 
From 701a7be2919ee1457b810d18eb62420df1de68fc Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 14 Feb 2025 13:35:37 +0530 Subject: [PATCH 04/10] Stop code sandbox on request timeout to allow sandbox process restarts --- .github/workflows/run_evals.yml | 2 +- src/khoj/processor/tools/run_code.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 914a0835..6e6f7143 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -114,7 +114,7 @@ jobs: # Start code sandbox npm install -g pm2 - npm run ci --prefix terrarium + NODE_ENV=production npm run ci --prefix terrarium # Wait for server to be ready timeout=120 diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index ac1463ab..62ffde74 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -100,7 +100,13 @@ async def run_code( logger.info(f"Executed Code\n----\n{code}\n----\nResult\n----\n{cleaned_result}\n----") yield {query: {"code": code, "results": result}} except asyncio.TimeoutError as e: - raise ValueError(f"Failed to run code for {query} with Timeout error: {e}") + # Call the sandbox_url/stop GET API endpoint to stop the code sandbox + error = f"Failed to run code for {query} with Timeout error: {e}" + try: + await aiohttp.ClientSession().get(f"{sandbox_url}/stop", timeout=5) + except Exception as e: + error += f"\n\nFailed to stop code sandbox with error: {e}" + raise ValueError(error) except Exception as e: raise ValueError(f"Failed to run code for {query} with error: {e}") From b4183c73337a84b1bf3f6b16f41c1ffb63cf4b51 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 12 Feb 2025 21:42:23 +0530 Subject: [PATCH 05/10] Default to gemini 2.0 flash instead of 1.5 flash on Gemini setup Add price of gemini 2.0 flash for cost calculations --- .github/workflows/run_evals.yml | 2 +- 
documentation/docs/get-started/setup.mdx | 2 +- src/khoj/processor/conversation/google/gemini_chat.py | 10 +++++----- src/khoj/utils/constants.py | 3 ++- tests/conftest.py | 2 +- tests/evals/eval.py | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 6e6f7143..9544b7f3 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -147,7 +147,7 @@ jobs: echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY - echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY + echo "- Chat Model: Gemini 2.0 Flash" >> $GITHUB_STEP_SUMMARY echo "\`\`\`" >> $GITHUB_STEP_SUMMARY tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx index c6cdec42..fb8e9f4c 100644 --- a/documentation/docs/get-started/setup.mdx +++ b/documentation/docs/get-started/setup.mdx @@ -333,7 +333,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu - Add your [Gemini API key](https://aistudio.google.com/app/apikey) - Give the configuration a friendly name like `Gemini`. Do not configure the API base url. 2. Create a new [chat model](http://localhost:42110/server/admin/database/chatmodel/add) - - Set the `chat-model` field to a [Google Gemini chat model](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models). Example: `gemini-1.5-flash`. + - Set the `chat-model` field to a [Google Gemini chat model](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models). Example: `gemini-2.0-flash`. - Set the `model-type` field to `Google`. 
- Set the `ai model api` field to the Gemini AI Model API you created in step 1. diff --git a/src/khoj/processor/conversation/google/gemini_chat.py b/src/khoj/processor/conversation/google/gemini_chat.py index 6fd95ccd..cb25258c 100644 --- a/src/khoj/processor/conversation/google/gemini_chat.py +++ b/src/khoj/processor/conversation/google/gemini_chat.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) def extract_questions_gemini( text, - model: Optional[str] = "gemini-1.5-flash", + model: Optional[str] = "gemini-2.0-flash", conversation_log={}, api_key=None, temperature=0, @@ -132,9 +132,9 @@ def gemini_send_message_to_model( model_kwargs = {} - # Sometimes, this causes unwanted behavior and terminates response early. Disable for now while it's flaky. - # if response_type == "json_object": - # model_kwargs["response_mime_type"] = "application/json" + # This caused unwanted behavior and terminates response early for gemini 1.5 series. Monitor for flakiness with 2.0 series. + if response_type == "json_object" and model in ["gemini-2.0-flash"]: + model_kwargs["response_mime_type"] = "application/json" # Get Response from Gemini return gemini_completion_with_backoff( @@ -154,7 +154,7 @@ def converse_gemini( online_results: Optional[Dict[str, Dict]] = None, code_results: Optional[Dict[str, Dict]] = None, conversation_log={}, - model: Optional[str] = "gemini-1.5-flash", + model: Optional[str] = "gemini-2.0-flash", api_key: Optional[str] = None, temperature: float = 0.2, completion_func=None, diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index b3ff1f97..74c06172 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -18,7 +18,7 @@ default_offline_chat_models = [ "bartowski/Qwen2.5-14B-Instruct-GGUF", ] default_openai_chat_models = ["gpt-4o-mini", "gpt-4o"] -default_gemini_chat_models = ["gemini-1.5-flash", "gemini-1.5-pro"] +default_gemini_chat_models = ["gemini-2.0-flash", "gemini-1.5-pro"] 
default_anthropic_chat_models = ["claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022"] empty_config = { @@ -46,6 +46,7 @@ model_to_cost: Dict[str, Dict[str, float]] = { "gemini-1.5-flash-002": {"input": 0.075, "output": 0.30}, "gemini-1.5-pro": {"input": 1.25, "output": 5.00}, "gemini-1.5-pro-002": {"input": 1.25, "output": 5.00}, + "gemini-2.0-flash": {"input": 0.10, "output": 0.40}, # Anthropic Pricing: https://www.anthropic.com/pricing#anthropic-api_ "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0}, "claude-3-5-haiku-20241022": {"input": 1.0, "output": 5.0}, diff --git a/tests/conftest.py b/tests/conftest.py index 1795b340..e5ab3a8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -315,7 +315,7 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa if chat_provider == ChatModel.ModelType.OPENAI: online_chat_model = ChatModelFactory(name="gpt-4o-mini", model_type="openai") elif chat_provider == ChatModel.ModelType.GOOGLE: - online_chat_model = ChatModelFactory(name="gemini-1.5-flash", model_type="google") + online_chat_model = ChatModelFactory(name="gemini-2.0-flash", model_type="google") elif chat_provider == ChatModel.ModelType.ANTHROPIC: online_chat_model = ChatModelFactory(name="claude-3-5-haiku-20241022", model_type="anthropic") if online_chat_model: diff --git a/tests/evals/eval.py b/tests/evals/eval.py index a1749435..0c95996f 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -629,7 +629,7 @@ def main(): response_evaluator = evaluate_response_with_mcq_match elif args.dataset == "math500": response_evaluator = partial( - evaluate_response_with_gemini, eval_model=os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-flash-002") + evaluate_response_with_gemini, eval_model=os.getenv("GEMINI_EVAL_MODEL", "gemini-2.0-flash-001") ) elif args.dataset == "frames_ir": response_evaluator = evaluate_response_for_ir From 45fb85f1dfcbbde136f27963c017ac9e86598a84 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 15 
Feb 2025 03:11:15 +0530 Subject: [PATCH 06/10] Add E2B as an optional code sandbox provider - Specify E2B api key and template to use via env variables - Try load, use e2b library when E2B api key set - Fallback to try use terrarium sandbox otherwise - Enable more python packages in e2b sandbox like rdkit via custom e2b template - Use Async E2B Sandbox - Parallelize file IO with sandbox - Add documentation on how to enable E2B as code sandbox instead of Terrarium --- .github/workflows/run_evals.yml | 11 ++ docker-compose.yml | 4 +- documentation/docs/features/code_execution.md | 24 ++-- pyproject.toml | 3 +- src/khoj/processor/conversation/prompts.py | 10 +- src/khoj/processor/tools/run_code.py | 110 +++++++++++++++++- src/khoj/utils/helpers.py | 13 ++- 7 files changed, 157 insertions(+), 18 deletions(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 9544b7f3..71123acf 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -32,6 +32,14 @@ on: required: false default: 200 type: number + sandbox: + description: 'Code sandbox to use' + required: false + default: 'terrarium' + type: choice + options: + - terrarium + - e2b jobs: eval: @@ -100,6 +108,8 @@ jobs: SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY }} OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} + E2B_API_KEY: ${{ inputs.sandbox == 'e2b' && secrets.E2B_API_KEY }} + E2B_TEMPLATE: ${{ vars.E2B_TEMPLATE }} KHOJ_ADMIN_EMAIL: khoj KHOJ_ADMIN_PASSWORD: khoj POSTGRES_HOST: localhost @@ -148,6 +158,7 @@ jobs: echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY echo "- Chat Model: Gemini 2.0 Flash" >> $GITHUB_STEP_SUMMARY + echo "- Code Sandbox: ${{ inputs.sandbox}}" >> $GITHUB_STEP_SUMMARY echo "\`\`\`" >> $GITHUB_STEP_SUMMARY tail -n +2 
*_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY diff --git a/docker-compose.yml b/docker-compose.yml index ea0b603b..3ff21b11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -58,8 +58,10 @@ services: - KHOJ_DEBUG=False - KHOJ_ADMIN_EMAIL=username@example.com - KHOJ_ADMIN_PASSWORD=password - # Default URL of Terrarium, the Python sandbox used by Khoj to run code. Its container is specified above + # Default URL of Terrarium, the default Python sandbox used by Khoj to run code. Its container is specified above - KHOJ_TERRARIUM_URL=http://sandbox:8080 + # Uncomment line below to have Khoj run code in remote E2B code sandbox instead of the self-hosted Terrarium sandbox above. Get your E2B API key from https://e2b.dev/. + # - E2B_API_KEY=your_e2b_api_key # Default URL of SearxNG, the default web search engine used by Khoj. Its container is specified above - KHOJ_SEARXNG_URL=http://search:8080 # Uncomment line below to use with Ollama running on your local machine at localhost:11434. diff --git a/documentation/docs/features/code_execution.md b/documentation/docs/features/code_execution.md index 8403d466..05c994d7 100644 --- a/documentation/docs/features/code_execution.md +++ b/documentation/docs/features/code_execution.md @@ -3,22 +3,23 @@ # Code Execution -Khoj can generate and run very simple Python code snippets as well. This is useful if you want to generate a plot, run a simple calculation, or do some basic data manipulation. LLMs by default aren't skilled at complex quantitative tasks. Code generation & execution can come in handy for such tasks. +Khoj can generate and run simple Python code as well. This is useful if you want to have Khoj do some data analysis, generate plots and reports. LLMs by default aren't skilled at complex quantitative tasks. Code generation & execution can come in handy for such tasks. -Just use `/code` in your chat command. +Khoj automatically infers when to use the code tool. 
You can also tell it explicitly to use the code tool or use the `/code` [slash command](https://docs.khoj.dev/features/chat/#commands) in your chat. -### Setup (Self-Hosting) -Run [Cohere's Terrarium](https://github.com/cohere-ai/cohere-terrarium) on your machine to enable code generation and execution. +## Setup (Self-Hosting) +### Terrarium Sandbox +Use [Cohere's Terrarium](https://github.com/cohere-ai/cohere-terrarium) to host the code sandbox locally on your machine for free. -Check the [instructions](https://github.com/cohere-ai/cohere-terrarium?tab=readme-ov-file#development) for running from source. - -For running with Docker, you can use our [docker-compose.yml](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml), or start it manually like this: +To run with Docker, use our [docker-compose.yml](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml) to automatically setup the Terrarium code sandbox, or start it manually like this: ```bash docker pull ghcr.io/khoj-ai/terrarium:latest docker run -d -p 8080:8080 ghcr.io/khoj-ai/terrarium:latest ``` +To run from source, check [these instructions](https://github.com/khoj-ai/cohere-terrarium?tab=readme-ov-file#development). + #### Verify Verify that it's running, by evaluating a simple Python expression: @@ -28,3 +29,12 @@ curl -X POST -H "Content-Type: application/json" \ --data-raw '{"code": "1 + 1"}' \ --no-buffer ``` + +### E2B Sandbox +[E2B](https://e2b.dev/) allows Khoj to run code on a remote but versatile sandbox with support for more python libraries. This is [not free](https://e2b.dev/pricing). + +To have Khoj use E2B as the code sandbox: +1. Generate an API key on [their dashboard](https://e2b.dev/dashboard). +2. Set the `E2B_API_KEY` environment variable to it on the machine running your Khoj server. + - When using our [docker-compose.yml](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml), uncomment and set the `E2B_API_KEY` env var in the `docker-compose.yml` file. 
+3. Now restart your Khoj server to switch to using the E2B code sandbox. diff --git a/pyproject.toml b/pyproject.toml index 1ed9426e..5093fee7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ dependencies = [ "authlib == 1.2.1", "llama-cpp-python == 0.2.88", "itsdangerous == 2.1.2", - "httpx == 0.25.0", + "httpx == 0.27.2", "pgvector == 0.2.4", "psycopg2-binary == 2.9.9", "lxml == 4.9.3", @@ -92,6 +92,7 @@ dependencies = [ "pyjson5 == 1.6.7", "resend == 1.0.1", "email-validator == 2.2.0", + "e2b-code-interpreter ~= 1.0.0", ] dynamic = ["version"] diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 61060572..63cf028f 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -974,9 +974,8 @@ Khoj: python_code_generation_prompt = PromptTemplate.from_template( """ You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query. -- The python program will run in a pyodide python sandbox with no network access. +- The python program will run in a sandbox with no network access. - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query. -- The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4 and sympy packages. The requests, torch, catboost, tensorflow and tkinter packages are not available. - List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field. - The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path). - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead. 
@@ -1030,6 +1029,13 @@ Code Execution Results: """.strip() ) +e2b_sandbox_context = """ +- The sandbox has access to only the standard library, matplotlib, pandas, numpy, scipy, bs4, sympy, einops, biopython, shapely, plotly and rdkit packages. The requests, torch, catboost, tensorflow and tkinter packages are not available. +""".strip() + +terrarium_sandbox_context = """ +- The sandbox has access to the standard library, matplotlib, pandas, numpy, scipy, bs4 and sympy packages. The requests, torch, catboost, tensorflow, rdkit and tkinter packages are not available. +""".strip() # Automations # -- diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index 62ffde74..af1f0ffd 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -27,7 +27,12 @@ from khoj.processor.conversation.utils import ( load_complex_json, ) from khoj.routers.helpers import send_message_to_model_wrapper -from khoj.utils.helpers import is_none_or_empty, timer, truncate_code_context +from khoj.utils.helpers import ( + is_e2b_code_sandbox_enabled, + is_none_or_empty, + timer, + truncate_code_context, +) from khoj.utils.rawconfig import LocationData logger = logging.getLogger(__name__) @@ -131,6 +136,12 @@ async def generate_python_code( prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else "" ) + # add sandbox specific context like available packages + sandbox_context = ( + prompts.e2b_sandbox_context if is_e2b_code_sandbox_enabled() else prompts.terrarium_sandbox_context + ) + personality_context = f"{sandbox_context}\n{personality_context}" + code_generation_prompt = prompts.python_code_generation_prompt.format( current_date=utc_date, query=q, @@ -182,15 +193,104 @@ async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_ur Reference data i/o format based on Terrarium example client code at:
https://github.com/cohere-ai/cohere-terrarium/blob/main/example-clients/python/terrarium_client.py """ - headers = {"Content-Type": "application/json"} cleaned_code = clean_code_python(code) - data = {"code": cleaned_code, "files": input_data} + if is_e2b_code_sandbox_enabled(): + try: + return await execute_e2b(cleaned_code, input_data) + except ImportError: + pass + return await execute_terrarium(cleaned_code, input_data, sandbox_url) + +async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]: + """Execute code and handle file I/O in e2b sandbox""" + from e2b_code_interpreter import AsyncSandbox + + sandbox = await AsyncSandbox.create( + api_key=os.getenv("E2B_API_KEY"), + template=os.getenv("E2B_TEMPLATE", "pmt2o0ghpang8gbiys57"), + timeout=120, + request_timeout=30, + ) + + try: + # Upload input files in parallel + upload_tasks = [ + sandbox.files.write(path=file["filename"], data=base64.b64decode(file["b64_data"]), request_timeout=30) + for file in input_files + ] + await asyncio.gather(*upload_tasks) + + # Note stored files before execution + E2bFile = NamedTuple("E2bFile", [("name", str), ("path", str)]) + original_files = {E2bFile(f.name, f.path) for f in await sandbox.files.list("~")} + + # Execute code from main.py file + execution = await sandbox.run_code(code=code, timeout=60) + + # Collect output files + output_files = [] + + # Identify new files created during execution + new_files = set(E2bFile(f.name, f.path) for f in await sandbox.files.list("~")) - original_files + # Read newly created files in parallel + download_tasks = [sandbox.files.read(f.path, request_timeout=30) for f in new_files] + downloaded_files = await asyncio.gather(*download_tasks) + for f, content in zip(new_files, downloaded_files): + if isinstance(content, bytes): + # Binary files like PNG - encode as base64 + b64_data = base64.b64encode(content).decode("utf-8") + elif Path(f.name).suffix in [".png", ".jpeg", ".jpg", ".svg"]: + # Ignore image files as they are 
extracted from execution results below for inline display + continue + else: + # Text files - encode utf-8 string as base64 + b64_data = base64.b64encode(content.encode("utf-8")).decode("utf-8") + output_files.append({"filename": f.name, "b64_data": b64_data}) + + # Collect output files from execution results + for idx, result in enumerate(execution.results): + for result_type in ["png", "jpeg", "svg", "text", "markdown", "json"]: + if b64_data := getattr(result, result_type, None): + output_files.append({"filename": f"{idx}.{result_type}", "b64_data": b64_data}) + break + + # collect logs + success = not execution.error and not execution.logs.stderr + stdout = "\n".join(execution.logs.stdout) + errors = "\n".join(execution.logs.stderr) + if execution.error: + errors = f"{execution.error}\n{errors}" + + return { + "code": code, + "success": success, + "std_out": stdout, + "std_err": errors, + "output_files": output_files, + } + except Exception as e: + return { + "code": code, + "success": False, + "std_err": f"Sandbox failed to execute code: {str(e)}", + "output_files": [], + } + + +async def execute_terrarium( + code: str, + input_data: list[dict], + sandbox_url: str, +) -> dict[str, Any]: + """Execute code using Terrarium sandbox""" + headers = {"Content-Type": "application/json"} + data = {"code": code, "files": input_data} async with aiohttp.ClientSession() as session: async with session.post(sandbox_url, json=data, headers=headers, timeout=30) as response: if response.status == 200: result: dict[str, Any] = await response.json() - result["code"] = cleaned_code + result["code"] = code # Store decoded output files result["output_files"] = result.get("output_files", []) for output_file in result["output_files"]: @@ -202,7 +302,7 @@ async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_ur return result else: return { - "code": cleaned_code, + "code": code, "success": False, "std_err": f"Failed to execute code with {response.status}", 
"output_files": [], diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index b48436c6..4723403e 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -321,6 +321,12 @@ def get_device() -> torch.device: return torch.device("cpu") +def is_e2b_code_sandbox_enabled(): + """Check if E2B code sandbox is enabled. + Set E2B_API_KEY environment variable to use it.""" + return not is_none_or_empty(os.getenv("E2B_API_KEY")) + + class ConversationCommand(str, Enum): Default = "default" General = "general" @@ -362,20 +368,23 @@ command_descriptions_for_agent = { ConversationCommand.Code: "Agent can run Python code to parse information, run complex calculations, create documents and charts.", } +e2b_tool_description = "To run Python code in a E2B sandbox with no network access. Helpful to parse complex information, run calculations, create text documents and create charts with quantitative data. Only matplotlib, pandas, numpy, scipy, bs4, sympy, einops, biopython, shapely and rdkit external packages are available." +terrarium_tool_description = "To run Python code in a Terrarium, Pyodide sandbox with no network access. Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data. Only matplotlib, panda, numpy, scipy, bs4 and sympy external packages are available." + tool_descriptions_for_llm = { ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.", ConversationCommand.General: "To use when you can answer the question without any outside information or personal knowledge", ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.", ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. 
Note: **Questions about Khoj should always use this data source**", ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.", - ConversationCommand.Code: "To run Python code in a Pyodide sandbox with no network access. Helpful when need to parse complex information, run complex calculations, create plaintext documents, and create charts with quantitative data. Only matplotlib, panda, numpy, scipy, bs4 and sympy external packages are available.", + ConversationCommand.Code: e2b_tool_description if is_e2b_code_sandbox_enabled() else terrarium_tool_description, } function_calling_description_for_llm = { ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.", ConversationCommand.Online: "To search the internet for information. Useful to get a quick, broad overview from the internet. Provide all relevant context to ensure new searches, not in previous iterations, are performed.", ConversationCommand.Webpage: "To extract information from webpages. Useful for more detailed research from the internet. Usually used when you know the webpage links to refer to. Share the webpage links and information to extract in your query.", - ConversationCommand.Code: "To run Python code in a Pyodide sandbox with no network access. Helpful when need to parse complex information, run complex calculations, create plaintext documents, and create charts with quantitative data. 
Only matplotlib, panda, numpy, scipy, bs4 and sympy external packages are available.", + ConversationCommand.Code: e2b_tool_description if is_e2b_code_sandbox_enabled() else terrarium_tool_description, } mode_descriptions_for_llm = { From 8305fddb1492b1f1d03dc30034dd67edaa67a311 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 15 Feb 2025 13:55:08 +0530 Subject: [PATCH 07/10] Default to non-zero temperature for all queries to Gemini models. It may mitigate the intermittent invalid json output issues. Model maybe going into repetition loops, non-zero temp may avoid that. --- src/khoj/processor/conversation/google/gemini_chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/conversation/google/gemini_chat.py b/src/khoj/processor/conversation/google/gemini_chat.py index cb25258c..f4e52914 100644 --- a/src/khoj/processor/conversation/google/gemini_chat.py +++ b/src/khoj/processor/conversation/google/gemini_chat.py @@ -34,7 +34,7 @@ def extract_questions_gemini( model: Optional[str] = "gemini-2.0-flash", conversation_log={}, api_key=None, - temperature=0, + temperature=0.2, max_tokens=None, location_data: LocationData = None, user: KhojUser = None, @@ -121,7 +121,7 @@ def gemini_send_message_to_model( api_key, model, response_type="text", - temperature=0, + temperature=0.2, model_kwargs=None, tracer={}, ): From 7b2d0fdddcbd5e9e2c9f8f0377899ec4f56a5c4d Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 15 Feb 2025 18:29:35 +0530 Subject: [PATCH 08/10] Improve code gen chat actor to output code in inline md code blocks Simplify code gen chat actor to improve correct code gen success, especially for smaller models & models with limited json mode support Allow specify code blocks inline with reasoning to try improve code quality Infer input files based on user file paths referenced in code. 
--- src/khoj/processor/conversation/prompts.py | 111 +++++++++++++++++---- src/khoj/processor/tools/run_code.py | 41 +++++--- 2 files changed, 119 insertions(+), 33 deletions(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 63cf028f..890b791c 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -976,8 +976,7 @@ python_code_generation_prompt = PromptTemplate.from_template( You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query. - The python program will run in a sandbox with no network access. - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query. -- List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field. -- The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path). +- The python program should be self-contained. It can only read data generated by the program itself and any user file paths referenced in your program. - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead. - Write any document, charts etc. to be shared with the user to file. These files can be seen by the user. - Use as much context from the previous questions and answers as required to generate your code. 
@@ -988,24 +987,99 @@ Current Date: {current_date} User's Location: {location} {username} -The response JSON schema is of the form {{"code": "", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}} -Examples: +Your response should contain python code wrapped in markdown code blocks (i.e starting with```python and ending with ```) +Example 1: --- -{{ -"code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")" -}} +Q: Calculate the interest earned and final amount for a principal of $43,235 invested at a rate of 5.24 percent for 5 years. +A: Ok, to calculate the interest earned and final amount, we can use the formula for compound interest: $T = P(1 + r/n)^{{nt}}$, +where T: total amount, P: principal, r: interest rate, n: number of times interest is compounded per year, and t: time in years. -{{ -"code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')", -"input_files": ["/home/linux/tasks.org"] -}} +Let's write the Python program to calculate this. 
-{{ -"code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')", -"input_links": ["https://population.un.org/world_population_by_year.csv"] -}} +```python +# Input values +principal = 43235 +rate = 5.24 +years = 5 + +# Convert rate to decimal +rate_decimal = rate / 100 + +# Calculate final amount +final_amount = principal * (1 + rate_decimal) ** years + +# Calculate interest earned +interest_earned = final_amount - principal + +# Print results with formatting +print(f"Interest Earned: ${{interest_earned:,.2f}}") +print(f"Final Amount: ${{final_amount:,.2f}}") +``` + +Example 2: +--- +Q: Simplify first, then evaluate: $-7x+2(x^{{2}}-1)-(2x^{{2}}-x+3)$, where $x=1$. +A: Certainly! Let's break down the problem step-by-step and utilize Python with SymPy to simplify and evaluate the expression. + +1. **Expression Simplification:** + We start with the expression \\(-7x + 2(x^2 - 1) - (2x^2 - x + 3)\\). + +2. **Substitute \\(x=1\\) into the simplified expression:** + Once simplified, we will substitute \\(x=1\\) into the expression to find its value. 
+
+Let's implement this in Python using SymPy (as the package is available in the sandbox):
+
+```python
+import sympy as sp
+
+# Define the variable
+x = sp.symbols('x')
+
+# Define the expression
+expression = -7*x + 2*(x**2 - 1) - (2*x**2 - x + 3)
+
+# Simplify the expression
+simplified_expression = sp.simplify(expression)
+
+# Substitute x = 1 into the simplified expression
+evaluated_expression = simplified_expression.subs(x, 1)
+
+# Print the simplified expression and its evaluated value
+print(\"Simplified Expression:\", simplified_expression)
+print(\"Evaluated Expression at x=1:\", evaluated_expression)
+```
+
+Example 3:
+---
+Q: Plot the world ppulation growth over the years, given this year, world population world tuples: [(2000, 6), (2001, 7), (2002, 8), (2003, 9), (2004, 10)].
+A: Absolutely! We can utilize the Pandas and Matplotlib libraries (as both are available in the sandbox) to create the world population growth plot.
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Create a DataFrame of world population from the provided data
+data = {{
+    'Year': [2000, 2001, 2002, 2003, 2004],
+    'Population': [6, 7, 8, 9, 10]
+}}
+df = pd.DataFrame(data)
+
+# Plot the data
+plt.figure(figsize=(10, 6))
+plt.plot(df['Year'], df['Population'], marker='o')
+
+# Add titles and labels
+plt.title('Population by Year')
+plt.xlabel('Year')
+plt.ylabel('Population')
+
+# Save the plot to a file
+plt.savefig('population_by_year_plot.png')
+```
+
+Now it's your turn to construct a python program to answer the user's query using the provided context and conversation provided below.
+Ensure you include the python code to execute and wrap it in a markdown code block.
-Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else.
Context: --- {context} @@ -1014,8 +1088,9 @@ Chat History: --- {chat_history} -User: {query} -Khoj: +User Query: +--- +{query} """.strip() ) diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index af1f0ffd..5c6cb48d 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -4,10 +4,12 @@ import datetime import logging import mimetypes import os +import re from pathlib import Path from typing import Any, Callable, List, NamedTuple, Optional import aiohttp +from asgiref.sync import sync_to_async from httpx import RemoteProtocolError from tenacity import ( before_sleep_log, @@ -24,7 +26,6 @@ from khoj.processor.conversation.utils import ( ChatEvent, clean_code_python, construct_chat_history, - load_complex_json, ) from khoj.routers.helpers import send_message_to_model_wrapper from khoj.utils.helpers import ( @@ -43,8 +44,7 @@ SANDBOX_URL = os.getenv("KHOJ_TERRARIUM_URL", "http://localhost:8080") class GeneratedCode(NamedTuple): code: str - input_files: List[str] - input_links: List[str] + input_files: List[FileObject] async def run_code( @@ -82,13 +82,10 @@ async def run_code( # Prepare Input Data input_data = [] - user_input_files: List[FileObject] = [] - for input_file in generated_code.input_files: - user_input_files += await FileObjectAdapters.aget_file_objects_by_name(user, input_file) - for f in user_input_files: + for f in generated_code.input_files: input_data.append( { - "filename": os.path.basename(f.file_name), + "filename": f.file_name, "b64_data": base64.b64encode(f.raw_text.encode("utf-8")).decode("utf-8"), } ) @@ -155,21 +152,35 @@ async def generate_python_code( response = await send_message_to_model_wrapper( code_generation_prompt, query_images=query_images, - response_type="json_object", user=user, tracer=tracer, query_files=query_files, ) - # Validate that the response is a non-empty, JSON-serializable list - response = load_complex_json(response) - code = 
response.get("code", "").strip() - input_files = response.get("input_files", []) - input_links = response.get("input_links", []) + # Extract python code wrapped in markdown code blocks from the response + code_blocks = re.findall(r"```(?:python)?\n(.*?)\n```", response, re.DOTALL) + + if not code_blocks: + raise ValueError("No Python code blocks found in response") + + # Join multiple code blocks with newlines and strip any leading/trailing whitespace + code = "\n".join(code_blocks).strip() if not isinstance(code, str) or is_none_or_empty(code): raise ValueError - return GeneratedCode(code, input_files, input_links) + + # Infer user files required in sandbox based on user file paths mentioned in code + input_files: List[FileObject] = [] + user_files = await sync_to_async(set)(FileObjectAdapters.get_all_file_objects(user)) + for user_file in user_files: + if user_file.file_name in code: + # Replace references to full file path used in code with just the file basename to ease reference in sandbox + file_basename = os.path.basename(user_file.file_name) + code = code.replace(user_file.file_name, file_basename) + user_file.file_name = file_basename + input_files.append(user_file) + + return GeneratedCode(code, input_files) @retry( From 94ca458639910b29c7aaab5abf16e499be597b26 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 16 Feb 2025 02:58:27 +0530 Subject: [PATCH 09/10] Set default chat model to KHOJ_CHAT_MODEL env var if set Simplify code log to set default_use_model during init for readability --- .github/workflows/run_evals.yml | 10 +++++++-- src/khoj/database/adapters/__init__.py | 15 +++++++++++++ src/khoj/utils/initialization.py | 31 ++++++++++++++++++++------ tests/evals/eval.py | 2 +- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 71123acf..21870c04 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -40,6 +40,11 @@ on: options: - 
terrarium - e2b + chat_model: + description: 'Chat model to use' + required: false + default: 'gemini-2.0-flash' + type: string jobs: eval: @@ -48,7 +53,7 @@ jobs: matrix: # Use input from manual trigger if available, else run all combinations khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }} - dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }} + dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa", "gpqa"]') }} services: postgres: @@ -103,6 +108,7 @@ jobs: BATCH_SIZE: "20" RANDOMIZE: "True" KHOJ_URL: "http://localhost:42110" + KHOJ_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.0-flash' }} KHOJ_LLM_SEED: "42" GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY }} @@ -157,7 +163,7 @@ jobs: echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY - echo "- Chat Model: Gemini 2.0 Flash" >> $GITHUB_STEP_SUMMARY + echo "- Chat Model: ${{ inputs.chat_model }}" >> $GITHUB_STEP_SUMMARY echo "- Code Sandbox: ${{ inputs.sandbox}}" >> $GITHUB_STEP_SUMMARY echo "\`\`\`" >> $GITHUB_STEP_SUMMARY tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 33e879aa..058017d2 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1107,6 +1107,12 @@ class ConversationAdapters: return config.setting return 
ConversationAdapters.aget_advanced_chat_model(user) + @staticmethod + def get_chat_model_by_name(chat_model_name: str, ai_model_api_name: str = None): + if ai_model_api_name: + return ChatModel.objects.filter(name=chat_model_name, ai_model_api__name=ai_model_api_name).first() + return ChatModel.objects.filter(name=chat_model_name).first() + @staticmethod async def aget_voice_model_config(user: KhojUser) -> Optional[VoiceModelOption]: voice_model_config = await UserVoiceModelConfig.objects.filter(user=user).prefetch_related("setting").afirst() @@ -1205,6 +1211,15 @@ class ConversationAdapters: return server_chat_settings.chat_advanced return await ConversationAdapters.aget_default_chat_model(user) + @staticmethod + def set_default_chat_model(chat_model: ChatModel): + server_chat_settings = ServerChatSettings.objects.first() + if server_chat_settings: + server_chat_settings.chat_default = chat_model + server_chat_settings.save() + else: + ServerChatSettings.objects.create(chat_default=chat_model) + @staticmethod async def aget_server_webscraper(): server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst() diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py index 5f4254b5..3ea73891 100644 --- a/src/khoj/utils/initialization.py +++ b/src/khoj/utils/initialization.py @@ -185,16 +185,18 @@ def initialization(interactive: bool = True): ) provider_name = provider_name or model_type.name.capitalize() - default_use_model = {True: "y", False: "n"}[default_api_key is not None] - - # If not in interactive mode & in the offline setting, it's most likely that we're running in a containerized environment. This usually means there's not enough RAM to load offline models directly within the application. In such cases, we default to not using the model -- it's recommended to use another service like Ollama to host the model locally in that case. 
- default_use_model = {True: "n", False: default_use_model}[is_offline] + default_use_model = default_api_key is not None + # If not in interactive mode & in the offline setting, it's most likely that we're running in a containerized environment. + # This usually means there's not enough RAM to load offline models directly within the application. + # In such cases, we default to not using the model -- it's recommended to use another service like Ollama to host the model locally in that case. + if is_offline: + default_use_model = False use_model_provider = ( - default_use_model if not interactive else input(f"Add {provider_name} chat models? (y/n): ") + default_use_model if not interactive else input(f"Add {provider_name} chat models? (y/n): ") == "y" ) - if use_model_provider != "y": + if not use_model_provider: return False, None logger.info(f"️💬 Setting up your {provider_name} chat configuration") @@ -303,4 +305,19 @@ def initialization(interactive: bool = True): logger.error(f"🚨 Failed to create chat configuration: {e}", exc_info=True) else: _update_chat_model_options() - logger.info("🗣️ Chat model configuration updated") + logger.info("🗣️ Chat model options updated") + + # Update the default chat model if it doesn't match + chat_config = ConversationAdapters.get_default_chat_model() + env_default_chat_model = os.getenv("KHOJ_CHAT_MODEL") + if not chat_config or not env_default_chat_model: + return + if chat_config.name != env_default_chat_model: + chat_model = ConversationAdapters.get_chat_model_by_name(env_default_chat_model) + if not chat_model: + logger.error( + f"🚨 Not setting default chat model. Chat model {env_default_chat_model} not found in existing chat model options." 
+ ) + return + ConversationAdapters.set_default_chat_model(chat_model) + logger.info(f"🗣️ Default chat model set to {chat_model.name}") diff --git a/tests/evals/eval.py b/tests/evals/eval.py index 0c95996f..e9d56f03 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -666,7 +666,7 @@ def main(): colored_accuracy_str = f"Overall Accuracy: {colored_accuracy} on {args.dataset.title()} dataset." accuracy_str = f"Overall Accuracy: {accuracy:.2%} on {args.dataset}." accuracy_by_reasoning = f"Accuracy by Reasoning Type:\n{reasoning_type_accuracy}" - cost = f"Total Cost: ${running_cost.get():.5f}." + cost = f"Total Cost: ${running_cost.get():.5f} to evaluate {running_total_count.get()} results." sample_type = f"Sampling Type: {SAMPLE_SIZE} samples." if SAMPLE_SIZE else "Whole dataset." sample_type += " Randomized." if RANDOMIZE else "" logger.info(f"\n{colored_accuracy_str}\n\n{accuracy_by_reasoning}\n\n{cost}\n\n{sample_type}\n") From c133d11556683cc67227e3330d29304e86518c52 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 28 Feb 2025 14:37:54 +0530 Subject: [PATCH 10/10] Improvements based on code feedback --- .github/workflows/run_evals.yml | 2 +- src/khoj/processor/conversation/prompts.py | 2 +- src/khoj/processor/tools/run_code.py | 7 ++++--- src/khoj/utils/initialization.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 21870c04..2c8e9688 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -108,7 +108,7 @@ jobs: BATCH_SIZE: "20" RANDOMIZE: "True" KHOJ_URL: "http://localhost:42110" - KHOJ_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.0-flash' }} + KHOJ_DEFAULT_CHAT_MODEL: ${{ github.event_name == 'workflow_dispatch' && inputs.chat_model || 'gemini-2.0-flash' }} KHOJ_LLM_SEED: "42" GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && 
secrets.SERPER_DEV_API_KEY }} diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 890b791c..0c2b3bbe 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -1051,7 +1051,7 @@ print(\"Evaluated Expression at x=1:\", evaluated_expression) Example 3: --- -Q: Plot the world ppulation growth over the years, given this year, world population world tuples: [(2000, 6), (2001, 7), (2002, 8), (2003, 9), (2004, 10)]. +Q: Plot the world population growth over the years, given this year, world population world tuples: [(2000, 6), (2001, 7), (2002, 8), (2003, 9), (2004, 10)]. A: Absolutely! We can utilize the Pandas and Matplotlib libraries (as both are available in the sandbox) to create the world population growth plot. ```python import pandas as pd diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index 5c6cb48d..12e65670 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -40,6 +40,7 @@ logger = logging.getLogger(__name__) SANDBOX_URL = os.getenv("KHOJ_TERRARIUM_URL", "http://localhost:8080") +DEFAULT_E2B_TEMPLATE = "pmt2o0ghpang8gbiys57" class GeneratedCode(NamedTuple): @@ -219,7 +220,7 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]: sandbox = await AsyncSandbox.create( api_key=os.getenv("E2B_API_KEY"), - template=os.getenv("E2B_TEMPLATE", "pmt2o0ghpang8gbiys57"), + template=os.getenv("E2B_TEMPLATE", DEFAULT_E2B_TEMPLATE), timeout=120, request_timeout=30, ) @@ -232,7 +233,7 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]: ] await asyncio.gather(*upload_tasks) - # Note stored files before execution + # Note stored files before execution to identify new files created during execution E2bFile = NamedTuple("E2bFile", [("name", str), ("path", str)]) original_files = {E2bFile(f.name, f.path) for f in await sandbox.files.list("~")} 
@@ -261,7 +262,7 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]: # Collect output files from execution results for idx, result in enumerate(execution.results): - for result_type in ["png", "jpeg", "svg", "text", "markdown", "json"]: + for result_type in {"png", "jpeg", "svg", "text", "markdown", "json"}: if b64_data := getattr(result, result_type, None): output_files.append({"filename": f"{idx}.{result_type}", "b64_data": b64_data}) break diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py index 3ea73891..b5c661c4 100644 --- a/src/khoj/utils/initialization.py +++ b/src/khoj/utils/initialization.py @@ -309,7 +309,7 @@ def initialization(interactive: bool = True): # Update the default chat model if it doesn't match chat_config = ConversationAdapters.get_default_chat_model() - env_default_chat_model = os.getenv("KHOJ_CHAT_MODEL") + env_default_chat_model = os.getenv("KHOJ_DEFAULT_CHAT_MODEL") if not chat_config or not env_default_chat_model: return if chat_config.name != env_default_chat_model: