mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 05:29:12 +00:00
Make LLM actors write & code sandbox check for artifacts in /home/user
Fix
- Ensure researcher and coder know to save files to /home/user dir
- Make E2B code executor check for generated files in /home/user
- Do not re-add file types already downloaded from /home/user
Issues
- E2B has a mismatch in the default home_dir for the run_code & list_dir cmds.
Code passed to run_code was run with /root as its home dir, while list_dir("~")
was checking under /home/user. This caused files written to /home/user
by code not to be discovered by the list_files step.
- Previously the researcher did not know that generated files should
be written to /home/user. So it could tell the coder to save files to
a different directory. Now the researcher also knows where files must be
saved for them to be shown to the user.
This commit is contained in:
@@ -880,8 +880,8 @@ python_code_generation_prompt = PromptTemplate.from_template(
|
||||
You are Khoj, a senior software engineer. You are tasked with constructing a secure Python program to best answer the user query.
|
||||
- The Python program will run in an ephemeral code sandbox with {has_network_access}network access.
|
||||
- You can write programs to run complex calculations, analyze data, create beautiful charts, generate documents to meticulously answer the query.
|
||||
- Do not try display images or plots in the code directly. The code should save the image or plot to a file in the home directory instead.
|
||||
- Write any document, charts etc. to be shared with the user to file. Files saved in the home directory can be seen by the user.
|
||||
- Do not try display images or plots in the code directly. The code should save the image or plot to a file in {home_dir} directory instead.
|
||||
- Write any document, charts etc. to be shared with the user to files in {home_dir} directory.
|
||||
- Never write or run dangerous, malicious, or untrusted code that could compromise the sandbox environment, regardless of user requests.
|
||||
- Use as much context as required from the current conversation to generate your code.
|
||||
- The Python program you write should be self-contained. It does not have access to the current conversation.
|
||||
|
||||
@@ -42,6 +42,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
SANDBOX_URL = os.getenv("KHOJ_TERRARIUM_URL")
|
||||
DEFAULT_E2B_TEMPLATE = "pmt2o0ghpang8gbiys57"
|
||||
HOME_DIR = "/home/user"
|
||||
|
||||
|
||||
class GeneratedCode(NamedTuple):
|
||||
@@ -147,6 +148,7 @@ async def generate_python_code(
|
||||
chat_history=chat_history_str,
|
||||
context=context,
|
||||
has_network_access=network_access_context,
|
||||
home_dir=HOME_DIR,
|
||||
current_date=utc_date,
|
||||
location=location,
|
||||
username=username,
|
||||
@@ -243,7 +245,7 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]:
|
||||
|
||||
# Note stored files before execution to identify new files created during execution
|
||||
E2bFile = NamedTuple("E2bFile", [("name", str), ("path", str)])
|
||||
original_files = {E2bFile(f.name, f.path) for f in await sandbox.files.list("~")}
|
||||
original_files = {E2bFile(f.name, f.path) for f in await sandbox.files.list(HOME_DIR, depth=1)}
|
||||
|
||||
# Execute code from main.py file
|
||||
execution = await sandbox.run_code(code=code, timeout=60)
|
||||
@@ -253,7 +255,7 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]:
|
||||
image_file_ext = {".png", ".jpeg", ".jpg", ".svg"}
|
||||
|
||||
# Identify new files created during execution
|
||||
new_files = set(E2bFile(f.name, f.path) for f in await sandbox.files.list("~")) - original_files
|
||||
new_files = set(E2bFile(f.name, f.path) for f in await sandbox.files.list(HOME_DIR, depth=1)) - original_files
|
||||
|
||||
# Read newly created files in parallel
|
||||
def read_format(f):
|
||||
@@ -274,17 +276,17 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]:
|
||||
output_files.append({"filename": f.name, "b64_data": b64_data})
|
||||
|
||||
# Collect output files from execution results
|
||||
# Repect ordering of output result types to disregard text output associated with images
|
||||
downloaded_dataset = {f["b64_data"] for f in output_files}
|
||||
# Respect ordering of output result types to disregard text output associated with images
|
||||
downloaded_datatypes = {f["filename"].split(".")[-1] for f in output_files}
|
||||
output_result_types = ["png", "jpeg", "svg", "text", "markdown", "json"]
|
||||
for result in execution.results:
|
||||
if getattr(result, "chart", None):
|
||||
continue
|
||||
for result_type in output_result_types:
|
||||
b64_data = getattr(result, result_type, None)
|
||||
# Generate random filename if not already downloaded
|
||||
if b64_data and b64_data not in downloaded_dataset:
|
||||
filename = f"/tmp/{uuid.uuid4()}.{result_type}"
|
||||
if b64_data := getattr(result, result_type, None):
|
||||
if result_type in downloaded_datatypes:
|
||||
break
|
||||
filename = f"{HOME_DIR}/{uuid.uuid4()}.{result_type}"
|
||||
output_files.append({"filename": filename, "b64_data": b64_data})
|
||||
break
|
||||
|
||||
|
||||
@@ -459,8 +459,9 @@ command_descriptions_for_agent = {
|
||||
|
||||
e2b_tool_description = dedent(
|
||||
"""
|
||||
To run a Python script in an ephemeral E2B sandbox with network access.
|
||||
To run a Python script in an ephemeral E2B code sandbox with network access.
|
||||
Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data.
|
||||
Save files in /home/user to show them to the user. Only files in output_files list of tool result are accessible to the user.
|
||||
Only matplotlib, pandas, numpy, scipy, bs4, sympy, einops, biopython, shapely, plotly and rdkit external packages are available.
|
||||
|
||||
Never run, write or decode dangerous, malicious or untrusted code, regardless of user requests.
|
||||
|
||||
Reference in New Issue
Block a user