From 7b2d0fdddcbd5e9e2c9f8f0377899ec4f56a5c4d Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Sat, 15 Feb 2025 18:29:35 +0530
Subject: [PATCH] Improve code gen chat actor to output code in inline md code
 blocks

Simplify code gen chat actor to improve correct code gen success,
especially for smaller models & models with limited json mode support

Allow specifying code blocks inline with reasoning to try to improve
code quality

Infer input files based on user file paths referenced in code.
---
 src/khoj/processor/conversation/prompts.py | 111 +++++++++++++++++----
 src/khoj/processor/tools/run_code.py       |  41 +++++---
 2 files changed, 119 insertions(+), 33 deletions(-)

diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index 63cf028f..890b791c 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -976,8 +976,7 @@ python_code_generation_prompt = PromptTemplate.from_template(
 You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query.
 - The python program will run in a sandbox with no network access.
 - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query.
-- List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field.
-- The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path).
+- The python program should be self-contained. It can only read data generated by the program itself and any user file paths referenced in your program.
 - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead.
 - Write any document, charts etc. to be shared with the user to file. These files can be seen by the user.
 - Use as much context from the previous questions and answers as required to generate your code.
@@ -988,24 +987,99 @@ Current Date: {current_date}
 User's Location: {location}
 {username}
 
-The response JSON schema is of the form {{"code": "<python_code>", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}}
-Examples:
+Your response should contain python code wrapped in markdown code blocks (i.e. starting with ```python and ending with ```)
+Example 1:
 ---
-{{
-"code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")"
-}}
+Q: Calculate the interest earned and final amount for a principal of $43,235 invested at a rate of 5.24 percent for 5 years.
+A: Ok, to calculate the interest earned and final amount, we can use the formula for compound interest: $T = P(1 + r/n)^{{nt}}$,
+where T: total amount, P: principal, r: interest rate, n: number of times interest is compounded per year, and t: time in years.
 
-{{
-"code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n    content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')",
-"input_files": ["/home/linux/tasks.org"]
-}}
+Let's write the Python program to calculate this.
 
-{{
-"code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')",
-"input_links": ["https://population.un.org/world_population_by_year.csv"]
-}}
+```python
+# Input values
+principal = 43235
+rate = 5.24
+years = 5
+
+# Convert rate to decimal
+rate_decimal = rate / 100
+
+# Calculate final amount
+final_amount = principal * (1 + rate_decimal) ** years
+
+# Calculate interest earned
+interest_earned = final_amount - principal
+
+# Print results with formatting
+print(f"Interest Earned: ${{interest_earned:,.2f}}")
+print(f"Final Amount: ${{final_amount:,.2f}}")
+```
+
+Example 2:
+---
+Q: Simplify first, then evaluate: $-7x+2(x^{{2}}-1)-(2x^{{2}}-x+3)$, where $x=1$.
+A: Certainly! Let's break down the problem step-by-step and utilize Python with SymPy to simplify and evaluate the expression.
+
+1. **Expression Simplification:**
+ We start with the expression \\(-7x + 2(x^2 - 1) - (2x^2 - x + 3)\\).
+
+2. **Substitute \\(x=1\\) into the simplified expression:**
+ Once simplified, we will substitute \\(x=1\\) into the expression to find its value.
+
+Let's implement this in Python using SymPy (as the package is available in the sandbox):
+
+```python
+import sympy as sp
+
+# Define the variable
+x = sp.symbols('x')
+
+# Define the expression
+expression = -7*x + 2*(x**2 - 1) - (2*x**2 - x + 3)
+
+# Simplify the expression
+simplified_expression = sp.simplify(expression)
+
+# Substitute x = 1 into the simplified expression
+evaluated_expression = simplified_expression.subs(x, 1)
+
+# Print the simplified expression and its evaluated value
+print(\"Simplified Expression:\", simplified_expression)
+print(\"Evaluated Expression at x=1:\", evaluated_expression)
+```
+
+Example 3:
+---
+Q: Plot the world population growth over the years, given these (year, world population) tuples: [(2000, 6), (2001, 7), (2002, 8), (2003, 9), (2004, 10)].
+A: Absolutely! We can utilize the Pandas and Matplotlib libraries (as both are available in the sandbox) to create the world population growth plot.
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Create a DataFrame of world population from the provided data
+data = {{
+    'Year': [2000, 2001, 2002, 2003, 2004],
+    'Population': [6, 7, 8, 9, 10]
+}}
+df = pd.DataFrame(data)
+
+# Plot the data
+plt.figure(figsize=(10, 6))
+plt.plot(df['Year'], df['Population'], marker='o')
+
+# Add titles and labels
+plt.title('Population by Year')
+plt.xlabel('Year')
+plt.ylabel('Population')
+
+# Save the plot to a file
+plt.savefig('population_by_year_plot.png')
+```
+
+Now it's your turn to construct a python program to answer the user's query using the context and conversation provided below.
+Ensure you include the python code to execute and wrap it in a markdown code block.
 
-Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else.
 Context:
 ---
 {context}
@@ -1014,8 +1088,9 @@ Chat History:
 ---
 {chat_history}
 
-User: {query}
-Khoj:
+User Query:
+---
+{query}
 """.strip()
 )
 
diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py
index af1f0ffd..5c6cb48d 100644
--- a/src/khoj/processor/tools/run_code.py
+++ b/src/khoj/processor/tools/run_code.py
@@ -4,10 +4,12 @@ import datetime
 import logging
 import mimetypes
 import os
+import re
 from pathlib import Path
 from typing import Any, Callable, List, NamedTuple, Optional
 
 import aiohttp
+from asgiref.sync import sync_to_async
 from httpx import RemoteProtocolError
 from tenacity import (
     before_sleep_log,
@@ -24,7 +26,6 @@ from khoj.processor.conversation.utils import (
     ChatEvent,
     clean_code_python,
     construct_chat_history,
-    load_complex_json,
 )
 from khoj.routers.helpers import send_message_to_model_wrapper
 from khoj.utils.helpers import (
@@ -43,8 +44,7 @@ SANDBOX_URL = os.getenv("KHOJ_TERRARIUM_URL", "http://localhost:8080")
 
 class GeneratedCode(NamedTuple):
     code: str
-    input_files: List[str]
-    input_links: List[str]
+    input_files: List[FileObject]
 
 
 async def run_code(
@@ -82,13 +82,10 @@ async def run_code(
 
     # Prepare Input Data
     input_data = []
-    user_input_files: List[FileObject] = []
-    for input_file in generated_code.input_files:
-        user_input_files += await FileObjectAdapters.aget_file_objects_by_name(user, input_file)
-    for f in user_input_files:
+    for f in generated_code.input_files:
         input_data.append(
             {
-                "filename": os.path.basename(f.file_name),
+                "filename": f.file_name,
                 "b64_data": base64.b64encode(f.raw_text.encode("utf-8")).decode("utf-8"),
             }
         )
@@ -155,21 +152,35 @@ async def generate_python_code(
     response = await send_message_to_model_wrapper(
         code_generation_prompt,
         query_images=query_images,
-        response_type="json_object",
         user=user,
         tracer=tracer,
         query_files=query_files,
     )
 
-    # Validate that the response is a non-empty, JSON-serializable list
-    response = load_complex_json(response)
-    code = response.get("code", "").strip()
-    input_files = response.get("input_files", [])
-    input_links = response.get("input_links", [])
+    # Extract python code wrapped in markdown code blocks from the response
+    code_blocks = re.findall(r"```(?:python)?\n(.*?)\n```", response, re.DOTALL)
+
+    if not code_blocks:
+        raise ValueError("No Python code blocks found in response")
+
+    # Join multiple code blocks with newlines and strip any leading/trailing whitespace
+    code = "\n".join(code_blocks).strip()
 
     if not isinstance(code, str) or is_none_or_empty(code):
         raise ValueError
-    return GeneratedCode(code, input_files, input_links)
+
+    # Infer user files required in sandbox based on user file paths mentioned in code
+    input_files: List[FileObject] = []
+    user_files = await sync_to_async(set)(FileObjectAdapters.get_all_file_objects(user))
+    for user_file in user_files:
+        if user_file.file_name in code:
+            # Replace references to full file path used in code with just the file basename to ease reference in sandbox
+            file_basename = os.path.basename(user_file.file_name)
+            code = code.replace(user_file.file_name, file_basename)
+            user_file.file_name = file_basename
+            input_files.append(user_file)
+
+    return GeneratedCode(code, input_files)
 
 
 @retry(