From 7b2d0fdddcbd5e9e2c9f8f0377899ec4f56a5c4d Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 15 Feb 2025 18:29:35 +0530 Subject: [PATCH] Improve code gen chat actor to output code in inline md code blocks Simplify code gen chat actor to improve correct code gen success, especially for smaller models & models with limited json mode support Allow specifying code blocks inline with reasoning to try to improve code quality Infer input files based on user file paths referenced in code. --- src/khoj/processor/conversation/prompts.py | 111 +++++++++++++++++---- src/khoj/processor/tools/run_code.py | 41 +++++--- 2 files changed, 119 insertions(+), 33 deletions(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 63cf028f..890b791c 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -976,8 +976,7 @@ python_code_generation_prompt = PromptTemplate.from_template( You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query. - The python program will run in a sandbox with no network access. - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query. -- List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field. -- The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path). +- The python program should be self-contained. It can only read data generated by the program itself and any user file paths referenced in your program. - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead. - Write any document, charts etc. to be shared with the user to file. 
These files can be seen by the user. - Use as much context from the previous questions and answers as required to generate your code. @@ -988,24 +987,99 @@ Current Date: {current_date} User's Location: {location} {username} -The response JSON schema is of the form {{"code": "", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}} -Examples: +Your response should contain python code wrapped in markdown code blocks (i.e starting with```python and ending with ```) +Example 1: --- -{{ -"code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")" -}} +Q: Calculate the interest earned and final amount for a principal of $43,235 invested at a rate of 5.24 percent for 5 years. +A: Ok, to calculate the interest earned and final amount, we can use the formula for compound interest: $T = P(1 + r/n)^{{nt}}$, +where T: total amount, P: principal, r: interest rate, n: number of times interest is compounded per year, and t: time in years. -{{ -"code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')", -"input_files": ["/home/linux/tasks.org"] -}} +Let's write the Python program to calculate this. 
-{{ -"code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')", -"input_links": ["https://population.un.org/world_population_by_year.csv"] -}} +```python +# Input values +principal = 43235 +rate = 5.24 +years = 5 + +# Convert rate to decimal +rate_decimal = rate / 100 + +# Calculate final amount +final_amount = principal * (1 + rate_decimal) ** years + +# Calculate interest earned +interest_earned = final_amount - principal + +# Print results with formatting +print(f"Interest Earned: ${{interest_earned:,.2f}}") +print(f"Final Amount: ${{final_amount:,.2f}}") +``` + +Example 2: +--- +Q: Simplify first, then evaluate: $-7x+2(x^{{2}}-1)-(2x^{{2}}-x+3)$, where $x=1$. +A: Certainly! Let's break down the problem step-by-step and utilize Python with SymPy to simplify and evaluate the expression. + +1. **Expression Simplification:** + We start with the expression \\(-7x + 2(x^2 - 1) - (2x^2 - x + 3)\\). + +2. **Substitute \\(x=1\\) into the simplified expression:** + Once simplified, we will substitute \\(x=1\\) into the expression to find its value. 
+ +Let's implement this in Python using SymPy (as the package is available in the sandbox): + +```python +import sympy as sp + +# Define the variable +x = sp.symbols('x') + +# Define the expression +expression = -7*x + 2*(x**2 - 1) - (2*x**2 - x + 3) + +# Simplify the expression +simplified_expression = sp.simplify(expression) + +# Substitute x = 1 into the simplified expression +evaluated_expression = simplified_expression.subs(x, 1) + +# Print the simplified expression and its evaluated value +print(\"Simplified Expression:\", simplified_expression) +print(\"Evaluated Expression at x=1:\", evaluated_expression) +``` + +Example 3: +--- +Q: Plot the world ppulation growth over the years, given this year, world population world tuples: [(2000, 6), (2001, 7), (2002, 8), (2003, 9), (2004, 10)]. +A: Absolutely! We can utilize the Pandas and Matplotlib libraries (as both are available in the sandbox) to create the world population growth plot. +```python +import pandas as pd +import matplotlib.pyplot as plt + +# Create a DataFrame of world population from the provided data +data = {{ + 'Year': [2000, 2001, 2002, 2003, 2004], + 'Population': [6, 7, 8, 9, 10] +}} +df = pd.DataFrame(data) + +# Plot the data +plt.figure(figsize=(10, 6)) +plt.plot(df['Year'], df['Population'], marker='o') + +# Add titles and labels +plt.title('Population by Year') +plt.xlabel('Year') +plt.ylabel('Population') + +# Save the plot to a file +plt.savefig('population_by_year_plot.png') +``` + +Now it's your turn to construct a python program to answer the user's query using the provided context and coversation provided below. +Ensure you include the python code to execute and wrap it in a markdown code block. -Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else. 
Context: --- {context} @@ -1014,8 +1088,9 @@ Chat History: --- {chat_history} -User: {query} -Khoj: +User Query: +--- +{query} """.strip() ) diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index af1f0ffd..5c6cb48d 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -4,10 +4,12 @@ import datetime import logging import mimetypes import os +import re from pathlib import Path from typing import Any, Callable, List, NamedTuple, Optional import aiohttp +from asgiref.sync import sync_to_async from httpx import RemoteProtocolError from tenacity import ( before_sleep_log, @@ -24,7 +26,6 @@ from khoj.processor.conversation.utils import ( ChatEvent, clean_code_python, construct_chat_history, - load_complex_json, ) from khoj.routers.helpers import send_message_to_model_wrapper from khoj.utils.helpers import ( @@ -43,8 +44,7 @@ SANDBOX_URL = os.getenv("KHOJ_TERRARIUM_URL", "http://localhost:8080") class GeneratedCode(NamedTuple): code: str - input_files: List[str] - input_links: List[str] + input_files: List[FileObject] async def run_code( @@ -82,13 +82,10 @@ async def run_code( # Prepare Input Data input_data = [] - user_input_files: List[FileObject] = [] - for input_file in generated_code.input_files: - user_input_files += await FileObjectAdapters.aget_file_objects_by_name(user, input_file) - for f in user_input_files: + for f in generated_code.input_files: input_data.append( { - "filename": os.path.basename(f.file_name), + "filename": f.file_name, "b64_data": base64.b64encode(f.raw_text.encode("utf-8")).decode("utf-8"), } ) @@ -155,21 +152,35 @@ async def generate_python_code( response = await send_message_to_model_wrapper( code_generation_prompt, query_images=query_images, - response_type="json_object", user=user, tracer=tracer, query_files=query_files, ) - # Validate that the response is a non-empty, JSON-serializable list - response = load_complex_json(response) - code = 
response.get("code", "").strip() - input_files = response.get("input_files", []) - input_links = response.get("input_links", []) + # Extract python code wrapped in markdown code blocks from the response + code_blocks = re.findall(r"```(?:python)?\n(.*?)\n```", response, re.DOTALL) + + if not code_blocks: + raise ValueError("No Python code blocks found in response") + + # Join multiple code blocks with newlines and strip any leading/trailing whitespace + code = "\n".join(code_blocks).strip() if not isinstance(code, str) or is_none_or_empty(code): raise ValueError - return GeneratedCode(code, input_files, input_links) + + # Infer user files required in sandbox based on user file paths mentioned in code + input_files: List[FileObject] = [] + user_files = await sync_to_async(set)(FileObjectAdapters.get_all_file_objects(user)) + for user_file in user_files: + if user_file.file_name in code: + # Replace references to full file path used in code with just the file basename to ease reference in sandbox + file_basename = os.path.basename(user_file.file_name) + code = code.replace(user_file.file_name, file_basename) + user_file.file_name = file_basename + input_files.append(user_file) + + return GeneratedCode(code, input_files) @retry(