From 9f3fbf9021ec035bec3709f0eccfdc0fd72ec572 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 10 May 2025 16:33:44 -0600 Subject: [PATCH] Encourage reasoner, grounder to work better together in binary operator - Encourage grounder to adhere to the reasoner's action instruction - Encourage reasoner to explore other actions when stuck in a loop Previously the prompt seemed to force the reasoner too strongly to choose the "single most important" next action, so it may not have been exploring other actions to achieve the objective after an initial failure. --- src/khoj/processor/conversation/prompts.py | 2 +- src/khoj/processor/operator/grounding_agent_uitars.py | 1 + src/khoj/processor/operator/operator_agent_binary.py | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 988629cc..383bfe71 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -1121,7 +1121,7 @@ terrarium_sandbox_context = """ operator_execution_context = PromptTemplate.from_template( """ -Use the provided context from operating a browser to inform your response. +Use the results of operating a web browser to inform your response. Browser Operation Results: {operator_results} diff --git a/src/khoj/processor/operator/grounding_agent_uitars.py b/src/khoj/processor/operator/grounding_agent_uitars.py index 1646cd5a..21a3109d 100644 --- a/src/khoj/processor/operator/grounding_agent_uitars.py +++ b/src/khoj/processor/operator/grounding_agent_uitars.py @@ -38,6 +38,7 @@ class GroundingAgentUitars: UITARS_USR_PROMPT_THOUGHT = """ You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task. You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar. 
+ Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction. ## Output Format ``` diff --git a/src/khoj/processor/operator/operator_agent_binary.py b/src/khoj/processor/operator/operator_agent_binary.py index c506070e..4010b84e 100644 --- a/src/khoj/processor/operator/operator_agent_binary.py +++ b/src/khoj/processor/operator/operator_agent_binary.py @@ -90,16 +90,17 @@ class BinaryOperatorAgent(OperatorAgent): """ reasoning_system_prompt = f""" # Introduction -* You are Khoj, a smart web browsing assistant. You help the user accomplish their task using a web browser. +* You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser. * You are given the user's query and screenshots of the browser's state transitions. * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. * The current URL is {current_state.url}. # Your Task * First look at the screenshots carefully to notice all pertinent information. -* Then instruct a tool AI to perform the single most important next action to progress towards the user's goal. +* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal. * Make sure you scroll down to see everything before deciding something isn't available. * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail. +* Use your creativity to find alternate ways to make progress if you get stuck at any point. # Tool AI Capabilities * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.