From 59e0e092b09db18f216071e4fbf253950fa5cc83 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 10 May 2025 16:27:53 -0600 Subject: [PATCH] Remove deprecated prompt for grounding model to choose goto, back func Goto and back functions are chosen by the visual reasoning model for increased reliability in selecting those tools. The ui-tars grounding models seem too tuned to use a specific set of tools. --- src/khoj/processor/operator/grounding_agent_uitars.py | 3 --- src/khoj/processor/operator/operate_browser.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/operator/grounding_agent_uitars.py b/src/khoj/processor/operator/grounding_agent_uitars.py index 4ba2578b..1646cd5a 100644 --- a/src/khoj/processor/operator/grounding_agent_uitars.py +++ b/src/khoj/processor/operator/grounding_agent_uitars.py @@ -38,7 +38,6 @@ class GroundingAgentUitars: UITARS_USR_PROMPT_THOUGHT = """ You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task. You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar. - ALWAYS use the `goto()` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed. ## Output Format ``` @@ -66,8 +65,6 @@ class GroundingAgentUitars: type(content='') #If you want to submit your input, use "\\n" at the end of `content`. scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') wait() #Sleep for 5s and take a screenshot to check for any changes. - goto(url='') # ALWAYS use the goto function to navigate to a specific URL. - back() # Use this to go back to the previous page. finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. 
""".lstrip() diff --git a/src/khoj/processor/operator/operate_browser.py b/src/khoj/processor/operator/operate_browser.py index cfd677e7..ce69a6fd 100644 --- a/src/khoj/processor/operator/operate_browser.py +++ b/src/khoj/processor/operator/operate_browser.py @@ -79,7 +79,7 @@ async def operate_browser( with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger): while iterations < max_iterations and not task_completed: if cancellation_event and cancellation_event.is_set(): - logger.info(f"Browser operator cancelled by client disconnect") + logger.debug(f"Browser operator cancelled by client disconnect") break iterations += 1 @@ -94,6 +94,7 @@ async def operate_browser( env_steps: List[EnvStepResult] = [] for action in agent_result.actions: if cancellation_event and cancellation_event.is_set(): + logger.debug(f"Browser operator cancelled by client disconnect") break # Handle request for user action and break the loop if isinstance(action, RequestUserAction):