Parse goto, back actions directly from instruction for uitars grounder

The UI-TARS grounder doesn't like calling non-standard functions like goto and back. Directly parse the visual reasoner's instruction to bypass the UI-TARS grounder model for these actions. Grounding isn't necessary for the goto and back functions, so this works well.
2026-03-02 21:19:12 +00:00 · 2025-05-08 21:31:01 -06:00
parent 7395af3c3a
commit ffe58d2ec1
2 changed files with 41 additions and 9 deletions
--- a/src/khoj/processor/operator/grounding_agent_uitars.py
+++ b/src/khoj/processor/operator/grounding_agent_uitars.py
@@ -133,13 +133,19 @@ class GroundingAgentUitars:
        Suggest the next action(s) based on the instruction and current environment.
        """
        messages = self._format_messages_for_api(instruction, env_state)
+
        recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
        origin_resized_height = recent_screenshot.height
        origin_resized_width = recent_screenshot.width
+
+        prediction, parsed_responses = self.parse_instruction_to_action(
+            instruction, origin_resized_height, origin_resized_width
+        )
+
        temperature = self.temperature
        top_k = self.top_k
        try_times = 3
-        while True:
+        while not parsed_responses:
            if try_times <= 0:
                print(f"Reach max retry times to fetch response from client, as error flag.")
                return "client error\nFAIL", []
@@ -194,7 +200,7 @@ class GroundingAgentUitars:

        return self._parse_action(parsed_responses, prediction)

-    def _parse_action(self, parsed_responses: dict, prediction: str) -> tuple[str, list[OperatorAction]]:
+    def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]:
        """
        Parse the model's prediction into actions and return the result.
        """
@@ -505,7 +511,7 @@ class GroundingAgentUitars:
            all_action.append(action_str)

        parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action]
-        actions = []
+        actions: list[dict] = []
        for action_instance, raw_str in zip(parsed_actions, all_action):
            if action_instance == None:
                print(f"Action can't parse: {raw_str}")
@@ -913,6 +919,29 @@ class GroundingAgentUitars:

        return pyautogui_code

+    def parse_instruction_to_action(
+        self, instruction: str, origin_resized_height: int, origin_resized_width: int
+    ) -> tuple[str, list[dict]]:
+        """
+        Parse instruction into action with simple string match for GOTO and BACK actions.
+
+        Useful for actions that do not need to invoke the visual grounding model.
+        Returns (None, []) when the last line neither starts with GOTO nor ends with BACK.
+        """
+        # A blank instruction has no last line; guard so indexing [-1] cannot raise IndexError.
+        lines = instruction.strip().splitlines()
+        last_line = lines[-1].strip() if lines else ""
+        if last_line.startswith("GOTO"):
+            # Take the URL from the last line only, so a "GOTO" inside the URL cannot truncate it.
+            url = last_line[len("GOTO") :].strip()
+            prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
+        elif last_line.endswith("BACK"):
+            prediction = "Thought: Let me go back to the previous page.\nAction: back()"
+        else:
+            return None, []
+        args = (prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels)
+        return prediction, self.parse_action_to_structure_output(*args)
+
    def add_box_token(self, input_string):
        # Step 1: Split the string into individual actions
        if "Action: " in input_string and "start_box=" in input_string:
--- a/src/khoj/processor/operator/operator_agent_binary.py
+++ b/src/khoj/processor/operator/operator_agent_binary.py
@@ -106,18 +106,21 @@ class BinaryOperatorAgent(OperatorAgent):

 # IMPORTANT
 * You are allowed upto {self.max_iterations} iterations to complete the task.
-* Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL.
-* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else.
+* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
+* To navigate back to the previous page, end your response with "BACK" (without quotes).
+* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).

 # Examples
 ## Example 1
- use the 'goto' function to navigate to https://example.com
+GOTO https://example.com
 ## Example 2
- 'click the blue login button located at the top right corner'
+click the blue login button located at the top right corner
 ## Example 3
- 'scroll down the page to find the contact section'
+scroll down the page
 ## Example 4
- 'type the username example@email.com into the input field labeled Username')
+type the username example@email.com into the input field labeled Username
+## Example 5
+DONE

 # Instructions
 Now describe a single high-level action to take next to progress towards the user's goal in detail.