Parse goto, back actions directly from instruction for uitars grounder

The UI-TARS grounder doesn't handle calls to non-standard functions
like goto and back well.

Directly parse the visual reasoner's instruction to bypass the UI-TARS
grounder model for these actions.

At least for the goto and back functions, grounding isn't necessary,
so this works well.
This commit is contained in:
Debanjum
2025-05-08 21:31:01 -06:00
parent 7395af3c3a
commit ffe58d2ec1
2 changed files with 41 additions and 9 deletions

View File

@@ -133,13 +133,19 @@ class GroundingAgentUitars:
Suggest the next action(s) based on the instruction and current environment. Suggest the next action(s) based on the instruction and current environment.
""" """
messages = self._format_messages_for_api(instruction, env_state) messages = self._format_messages_for_api(instruction, env_state)
recent_screenshot = Image.open(BytesIO(self.history_images[-1])) recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
origin_resized_height = recent_screenshot.height origin_resized_height = recent_screenshot.height
origin_resized_width = recent_screenshot.width origin_resized_width = recent_screenshot.width
prediction, parsed_responses = self.parse_instruction_to_action(
instruction, origin_resized_height, origin_resized_width
)
temperature = self.temperature temperature = self.temperature
top_k = self.top_k top_k = self.top_k
try_times = 3 try_times = 3
while True: while not parsed_responses:
if try_times <= 0: if try_times <= 0:
print(f"Reach max retry times to fetch response from client, as error flag.") print(f"Reach max retry times to fetch response from client, as error flag.")
return "client error\nFAIL", [] return "client error\nFAIL", []
@@ -194,7 +200,7 @@ class GroundingAgentUitars:
return self._parse_action(parsed_responses, prediction) return self._parse_action(parsed_responses, prediction)
def _parse_action(self, parsed_responses: dict, prediction: str) -> tuple[str, list[OperatorAction]]: def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]:
""" """
Parse the model's prediction into actions and return the result. Parse the model's prediction into actions and return the result.
""" """
@@ -505,7 +511,7 @@ class GroundingAgentUitars:
all_action.append(action_str) all_action.append(action_str)
parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action] parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action]
actions = [] actions: list[dict] = []
for action_instance, raw_str in zip(parsed_actions, all_action): for action_instance, raw_str in zip(parsed_actions, all_action):
if action_instance == None: if action_instance == None:
print(f"Action can't parse: {raw_str}") print(f"Action can't parse: {raw_str}")
@@ -913,6 +919,29 @@ class GroundingAgentUitars:
return pyautogui_code return pyautogui_code
def parse_instruction_to_action(
    self, instruction: str, origin_resized_height: int, origin_resized_width: int
) -> tuple[str, list[dict]]:
    """
    Parse instruction into action with simple string match for GOTO and BACK actions.
    Useful for actions that do not need to invoke the visual grounding model.

    Only the last non-empty line of the instruction is inspected:
    - A last line of the form "GOTO <URL>" becomes a `goto(url='<URL>')` action.
    - A last line whose final word is exactly "BACK" becomes a `back()` action.

    Args:
        instruction: Free-form instruction text from the visual reasoner.
        origin_resized_height: Height passed through to parse_action_to_structure_output.
        origin_resized_width: Width passed through to parse_action_to_structure_output.

    Returns:
        (prediction, parsed_responses). When the instruction is empty or does not
        match a directly parseable action, returns (None, []) so the caller can
        fall back to another way of producing the action.
    """
    # Guard against empty / whitespace-only instructions: indexing the last line
    # of an empty splitlines() result would raise IndexError.
    lines = instruction.strip().splitlines() if instruction else []
    if not lines:
        return None, []
    last_line = lines[-1].strip()

    if last_line.startswith("GOTO"):
        # Take the URL from the last line only, after the leading keyword, so a URL
        # that itself contains "GOTO" is not truncated.
        url = last_line.removeprefix("GOTO").strip()
        if not url:
            # A bare "GOTO" carries no destination; do not emit goto(url='').
            return None, []
        prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
    else:
        # Compare the final whole word so words that merely end in "BACK"
        # (e.g. "FEEDBACK") are not mistaken for the BACK action.
        words = last_line.split()
        if not words or words[-1] != "BACK":
            return None, []
        prediction = "Thought: Let me go back to the previous page.\nAction: back()"

    parsed_responses = self.parse_action_to_structure_output(
        prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
    )
    return prediction, parsed_responses
def add_box_token(self, input_string): def add_box_token(self, input_string):
# Step 1: Split the string into individual actions # Step 1: Split the string into individual actions
if "Action: " in input_string and "start_box=" in input_string: if "Action: " in input_string and "start_box=" in input_string:

View File

@@ -106,18 +106,21 @@ class BinaryOperatorAgent(OperatorAgent):
# IMPORTANT # IMPORTANT
* You are allowed upto {self.max_iterations} iterations to complete the task. * You are allowed upto {self.max_iterations} iterations to complete the task.
* Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL. * To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else. * To navigate back to the previous page, end your response with "BACK" (without quotes).
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
# Examples # Examples
## Example 1 ## Example 1
- use the 'goto' function to navigate to https://example.com GOTO https://example.com
## Example 2 ## Example 2
- 'click the blue login button located at the top right corner' click the blue login button located at the top right corner
## Example 3 ## Example 3
- 'scroll down the page to find the contact section' scroll down the page
## Example 4 ## Example 4
- 'type the username example@email.com into the input field labeled Username') type the username example@email.com into the input field labeled Username
## Example 5
DONE
# Instructions # Instructions
Now describe a single high-level action to take next to progress towards the user's goal in detail. Now describe a single high-level action to take next to progress towards the user's goal in detail.