Parse goto, back actions directly from instruction for uitars grounder

The UI-TARS grounder doesn't like calling non-standard functions like
goto and back.

Directly parse the visual reasoner's instruction to bypass the UI-TARS
grounder model.

At least for the goto and back functions, grounding isn't necessary, so
this works well.
This commit is contained in:
Debanjum
2025-05-08 21:31:01 -06:00
parent 7395af3c3a
commit ffe58d2ec1
2 changed files with 41 additions and 9 deletions

View File

@@ -133,13 +133,19 @@ class GroundingAgentUitars:
Suggest the next action(s) based on the instruction and current environment.
"""
messages = self._format_messages_for_api(instruction, env_state)
recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
origin_resized_height = recent_screenshot.height
origin_resized_width = recent_screenshot.width
prediction, parsed_responses = self.parse_instruction_to_action(
instruction, origin_resized_height, origin_resized_width
)
temperature = self.temperature
top_k = self.top_k
try_times = 3
while True:
while not parsed_responses:
if try_times <= 0:
print(f"Reach max retry times to fetch response from client, as error flag.")
return "client error\nFAIL", []
@@ -194,7 +200,7 @@ class GroundingAgentUitars:
return self._parse_action(parsed_responses, prediction)
def _parse_action(self, parsed_responses: dict, prediction: str) -> tuple[str, list[OperatorAction]]:
def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]:
"""
Parse the model's prediction into actions and return the result.
"""
@@ -505,7 +511,7 @@ class GroundingAgentUitars:
all_action.append(action_str)
parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action]
actions = []
actions: list[dict] = []
for action_instance, raw_str in zip(parsed_actions, all_action):
if action_instance == None:
print(f"Action can't parse: {raw_str}")
@@ -913,6 +919,29 @@ class GroundingAgentUitars:
return pyautogui_code
def parse_instruction_to_action(
    self, instruction: str, origin_resized_height: int, origin_resized_width: int
) -> tuple[str, list[dict]]:
    """
    Parse instruction into action with simple string match for GOTO and BACK actions.
    Useful for actions that do not need to invoke the visual grounding model.

    Args:
        instruction: Text to inspect for a trailing "GOTO <URL>" or "BACK" directive.
        origin_resized_height: Screenshot height forwarded to parse_action_to_structure_output.
        origin_resized_width: Screenshot width forwarded to parse_action_to_structure_output.

    Returns:
        A (prediction, parsed_responses) pair. When the instruction is empty or carries
        neither directive, prediction is None and parsed_responses is an empty list.
    """
    prediction, parsed_responses = None, []

    stripped_instruction = instruction.strip()
    # An empty or whitespace-only instruction has no last line to inspect;
    # indexing splitlines()[-1] on it would raise IndexError.
    if not stripped_instruction:
        return prediction, parsed_responses

    # handle GOTO <URL>, BACK actions at the end of the response.
    last_line = stripped_instruction.splitlines()[-1].strip()
    if last_line.startswith("GOTO"):
        # Read the URL from the matched line only. Splitting the whole instruction on
        # "GOTO" would truncate any URL that itself contains that substring.
        url = last_line[len("GOTO"):].strip()
        prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
    elif stripped_instruction.endswith("BACK"):
        prediction = "Thought: Let me go back to the previous page.\nAction: back()"

    # Only a matched directive is turned into structured output.
    if prediction is not None:
        parsed_responses = self.parse_action_to_structure_output(
            prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
        )

    return prediction, parsed_responses
def add_box_token(self, input_string):
# Step 1: Split the string into individual actions
if "Action: " in input_string and "start_box=" in input_string:

View File

@@ -106,18 +106,21 @@ class BinaryOperatorAgent(OperatorAgent):
# IMPORTANT
* You are allowed upto {self.max_iterations} iterations to complete the task.
* Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL.
* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else.
* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
* To navigate back to the previous page, end your response with "BACK" (without quotes).
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
# Examples
## Example 1
- use the 'goto' function to navigate to https://example.com
GOTO https://example.com
## Example 2
- 'click the blue login button located at the top right corner'
click the blue login button located at the top right corner
## Example 3
- 'scroll down the page to find the contact section'
scroll down the page
## Example 4
- 'type the username example@email.com into the input field labeled Username')
type the username example@email.com into the input field labeled Username
## Example 5
DONE
# Instructions
Now describe a single high-level action to take next to progress towards the user's goal in detail.