mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Parse goto, back actions directly from instruction for uitars grounder
The UI-TARS grounder does not reliably call non-standard functions such as goto and back. Parse the visual reasoner's instruction directly to bypass the UI-TARS grounder model for these actions. Grounding isn't necessary for goto and back, so this works well.
This commit is contained in:
@@ -133,13 +133,19 @@ class GroundingAgentUitars:
|
||||
Suggest the next action(s) based on the instruction and current environment.
|
||||
"""
|
||||
messages = self._format_messages_for_api(instruction, env_state)
|
||||
|
||||
recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
|
||||
origin_resized_height = recent_screenshot.height
|
||||
origin_resized_width = recent_screenshot.width
|
||||
|
||||
prediction, parsed_responses = self.parse_instruction_to_action(
|
||||
instruction, origin_resized_height, origin_resized_width
|
||||
)
|
||||
|
||||
temperature = self.temperature
|
||||
top_k = self.top_k
|
||||
try_times = 3
|
||||
while True:
|
||||
while not parsed_responses:
|
||||
if try_times <= 0:
|
||||
print(f"Reach max retry times to fetch response from client, as error flag.")
|
||||
return "client error\nFAIL", []
|
||||
@@ -194,7 +200,7 @@ class GroundingAgentUitars:
|
||||
|
||||
return self._parse_action(parsed_responses, prediction)
|
||||
|
||||
def _parse_action(self, parsed_responses: dict, prediction: str) -> tuple[str, list[OperatorAction]]:
|
||||
def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]:
|
||||
"""
|
||||
Parse the model's prediction into actions and return the result.
|
||||
"""
|
||||
@@ -505,7 +511,7 @@ class GroundingAgentUitars:
|
||||
all_action.append(action_str)
|
||||
|
||||
parsed_actions = [self.parse_action_string(action.replace("\n", "\\n").lstrip()) for action in all_action]
|
||||
actions = []
|
||||
actions: list[dict] = []
|
||||
for action_instance, raw_str in zip(parsed_actions, all_action):
|
||||
if action_instance == None:
|
||||
print(f"Action can't parse: {raw_str}")
|
||||
@@ -913,6 +919,29 @@ class GroundingAgentUitars:
|
||||
|
||||
return pyautogui_code
|
||||
|
||||
def parse_instruction_to_action(
    self, instruction: str, origin_resized_height: int, origin_resized_width: int
) -> tuple[str, list[dict]]:
    """
    Parse instruction into action with simple string match for GOTO and BACK actions.

    Useful for actions that do not need to invoke the visual grounding model.

    Args:
        instruction: Instruction text to inspect. A "GOTO <URL>" directive is
            recognised on its last non-blank line; "BACK" is recognised at the
            very end of the text.
        origin_resized_height: Screenshot height forwarded unchanged to
            ``parse_action_to_structure_output``.
        origin_resized_width: Screenshot width forwarded unchanged to
            ``parse_action_to_structure_output``.

    Returns:
        ``(prediction, parsed_responses)``. When no GOTO/BACK directive is found
        (including empty instructions or a GOTO without a URL) this is
        ``(None, [])`` so the caller can fall back to the grounding model.
    """
    prediction, parsed_responses = None, []

    # Empty / whitespace-only instructions have no last line to inspect;
    # indexing splitlines()[-1] on them would raise IndexError.
    stripped_instruction = instruction.strip() if instruction else ""
    if not stripped_instruction:
        return prediction, parsed_responses

    # handle GOTO <URL>, BACK actions at the end of the response.
    last_line = stripped_instruction.splitlines()[-1].strip()
    if last_line.startswith("GOTO"):
        # Take the URL from the matched line only, so a "GOTO" substring inside
        # the URL or in earlier reasoning text cannot truncate it.
        url = last_line[len("GOTO"):].strip()
        if not url:
            # Nothing to navigate to; let the grounding model handle it.
            return prediction, parsed_responses
        prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
    elif stripped_instruction.endswith("BACK"):
        prediction = "Thought: Let me go back to the previous page.\nAction: back()"
    else:
        return prediction, parsed_responses

    parsed_responses = self.parse_action_to_structure_output(
        prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
    )
    return prediction, parsed_responses
|
||||
|
||||
def add_box_token(self, input_string):
|
||||
# Step 1: Split the string into individual actions
|
||||
if "Action: " in input_string and "start_box=" in input_string:
|
||||
|
||||
@@ -106,18 +106,21 @@ class BinaryOperatorAgent(OperatorAgent):
|
||||
|
||||
# IMPORTANT
|
||||
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
||||
* Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL.
|
||||
* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else.
|
||||
* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
|
||||
* To navigate back to the previous page, end your response with "BACK" (without quotes).
|
||||
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
|
||||
|
||||
# Examples
|
||||
## Example 1
|
||||
- use the 'goto' function to navigate to https://example.com
|
||||
GOTO https://example.com
|
||||
## Example 2
|
||||
- 'click the blue login button located at the top right corner'
|
||||
click the blue login button located at the top right corner
|
||||
## Example 3
|
||||
- 'scroll down the page to find the contact section'
|
||||
scroll down the page
|
||||
## Example 4
|
||||
- 'type the username example@email.com into the input field labeled Username')
|
||||
type the username example@email.com into the input field labeled Username
|
||||
## Example 5
|
||||
DONE
|
||||
|
||||
# Instructions
|
||||
Now describe a single high-level action to take next to progress towards the user's goal in detail.
|
||||
|
||||
Reference in New Issue
Block a user