diff --git a/src/khoj/processor/operator/grounding_agent_uitars.py b/src/khoj/processor/operator/grounding_agent_uitars.py index 33103e44..e8a4978d 100644 --- a/src/khoj/processor/operator/grounding_agent_uitars.py +++ b/src/khoj/processor/operator/grounding_agent_uitars.py @@ -133,13 +133,19 @@ class GroundingAgentUitars: Suggest the next action(s) based on the instruction and current environment. """ messages = self._format_messages_for_api(instruction, env_state) + recent_screenshot = Image.open(BytesIO(self.history_images[-1])) origin_resized_height = recent_screenshot.height origin_resized_width = recent_screenshot.width + + prediction, parsed_responses = self.parse_instruction_to_action( + instruction, origin_resized_height, origin_resized_width + ) + temperature = self.temperature top_k = self.top_k try_times = 3 - while True: + while not parsed_responses: if try_times <= 0: print(f"Reach max retry times to fetch response from client, as error flag.") return "client error\nFAIL", [] @@ -194,7 +200,7 @@ class GroundingAgentUitars: return self._parse_action(parsed_responses, prediction) - def _parse_action(self, parsed_responses: dict, prediction: str) -> tuple[str, list[OperatorAction]]: + def _parse_action(self, parsed_responses: list[dict], prediction: str) -> tuple[str, list[OperatorAction]]: """ Parse the model's prediction into actions and return the result. 
def parse_instruction_to_action(
    self, instruction: str, origin_resized_height: int, origin_resized_width: int
) -> "tuple[str | None, list[dict]]":
    """
    Turn a trailing GOTO or BACK directive into an action via plain string matching.

    Useful for actions that do not need to invoke the visual grounding model.

    Recognized directives:
      * GOTO followed by a URL, on the last non-blank line of the instruction.
      * BACK as the final whitespace-delimited word of the instruction.

    Args:
        instruction: Text to inspect for a trailing directive.
        origin_resized_height: Screenshot height, forwarded unchanged to
            parse_action_to_structure_output.
        origin_resized_width: Screenshot width, forwarded unchanged to
            parse_action_to_structure_output.

    Returns:
        A (prediction, parsed_responses) pair. When no directive is recognized the
        pair is (None, []), so callers can test the list for truthiness and fall
        back to another way of producing the action.
    """
    words = instruction.split() if instruction else []
    if not words:
        # Empty or whitespace-only text has no last line to inspect.
        return None, []

    # strip() guarantees the final line is non-blank, so it has at least one word.
    last_line = instruction.strip().splitlines()[-1].strip()
    directive = last_line.split(maxsplit=1)

    if directive[0] == "GOTO" and len(directive) == 2:
        # Take the URL from the matched line itself, so a URL that happens to
        # contain "GOTO" is kept whole. Percent-encode single quotes because the
        # URL is embedded in a single-quoted literal below.
        url = directive[1].strip().replace("'", "%27")
        prediction = f"Thought: Let me go to {url}\nAction: goto(url='{url}')"
    elif words[-1] == "BACK":
        # Compare the whole final word so text ending in e.g. "FEEDBACK" is not
        # mistaken for a request to navigate back.
        prediction = "Thought: Let me go back to the previous page.\nAction: back()"
    else:
        return None, []

    parsed_responses = self.parse_action_to_structure_output(
        prediction, origin_resized_height, origin_resized_width, self.max_pixels, self.min_pixels
    )
    return prediction, parsed_responses
BinaryOperatorAgent(OperatorAgent): # IMPORTANT * You are allowed upto {self.max_iterations} iterations to complete the task. -* Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL. -* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else. +* To navigate to a specific URL, put "GOTO " (without quotes) on the last line of your response. +* To navigate back to the previous page, end your response with "BACK" (without quotes). +* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes). # Examples ## Example 1 -- use the 'goto' function to navigate to https://example.com +GOTO https://example.com ## Example 2 -- 'click the blue login button located at the top right corner' +click the blue login button located at the top right corner ## Example 3 -- 'scroll down the page to find the contact section' +scroll down the page ## Example 4 -- 'type the username example@email.com into the input field labeled Username') +type the username example@email.com into the input field labeled Username +## Example 5 +DONE # Instructions Now describe a single high-level action to take next to progress towards the user's goal in detail.