Improve termination on task completion for binary operator agent

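Only let the visual reasoner terminate the operator run. Previously the grounder was also able to trigger termination (via its `finished` action); that action is removed and the grounder is now required to always emit a tool call. Make catching the termination by the reasoner more robust by treating any response that ends with "DONE" (after stripping whitespace) as completion, instead of requiring an exact match.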
2026-03-02 21:19:12 +00:00 · 2025-05-08 08:24:44 -06:00
parent 680c226137
commit fd139d4708
1 changed files with 17 additions and 24 deletions
--- a/src/khoj/processor/operator/operator_agent_binary.py
+++ b/src/khoj/processor/operator/operator_agent_binary.py
@@ -59,7 +59,13 @@ class BinaryOperatorAgent(OperatorAgent):
        reasoner_response = await self.act_reason(query, current_state)
        natural_language_action = reasoner_response["message"]
        if reasoner_response["type"] == "error":
-            logger.error(f"Error in reasoning LLM: {natural_language_action}")
+            logger.error(natural_language_action)
+            return AgentActResult(
+                actions=[],
+                action_results=[],
+                rendered_response=natural_language_action,
+            )
+        elif reasoner_response["type"] == "done":
            return AgentActResult(
                actions=[],
                action_results=[],
@@ -73,7 +79,7 @@ class BinaryOperatorAgent(OperatorAgent):
        """
        Uses the reasoning LLM to determine the next high-level action based on the operation trajectory.
        """
-        vision_system_prompt = f"""
+        reasoning_system_prompt = f"""
 # Introduction
 * You are Khoj, a smart web browsing assistant. You help the user accomplish their task using a web browser.
 * You are given the user's query and screenshots of the browser's state transitions.
@@ -94,7 +100,7 @@ class BinaryOperatorAgent(OperatorAgent):
 # IMPORTANT
 * You are allowed upto {self.max_iterations} iterations to complete the task.
 * Explicitly tell the tool AI to use the `goto` function to navigate to a specific URL.
-* Once you've verified that the task has been completed, just say "DONE" (without the quotes). Do not say anything else.
+* Once you've verified that the main objective has been achieved, just say "DONE" (without the quotes). Do not say anything else.

 # Examples
 ## Example 1
@@ -112,10 +118,10 @@ Focus on the visual action and provide all necessary context.
 """.strip()

        if is_none_or_empty(self.messages):
-            query_text = query
+            query_text = f"**Main Objective**: {query}"
            query_screenshot = [f"data:image/png;base64,{convert_image_to_png(current_state.screenshot)}"]
            first_message_content = construct_structured_message(
-                message=query,
+                message=query_text,
                images=query_screenshot,
                model_type=self.vision_chat_model.model_type,
                vision_enabled=True,
@@ -132,7 +138,7 @@ Focus on the visual action and provide all necessary context.
            natural_language_action = await send_message_to_model_wrapper(
                query=query_text,
                query_images=query_screenshot,
-                system_message=vision_system_prompt,
+                system_message=reasoning_system_prompt,
                conversation_log=visual_reasoner_history,
                agent_chat_model=self.vision_chat_model,
                tracer=self.tracer,
@@ -140,13 +146,13 @@ Focus on the visual action and provide all necessary context.
            self.messages.append(current_message)
            self.messages.append(AgentMessage(role="assistant", content=natural_language_action))

-            if natural_language_action == "DONE":
+            if natural_language_action.strip().endswith("DONE"):
                return {"type": "done", "message": "Completed task."}

-            logger.info(f"Vision LLM suggested action: {natural_language_action}")
+            logger.info(f"Reasoning LLM suggested action: {natural_language_action}")

        except Exception as e:
-            return {"type": "error", "message": f"Error calling Vision LLM: {e}"}
+            return {"type": "error", "message": f"Error calling Reasoning LLM: {e}"}

        return {"type": "action", "message": natural_language_action}

@@ -158,7 +164,7 @@ Focus on the visual action and provide all necessary context.
        grounding_user_prompt = f"""
 You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
 You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
-Always use the `goto` function to navigate to a specific URL.
+Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.

 ## Output Format
 ```
@@ -178,7 +184,6 @@ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or rig
 wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
 goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
 back() # Use this to go back to the previous page.
-finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.

 ## Note
 - Use English in `Thought` part.
@@ -344,18 +349,6 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
                    "parameters": {"type": "object", "properties": {}},
                },
            },
-            {
-                "type": "function",
-                "function": {
-                    "name": "finished",
-                    "description": "If no further actions to take.",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {"content": {"type": "string", "description": "Text to type"}},
-                        "required": ["content"],
-                    },
-                },
-            },
        ]

        # Construct grounding LLM input (using only the latest user prompt + image)
@@ -381,7 +374,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
                model=self.grounding_chat_model.name,
                messages=grounding_messages_for_api,
                tools=grounding_tools,
-                tool_choice="auto",
+                tool_choice="required",
                temperature=0.0,  # Grounding should be precise
                max_tokens=1000,  # Allow for thoughts + actions
            )