Allow visual grounder of binary operator agent to see past actions

Previously the grounding agent was reset on every call, so it only saw the most recent instruction and screenshot when making its next action suggestion. This change lets the visual grounder see past instructions and actions, which helps prevent looping and encourages more exploratory action suggestions when it is stuck or sees errors. The grounding agent is now reset only as part of the binary operator agent's own reset, which the browser operator calls once the operation finishes.
2026-03-02 21:19:12 +00:00 · 2025-05-08 20:27:09 -06:00
parent d8bc6239f8
commit 7395af3c3a
2 changed files with 3 additions and 1 deletions
--- a/src/khoj/processor/operator/operate_browser.py
+++ b/src/khoj/processor/operator/operate_browser.py
@@ -144,6 +144,8 @@ async def operate_browser(
    finally:
        if environment and not user_input_message:  # Don't close browser if user input required
            await environment.close()
+        if operator_agent:
+            operator_agent.reset()

    yield {
        "text": user_input_message or response,
--- a/src/khoj/processor/operator/operator_agent_binary.py
+++ b/src/khoj/processor/operator/operator_agent_binary.py
@@ -168,7 +168,6 @@ Focus on the visual action and provide all necessary context.
        actions: List[OperatorAction] = []
        action_results: List[dict] = []
        rendered_parts = [f"**Thought (Vision)**: {action_instruction}"]
-        self.grounding_agent.reset()  # Reset grounding agent state

        try:
            grounding_response, actions = await self.grounding_agent.act(action_instruction, current_state)
@@ -318,3 +317,4 @@ Focus on the visual action and provide all necessary context.
    def reset(self):
        """Reset the agent state."""
        super().reset()
+        self.grounding_agent.reset()  # Reset grounding agent state