Handle reasoning messages returned by the OpenAI CUA model

Documentation about this is currently limited and confusing, but it seems that a reasoning item should be kept if it is followed by a computer_call and dropped otherwise. Add a no-op placeholder for the reasoning item to prevent the operator run from terminating on a response that contains only reasoning.
2026-03-08 05:39:13 +00:00 · 2025-05-10 02:17:58 -06:00
parent 95f211d03c
commit 1442a4f6fb
1 changed files with 33 additions and 2 deletions
--- a/src/khoj/processor/operator/operator_agent_openai.py
+++ b/src/khoj/processor/operator/operator_agent_openai.py
@@ -164,7 +164,16 @@ class OpenAIOperatorAgent(OperatorAgent):
                rendered_response["text"] = response.output_text
            elif block.type == "reasoning":
                actions.append(NoopAction())
-                action_results.append(block)
+                # Add placeholder action result for reasoning
                # This is to prevent run termination.
                # It will be removed later by add_action_results func
                action_results.append(
                    {
                        "type": block.type,
                        "id": block.id,
                        "summary": [],
                    }
                )
            if action_to_run or content:
                actions.append(action_to_run)
            if action_to_run or content:
@@ -190,6 +199,7 @@ class OpenAIOperatorAgent(OperatorAgent):
            return
        # Update action results with results of applying suggested actions on the environment
        items_to_pop = []
        for idx, env_step in enumerate(env_steps):
            action_result = agent_action.action_results[idx]
            result_content = env_step.error or env_step.output or "[Action completed]"
@@ -207,10 +217,16 @@ class OpenAIOperatorAgent(OperatorAgent):
                    "image_url": f"data:image/webp;base64,{env_step.screenshot_base64}",
                    "current_url": env_step.current_url,
                }
            elif action_result["type"] == "reasoning":
                items_to_pop.append(idx)  # Mark placeholder reasoning action result for removal
                continue
            else:
                # Add text data
                action_result["output"] = result_content
        for idx in reversed(items_to_pop):
            agent_action.action_results.pop(idx)
        self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
    def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
@@ -219,6 +235,21 @@ class OpenAIOperatorAgent(OperatorAgent):
        for message in messages:
            if message.role == "environment":
                if isinstance(message.content, list):
                    # Remove reasoning message if not followed by computer call
                    if (
                        len(message.content) > 1
                        and all(hasattr(item, "type") for item in message.content)
                        and message.content[0].type == "reasoning"
                        and message.content[1].type != "computer_call"
                    ) or (
                        len(message.content) == 1
                        and all(hasattr(item, "type") for item in message.content)
                        and message.content[0].type == "reasoning"
                    ):
                        logger.warning(
                            f"Removing reasoning message not followed by a computer call action: {message.content}"
                        )
                        message.content.pop(0)
                    formatted_messages.extend(message.content)
                else:
                    logger.warning(f"Expected message content list from environment, got {type(message.content)}")
@@ -242,7 +273,7 @@ class OpenAIOperatorAgent(OperatorAgent):
        # Handle case where response_content is a dictionary and not ResponseOutputItem
        # This is the case when response_content contains action results
        if not hasattr(response_content[0], "type"):
-            return "**Action**: " + json.dumps(response_content[0]["output"])
+            return "**Action**: " + json.dumps(response_content[0].get("output", "Noop"))
        compiled_response = [""]
        for block in deepcopy(response_content):