Handle reasoning messages returned by the OpenAI CUA model

Documentation about this is currently limited and confusing, but it seems that a reasoning item should be kept only if it is followed by a computer_call, and dropped otherwise. Add a noop placeholder action result for each reasoning item to prevent the operator run from terminating when a response contains only reasoning.
2026-04-28 00:19:25 +00:00 · 2025-05-10 02:17:58 -06:00
parent 95f211d03c
commit 1442a4f6fb
1 changed files with 33 additions and 2 deletions
@@ -164,7 +164,16 @@ class OpenAIOperatorAgent(OperatorAgent):
                rendered_response["text"] = response.output_text
            elif block.type == "reasoning":
                actions.append(NoopAction())
-                action_results.append(block)
+                # Add placeholder action result for reasoning
+                # This is to prevent run termination.
+                # It will be removed later by add_action_results func
+                action_results.append(
+                    {
+                        "type": block.type,
+                        "id": block.id,
+                        "summary": [],
+                    }
+                )
            if action_to_run or content:
                actions.append(action_to_run)
            if action_to_run or content:
@@ -190,6 +199,7 @@ class OpenAIOperatorAgent(OperatorAgent):
            return

        # Update action results with results of applying suggested actions on the environment
+        items_to_pop = []
        for idx, env_step in enumerate(env_steps):
            action_result = agent_action.action_results[idx]
            result_content = env_step.error or env_step.output or "[Action completed]"
@@ -207,10 +217,16 @@ class OpenAIOperatorAgent(OperatorAgent):
                    "image_url": f"data:image/webp;base64,{env_step.screenshot_base64}",
                    "current_url": env_step.current_url,
                }
+            elif action_result["type"] == "reasoning":
+                items_to_pop.append(idx)  # Mark placeholder reasoning action result for removal
+                continue
            else:
                # Add text data
                action_result["output"] = result_content

+        for idx in reversed(items_to_pop):
+            agent_action.action_results.pop(idx)
+
        self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]

    def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
@@ -219,6 +235,21 @@ class OpenAIOperatorAgent(OperatorAgent):
        for message in messages:
            if message.role == "environment":
                if isinstance(message.content, list):
+                    # Remove reasoning message if not followed by computer call
+                    if (
+                        len(message.content) > 1
+                        and all(hasattr(item, "type") for item in message.content)
+                        and message.content[0].type == "reasoning"
+                        and message.content[1].type != "computer_call"
+                    ) or (
+                        len(message.content) == 1
+                        and all(hasattr(item, "type") for item in message.content)
+                        and message.content[0].type == "reasoning"
+                    ):
+                        logger.warning(
+                            f"Removing reasoning message not followed by a computer call action: {message.content}"
+                        )
+                        message.content.pop(0)
                    formatted_messages.extend(message.content)
                else:
                    logger.warning(f"Expected message content list from environment, got {type(message.content)}")
@@ -242,7 +273,7 @@ class OpenAIOperatorAgent(OperatorAgent):
        # Handle case where response_content is a dictionary and not ResponseOutputItem
        # This is the case when response_content contains action results
        if not hasattr(response_content[0], "type"):
-            return "**Action**: " + json.dumps(response_content[0]["output"])
+            return "**Action**: " + json.dumps(response_content[0].get("output", "Noop"))

        compiled_response = [""]
        for block in deepcopy(response_content):