Add action results for multiple actions similar to other operator agents

Adds the result of each action as a separate item in the message content.
Previously we were adding this as a single larger text blob. This
change adds structure to simplify post-processing (e.g. truncation).

The updated add_action_results should also require less work to
generalize if we pass tool call history to the grounding model as
action results in valid OpenAI format.
This commit is contained in:
Debanjum
2025-05-08 11:02:19 -06:00
parent e17c06b798
commit 0d8fb667ec

View File

@@ -431,12 +431,11 @@ back() # Use this to go back to the previous page.
if action_to_run: if action_to_run:
actions.append(action_to_run) actions.append(action_to_run)
# Prepare action result structure (similar to OpenAIOperatorAgent)
action_results.append( action_results.append(
{ {
"type": "tool_result", "type": "tool_result",
"tool_call_id": tool_call.id, "tool_call_id": tool_call.id,
"content": None, # Updated by environment step "content": None, # Updated after environment step
} }
) )
rendered_parts.append(action_render_str) rendered_parts.append(action_render_str)
@@ -472,28 +471,30 @@ back() # Use this to go back to the previous page.
if not agent_action.action_results: if not agent_action.action_results:
return return
tool_outputs = []
for idx, env_step in enumerate(env_steps): for idx, env_step in enumerate(env_steps):
if idx < len(agent_action.action_results): # Ensure we don't go out of bounds result_content = env_step.error or env_step.output or "[Action completed]"
result_content = env_step.error or env_step.output or "[Action completed]" action_result = agent_action.action_results[idx]
tool_outputs.append(["Took screenshot" if env_step.type == "image" else json.dumps(result_content)]) if env_step.type == "image":
message = "**Action Result**: Took screenshot"
images = [f"data:image/png;base64,{convert_image_to_png(env_step.screenshot_base64)}"]
elif idx == len(env_steps) - 1:
message = f"**Action Result**: {json.dumps(result_content)}"
images = [f"data:image/png;base64,{convert_image_to_png(env_step.screenshot_base64)}"]
else: else:
logger.warning( message = f"**Action Result**: {json.dumps(result_content)}"
f"Mismatch between env_steps ({len(env_steps)}) and action_results ({len(agent_action.action_results)})" images = []
) action_result["content"] = construct_structured_message(
message=message,
# Append tool results message to history images=images,
if tool_outputs:
tool_outputs_list = "\n".join([f"- {idx}: {str(item)}" for idx, item in enumerate(tool_outputs)])
tool_outputs_str = "**Action Results**:\n" + tool_outputs_list
formatted_screenshot = f"data:image/png;base64,{convert_image_to_png(env_step.screenshot_base64)}"
tool_output_content = construct_structured_message(
message=tool_outputs_str,
images=[formatted_screenshot],
model_type=self.reasoning_model.model_type, model_type=self.reasoning_model.model_type,
vision_enabled=True, vision_enabled=True,
) )
self.messages.append(AgentMessage(role="environment", content=tool_output_content))
# Append action results to history
action_results_content = []
for action_result in agent_action.action_results:
action_results_content.extend(action_result["content"])
self.messages.append(AgentMessage(role="environment", content=action_results_content))
async def summarize(self, summarize_prompt: str, env_state: EnvState) -> str: async def summarize(self, summarize_prompt: str, env_state: EnvState) -> str:
conversation_history = self._format_message_for_api(self.messages) conversation_history = self._format_message_for_api(self.messages)