Render screenshot in train of thought on openai agent screenshot action

This commit is contained in:
Debanjum
2025-05-04 00:24:25 -06:00
parent 78e052bfcb
commit e71575ad1a
2 changed files with 38 additions and 2 deletions

View File

@@ -352,7 +352,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
// Render screenshot image in screenshot action message
let screenshotData = null;
try {
const jsonMatch = message.match(/\{"action": "screenshot".*\}/);
const jsonMatch = message.match(/\{.*("action": "screenshot"|"type": "screenshot").*\}/);
if (jsonMatch) {
screenshotData = JSON.parse(jsonMatch[0]);
const screenshotHtmlString = `<img src="${screenshotData.image}" alt="State of browser" class="max-w-full" />`;

View File

@@ -720,6 +720,39 @@ class OpenAIOperatorAgent(OperatorAgent):
compiled_response.append(f"**Thought**: {block.summary}")
return "\n- ".join(filter(None, compiled_response)) # Filter out empty strings
@staticmethod
async def render_response(response_content: list[ResponseOutputItem], screenshot: Optional[str] = None) -> str:
"""Render OpenAI response for display, potentially including screenshots."""
compiled_response = [""]
for block in deepcopy(response_content): # Use deepcopy to avoid modifying original
if block.type == "message":
text_content = block.text if hasattr(block, "text") else block.model_dump_json()
compiled_response.append(text_content)
elif block.type == "function_call":
block_input = {"action": block.name}
if block.name == "goto":
try:
args = json.loads(block.arguments)
block_input["url"] = args.get("url", "[Missing URL]")
except json.JSONDecodeError:
block_input["arguments"] = block.arguments
compiled_response.append(f"**Action**: {json.dumps(block_input)}")
elif block.type == "computer_call":
block_input = block.action
# If it's a screenshot action and we have a screenshot, render it
if block_input.type == "screenshot":
block_input_render = block_input.model_dump()
if screenshot:
block_input_render["image"] = f"data:image/webp;base64,{screenshot}"
else:
block_input_render["image"] = "[Failed to get screenshot]"
compiled_response.append(f"**Action**: {json.dumps(block_input_render)}")
else:
compiled_response.append(f"**Action**: {block_input.model_dump_json()}")
elif block.type == "reasoning" and block.summary:
compiled_response.append(f"**Thought**: {block.summary}")
return "\n- ".join(filter(None, compiled_response))
class AnthropicOperatorAgent(OperatorAgent):
async def act(self, messages: List[dict], current_state: EnvState) -> AgentActResult:
@@ -1068,7 +1101,10 @@ async def operate_browser(
rendered_response = await operator_agent.render_response(
agent_result.raw_agent_response.content, browser_state.screenshot
)
elif chat_model.model_type == ChatModel.ModelType.OPENAI:
rendered_response = await operator_agent.render_response(
agent_result.raw_agent_response.output, browser_state.screenshot
)
if send_status_func:
async for event in send_status_func(f"**Operating Browser**:\n{rendered_response}"):
yield {ChatEvent.STATUS: event}