Render screenshot in train of thought on browser screenshot action

Update the web app to render the screenshot image when the browser
operator takes a screenshot action
This commit is contained in:
Debanjum
2025-04-28 16:49:22 -06:00
parent 188b3c85ae
commit 08e93c64ab
2 changed files with 58 additions and 4 deletions

View File

@@ -31,6 +31,7 @@ import {
Shapes, Shapes,
Trash, Trash,
Toolbox, Toolbox,
Browser,
} from "@phosphor-icons/react"; } from "@phosphor-icons/react";
import DOMPurify from "dompurify"; import DOMPurify from "dompurify";
@@ -333,6 +334,10 @@ function chooseIconFromHeader(header: string, iconColor: string) {
return <Code className={`${classNames}`} />; return <Code className={`${classNames}`} />;
} }
if (compareHeader.includes("operating")) {
return <Browser className={`${classNames}`} />;
}
return <Brain className={`${classNames}`} />; return <Brain className={`${classNames}`} />;
} }
@@ -342,10 +347,27 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
let header = extractedHeader ? extractedHeader[1] : ""; let header = extractedHeader ? extractedHeader[1] : "";
const iconColor = props.primary ? convertColorToTextClass(props.agentColor) : "text-gray-500"; const iconColor = props.primary ? convertColorToTextClass(props.agentColor) : "text-gray-500";
const icon = chooseIconFromHeader(header, iconColor); const icon = chooseIconFromHeader(header, iconColor);
let markdownRendered = DOMPurify.sanitize(md.render(props.message)); let message = props.message;
// Remove any header tags from markdownRendered // Render screenshot image in screenshot action message
let screenshotData = null;
try {
const jsonMatch = message.match(/\{"action": "screenshot".*\}/);
if (jsonMatch) {
screenshotData = JSON.parse(jsonMatch[0]);
const screenshotHtmlString = `<img src="${screenshotData.image}" alt="State of browser" class="max-w-full" />`;
message = message.replace(jsonMatch[0], `Screenshot\n\n${screenshotHtmlString}`);
}
} catch (e) {
console.error("Failed to parse screenshot data", e);
}
// Render the sanitized train of thought as markdown
let markdownRendered = DOMPurify.sanitize(md.render(message));
// Remove any header tags from the rendered markdown
markdownRendered = markdownRendered.replace(/<h[1-6].*?<\/h[1-6]>/g, ""); markdownRendered = markdownRendered.replace(/<h[1-6].*?<\/h[1-6]>/g, "");
return ( return (
<div <div
className={`${styles.trainOfThoughtElement} break-words items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`} className={`${styles.trainOfThoughtElement} break-words items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}

View File

@@ -3,6 +3,7 @@ import base64
import json import json
import logging import logging
import os import os
from copy import deepcopy
from datetime import datetime from datetime import datetime
from typing import Callable, List, Literal, Optional from typing import Callable, List, Literal, Optional
@@ -18,6 +19,7 @@ from khoj.database.models import Agent, ChatModel, KhojUser
from khoj.processor.conversation.utils import commit_conversation_trace from khoj.processor.conversation.utils import commit_conversation_trace
from khoj.routers.helpers import ChatEvent from khoj.routers.helpers import ChatEvent
from khoj.utils.helpers import ( from khoj.utils.helpers import (
convert_image_to_webp,
get_anthropic_async_client, get_anthropic_async_client,
get_chat_usage_metrics, get_chat_usage_metrics,
get_openai_async_client, get_openai_async_client,
@@ -477,7 +479,8 @@ async def browser_use_anthropic(
compiled_operator_messages.append(ChatMessage(role="assistant", content=compiled_response)) compiled_operator_messages.append(ChatMessage(role="assistant", content=compiled_response))
logger.debug(f"Claude response: {response.model_dump_json()}") logger.debug(f"Claude response: {response.model_dump_json()}")
if send_status_func: if send_status_func:
async for event in send_status_func(f"**Operating Browser**:\n{compiled_response}"): rendered_response = await render_claude_response(response_content, page)
async for event in send_status_func(f"**Operating Browser**:\n{rendered_response}"):
yield {ChatEvent.STATUS: event} yield {ChatEvent.STATUS: event}
# Check if Claude used any tools # Check if Claude used any tools
@@ -798,7 +801,10 @@ async def handle_browser_action_anthropic(page: Page, action_input: dict):
return {"error": "Missing action type in input"} return {"error": "Missing action type in input"}
try: try:
logger.debug(f"Anthropic Action: {action_type} with input: {action_input}") render_action_input = action_input.copy()
if render_action_input.get("image"):
render_action_input["image"] = "[placeholder for screenshot data]"
logger.debug(f"Anthropic Action: {action_type} with input: {render_action_input}")
match action_type: match action_type:
case "mouse_move": case "mouse_move":
@@ -1027,6 +1033,32 @@ def compile_claude_response(response_content: list[BetaContentBlock]) -> str:
return "\n- ".join(compiled_response) return "\n- ".join(compiled_response)
async def render_claude_response(response_content: list[BetaContentBlock], page: Page) -> str:
    """
    Format an Anthropic model response as a string for the client to render.

    Text blocks are included as-is, thinking blocks are prefixed with
    "**Thought**: " and tool use blocks are serialized to JSON and prefixed
    with "**Action**: ". The "goto" and "back" tools are rendered as an
    action dict built from the tool name (plus the "url" input for "goto").
    When the action is "screenshot", a screenshot of `page` is attached to
    the action under the "image" key as a base64 webp data URI.

    Returns the rendered parts joined by a newline followed by "- ",
    starting from an empty first entry.
    """
    rendered_parts = [""]
    # Iterate over a deep copy so attaching the screenshot below cannot alter the caller's blocks.
    for block in deepcopy(response_content):
        block_type = block.type
        if block_type == "text":
            rendered_parts.append(block.text)
            continue
        if block_type == "thinking":
            rendered_parts.append(f"**Thought**: {block.thinking}")
            continue
        if block_type != "tool_use":
            # Any other block type is not rendered.
            continue

        tool_name = getattr(block, "name", None)
        if tool_name == "goto":
            action = {"action": tool_name, "url": block.input.get("url")}
        elif tool_name == "back":
            action = {"action": tool_name}
        else:
            action = block.input
        # Only the pass-through input can carry a "screenshot" action; the dicts built above never do.
        if action.get("action") == "screenshot":
            image_base64 = await get_screenshot(page)
            action["image"] = f"data:image/webp;base64,{image_base64}"
        rendered_parts.append(f"**Action**: {json.dumps(action)}")
    return "\n- ".join(rendered_parts)
async def get_screenshot(page: Page): async def get_screenshot(page: Page):
""" """
Take a viewport screenshot using Playwright and return as base64 encoded webp image. Take a viewport screenshot using Playwright and return as base64 encoded webp image.