mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Render screenshot in train of thought on browser screenshot action
Update the web app to render the screenshot image when a screenshot action is taken by the browser operator
This commit is contained in:
@@ -31,6 +31,7 @@ import {
|
|||||||
Shapes,
|
Shapes,
|
||||||
Trash,
|
Trash,
|
||||||
Toolbox,
|
Toolbox,
|
||||||
|
Browser,
|
||||||
} from "@phosphor-icons/react";
|
} from "@phosphor-icons/react";
|
||||||
|
|
||||||
import DOMPurify from "dompurify";
|
import DOMPurify from "dompurify";
|
||||||
@@ -333,6 +334,10 @@ function chooseIconFromHeader(header: string, iconColor: string) {
|
|||||||
return <Code className={`${classNames}`} />;
|
return <Code className={`${classNames}`} />;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (compareHeader.includes("operating")) {
|
||||||
|
return <Browser className={`${classNames}`} />;
|
||||||
|
}
|
||||||
|
|
||||||
return <Brain className={`${classNames}`} />;
|
return <Brain className={`${classNames}`} />;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -342,10 +347,27 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
|
|||||||
let header = extractedHeader ? extractedHeader[1] : "";
|
let header = extractedHeader ? extractedHeader[1] : "";
|
||||||
const iconColor = props.primary ? convertColorToTextClass(props.agentColor) : "text-gray-500";
|
const iconColor = props.primary ? convertColorToTextClass(props.agentColor) : "text-gray-500";
|
||||||
const icon = chooseIconFromHeader(header, iconColor);
|
const icon = chooseIconFromHeader(header, iconColor);
|
||||||
let markdownRendered = DOMPurify.sanitize(md.render(props.message));
|
let message = props.message;
|
||||||
|
|
||||||
// Remove any header tags from markdownRendered
|
// Render screenshot image in screenshot action message
|
||||||
|
let screenshotData = null;
|
||||||
|
try {
|
||||||
|
const jsonMatch = message.match(/\{"action": "screenshot".*\}/);
|
||||||
|
if (jsonMatch) {
|
||||||
|
screenshotData = JSON.parse(jsonMatch[0]);
|
||||||
|
const screenshotHtmlString = `<img src="${screenshotData.image}" alt="State of browser" class="max-w-full" />`;
|
||||||
|
message = message.replace(jsonMatch[0], `Screenshot\n\n${screenshotHtmlString}`);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error("Failed to parse screenshot data", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render the sanitized train of thought as markdown
|
||||||
|
let markdownRendered = DOMPurify.sanitize(md.render(message));
|
||||||
|
|
||||||
|
// Remove any header tags from the rendered markdown
|
||||||
markdownRendered = markdownRendered.replace(/<h[1-6].*?<\/h[1-6]>/g, "");
|
markdownRendered = markdownRendered.replace(/<h[1-6].*?<\/h[1-6]>/g, "");
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div
|
<div
|
||||||
className={`${styles.trainOfThoughtElement} break-words items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
|
className={`${styles.trainOfThoughtElement} break-words items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import base64
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from copy import deepcopy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Callable, List, Literal, Optional
|
from typing import Callable, List, Literal, Optional
|
||||||
|
|
||||||
@@ -18,6 +19,7 @@ from khoj.database.models import Agent, ChatModel, KhojUser
|
|||||||
from khoj.processor.conversation.utils import commit_conversation_trace
|
from khoj.processor.conversation.utils import commit_conversation_trace
|
||||||
from khoj.routers.helpers import ChatEvent
|
from khoj.routers.helpers import ChatEvent
|
||||||
from khoj.utils.helpers import (
|
from khoj.utils.helpers import (
|
||||||
|
convert_image_to_webp,
|
||||||
get_anthropic_async_client,
|
get_anthropic_async_client,
|
||||||
get_chat_usage_metrics,
|
get_chat_usage_metrics,
|
||||||
get_openai_async_client,
|
get_openai_async_client,
|
||||||
@@ -477,7 +479,8 @@ async def browser_use_anthropic(
|
|||||||
compiled_operator_messages.append(ChatMessage(role="assistant", content=compiled_response))
|
compiled_operator_messages.append(ChatMessage(role="assistant", content=compiled_response))
|
||||||
logger.debug(f"Claude response: {response.model_dump_json()}")
|
logger.debug(f"Claude response: {response.model_dump_json()}")
|
||||||
if send_status_func:
|
if send_status_func:
|
||||||
async for event in send_status_func(f"**Operating Browser**:\n{compiled_response}"):
|
rendered_response = await render_claude_response(response_content, page)
|
||||||
|
async for event in send_status_func(f"**Operating Browser**:\n{rendered_response}"):
|
||||||
yield {ChatEvent.STATUS: event}
|
yield {ChatEvent.STATUS: event}
|
||||||
|
|
||||||
# Check if Claude used any tools
|
# Check if Claude used any tools
|
||||||
@@ -798,7 +801,10 @@ async def handle_browser_action_anthropic(page: Page, action_input: dict):
|
|||||||
return {"error": "Missing action type in input"}
|
return {"error": "Missing action type in input"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Anthropic Action: {action_type} with input: {action_input}")
|
render_action_input = action_input.copy()
|
||||||
|
if render_action_input.get("image"):
|
||||||
|
render_action_input["image"] = "[placeholder for screenshot data]"
|
||||||
|
logger.debug(f"Anthropic Action: {action_type} with input: {render_action_input}")
|
||||||
|
|
||||||
match action_type:
|
match action_type:
|
||||||
case "mouse_move":
|
case "mouse_move":
|
||||||
@@ -1027,6 +1033,32 @@ def compile_claude_response(response_content: list[BetaContentBlock]) -> str:
|
|||||||
return "\n- ".join(compiled_response)
|
return "\n- ".join(compiled_response)
|
||||||
|
|
||||||
|
|
||||||
|
async def render_claude_response(response_content: list[BetaContentBlock], page: Page) -> str:
    """
    Build the status text shared with the client for a response from the Anthropic AI model.

    Text blocks are passed through unchanged, thinking blocks are prefixed with
    "**Thought**: " and tool use blocks are serialized to JSON and prefixed with
    "**Action**: ". When the tool input's action is "screenshot", the value returned
    by get_screenshot(page) is attached under the "image" key as a webp data URI.

    The parts are joined with a newline followed by "- ", starting from an empty
    leading entry.
    """
    rendered_parts = [""]
    # Iterate over a deep copy so that adding the "image" key below does not
    # modify the blocks passed in by the caller.
    for item in deepcopy(response_content):
        kind = item.type
        if kind == "thinking":
            rendered_parts.append(f"**Thought**: {item.thinking}")
            continue
        if kind == "text":
            rendered_parts.append(item.text)
            continue
        if kind != "tool_use":
            # Any other block type contributes nothing to the rendered output
            continue

        # goto and back are rendered in the same {"action": ...} shape as the other tool inputs
        tool_name = getattr(item, "name", None)
        if tool_name == "goto":
            action = {"action": tool_name, "url": item.input.get("url")}
        elif tool_name == "back":
            action = {"action": tool_name}
        else:
            action = item.input

        # Attach the current screenshot so the client can show it inline
        if action.get("action") == "screenshot":
            encoded_screenshot = await get_screenshot(page)
            action["image"] = f"data:image/webp;base64,{encoded_screenshot}"

        rendered_parts.append(f"**Action**: {json.dumps(action)}")

    return "\n- ".join(rendered_parts)
|
||||||
|
|
||||||
|
|
||||||
async def get_screenshot(page: Page):
|
async def get_screenshot(page: Page):
|
||||||
"""
|
"""
|
||||||
Take a viewport screenshot using Playwright and return as base64 encoded webp image.
|
Take a viewport screenshot using Playwright and return as base64 encoded webp image.
|
||||||
|
|||||||
Reference in New Issue
Block a user