mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Add the current cursor position to browser screenshots for both AI and human viewing
This commit is contained in:
@@ -1,10 +1,11 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Optional, Set
|
from typing import Optional, Set
|
||||||
|
|
||||||
from khoj.processor.operator.operator_actions import OperatorAction
|
from khoj.processor.operator.operator_actions import OperatorAction, Point
|
||||||
from khoj.processor.operator.operator_environment_base import (
|
from khoj.processor.operator.operator_environment_base import (
|
||||||
Environment,
|
Environment,
|
||||||
EnvState,
|
EnvState,
|
||||||
@@ -33,6 +34,7 @@ class BrowserEnvironment(Environment):
|
|||||||
self.visited_urls: Set[str] = set()
|
self.visited_urls: Set[str] = set()
|
||||||
self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
|
self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
|
||||||
self.navigation_history: list[str] = []
|
self.navigation_history: list[str] = []
|
||||||
|
self.mouse_pos = Point(x=self.width / 2, y=self.height / 2)
|
||||||
|
|
||||||
async def start(self, width: int = 1024, height: int = 768) -> None:
|
async def start(self, width: int = 1024, height: int = 768) -> None:
|
||||||
self.width = width
|
self.width = width
|
||||||
@@ -93,12 +95,33 @@ class BrowserEnvironment(Environment):
|
|||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
screenshot_bytes = await self.page.screenshot(caret="initial", full_page=False, type="png")
|
screenshot_bytes = await self.page.screenshot(caret="initial", full_page=False, type="png")
|
||||||
|
# Draw mouse position on the screenshot image
|
||||||
|
if self.mouse_pos:
|
||||||
|
screenshot_bytes = await self._draw_mouse_position(screenshot_bytes, self.mouse_pos)
|
||||||
screenshot_webp_bytes = convert_image_to_webp(screenshot_bytes)
|
screenshot_webp_bytes = convert_image_to_webp(screenshot_bytes)
|
||||||
return base64.b64encode(screenshot_webp_bytes).decode("utf-8")
|
return base64.b64encode(screenshot_webp_bytes).decode("utf-8")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to get screenshot: {e}")
|
logger.error(f"Failed to get screenshot: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
    """Overlay a marker for the mouse position onto a screenshot.

    Args:
        screenshot_bytes: Encoded screenshot image to annotate.
        mouse_pos: Cursor location; its ``x`` and ``y`` are used directly as
            pixel coordinates of the marker's centre.

    Returns:
        PNG-encoded bytes of the screenshot with a filled red circle of
        radius 5 centred on ``mouse_pos``.

    Raises:
        Whatever Pillow raises when ``screenshot_bytes`` cannot be decoded
        (e.g. ``PIL.UnidentifiedImageError``); the error propagates unchanged.
    """
    from PIL import Image, ImageDraw

    radius = 5

    def _render() -> bytes:
        # The context manager closes the image once the annotated copy has been
        # serialised, instead of leaving cleanup to garbage collection.
        with Image.open(io.BytesIO(screenshot_bytes)) as image:
            # Draw a red circle at the mouse position
            draw = ImageDraw.Draw(image)
            draw.ellipse(
                (mouse_pos.x - radius, mouse_pos.y - radius, mouse_pos.x + radius, mouse_pos.y + radius),
                fill="red",
            )

            # Save the modified image to a bytes buffer
            output_buffer = io.BytesIO()
            image.save(output_buffer, format="PNG")
            return output_buffer.getvalue()

    # Decoding and re-encoding a PNG is blocking, CPU-bound work. Running it in
    # a worker thread keeps the event loop free for other coroutines.
    return await asyncio.to_thread(_render)
|
||||||
|
|
||||||
async def get_state(self) -> EnvState:
|
async def get_state(self) -> EnvState:
|
||||||
if not self.page or self.page.is_closed():
|
if not self.page or self.page.is_closed():
|
||||||
return "about:blank", None
|
return "about:blank", None
|
||||||
@@ -127,17 +150,20 @@ class BrowserEnvironment(Environment):
|
|||||||
for modifier in reversed(modifiers):
|
for modifier in reversed(modifiers):
|
||||||
await self.page.keyboard.up(modifier)
|
await self.page.keyboard.up(modifier)
|
||||||
output = f"{button.capitalize()} clicked at ({x}, {y})"
|
output = f"{button.capitalize()} clicked at ({x}, {y})"
|
||||||
|
self.mouse_pos = Point(x=x, y=y)
|
||||||
logger.debug(f"Action: {action.type} {button} at ({x},{y})")
|
logger.debug(f"Action: {action.type} {button} at ({x},{y})")
|
||||||
|
|
||||||
case "double_click":
|
case "double_click":
|
||||||
x, y = action.x, action.y
|
x, y = action.x, action.y
|
||||||
await self.page.mouse.dblclick(x, y)
|
await self.page.mouse.dblclick(x, y)
|
||||||
|
self.mouse_pos = Point(x=x, y=y)
|
||||||
output = f"Double clicked at ({x}, {y})"
|
output = f"Double clicked at ({x}, {y})"
|
||||||
logger.debug(f"Action: {action.type} at ({x},{y})")
|
logger.debug(f"Action: {action.type} at ({x},{y})")
|
||||||
|
|
||||||
case "triple_click":
|
case "triple_click":
|
||||||
x, y = action.x, action.y
|
x, y = action.x, action.y
|
||||||
await self.page.mouse.click(x, y, click_count=3)
|
await self.page.mouse.click(x, y, click_count=3)
|
||||||
|
self.mouse_pos = Point(x=x, y=y)
|
||||||
output = f"Triple clicked at ({x}, {y})"
|
output = f"Triple clicked at ({x}, {y})"
|
||||||
logger.debug(f"Action: {action.type} at ({x},{y})")
|
logger.debug(f"Action: {action.type} at ({x},{y})")
|
||||||
|
|
||||||
@@ -148,6 +174,7 @@ class BrowserEnvironment(Environment):
|
|||||||
scroll_y = action.scroll_y or 0
|
scroll_y = action.scroll_y or 0
|
||||||
if action.x is not None and action.y is not None:
|
if action.x is not None and action.y is not None:
|
||||||
await self.page.mouse.move(action.x, action.y)
|
await self.page.mouse.move(action.x, action.y)
|
||||||
|
self.mouse_pos = Point(x=action.x, y=action.y)
|
||||||
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
|
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
|
||||||
output = f"Scrolled by ({scroll_x}, {scroll_y})"
|
output = f"Scrolled by ({scroll_x}, {scroll_y})"
|
||||||
logger.debug(f"Action: {action.type} by ({scroll_x},{scroll_y}) at ({action.x},{action.y})")
|
logger.debug(f"Action: {action.type} by ({scroll_x},{scroll_y}) at ({action.x},{action.y})")
|
||||||
@@ -166,6 +193,7 @@ class BrowserEnvironment(Environment):
|
|||||||
|
|
||||||
if action.x is not None and action.y is not None:
|
if action.x is not None and action.y is not None:
|
||||||
await self.page.mouse.move(action.x, action.y)
|
await self.page.mouse.move(action.x, action.y)
|
||||||
|
self.mouse_pos = Point(x=action.x, y=action.y)
|
||||||
await self.page.mouse.wheel(dx, dy)
|
await self.page.mouse.wheel(dx, dy)
|
||||||
output = f"Scrolled {action.scroll_direction} by {amount}"
|
output = f"Scrolled {action.scroll_direction} by {amount}"
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@@ -210,6 +238,7 @@ class BrowserEnvironment(Environment):
|
|||||||
case "move":
|
case "move":
|
||||||
x, y = action.x, action.y
|
x, y = action.x, action.y
|
||||||
await self.page.mouse.move(x, y)
|
await self.page.mouse.move(x, y)
|
||||||
|
self.mouse_pos = Point(x=x, y=y)
|
||||||
output = f"Moved mouse to ({x}, {y})"
|
output = f"Moved mouse to ({x}, {y})"
|
||||||
logger.debug(f"Action: {action.type} to ({x},{y})")
|
logger.debug(f"Action: {action.type} to ({x},{y})")
|
||||||
|
|
||||||
@@ -223,6 +252,7 @@ class BrowserEnvironment(Environment):
|
|||||||
for point in path[1:]:
|
for point in path[1:]:
|
||||||
await self.page.mouse.move(point.x, point.y)
|
await self.page.mouse.move(point.x, point.y)
|
||||||
await self.page.mouse.up()
|
await self.page.mouse.up()
|
||||||
|
self.mouse_pos = Point(x=path[-1].x, y=path[-1].y)
|
||||||
output = f"Drag along path starting at ({path[0].x},{path[0].y})"
|
output = f"Drag along path starting at ({path[0].x},{path[0].y})"
|
||||||
logger.debug(f"Action: {action.type} with {len(path)} points")
|
logger.debug(f"Action: {action.type} with {len(path)} points")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user