mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Add pages visited via browser operator to references returned to clients
This commit is contained in:
@@ -6,7 +6,7 @@ import os
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Callable, List, Literal, Optional, Union
|
from typing import Callable, List, Literal, Optional, Set, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from anthropic.types.beta import BetaContentBlock, BetaMessage
|
from anthropic.types.beta import BetaContentBlock, BetaMessage
|
||||||
@@ -224,6 +224,8 @@ class BrowserEnvironment(Environment):
|
|||||||
self.page: Optional[Page] = None
|
self.page: Optional[Page] = None
|
||||||
self.width: int = 1024
|
self.width: int = 1024
|
||||||
self.height: int = 768
|
self.height: int = 768
|
||||||
|
self.visited_urls: Set[str] = set()
|
||||||
|
self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
|
||||||
|
|
||||||
async def start(self, width: int = 1024, height: int = 768) -> None:
|
async def start(self, width: int = 1024, height: int = 768) -> None:
|
||||||
self.width = width
|
self.width = width
|
||||||
@@ -242,6 +244,16 @@ class BrowserEnvironment(Environment):
|
|||||||
default_context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context()
|
default_context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context()
|
||||||
self.page = default_context.pages[0] if default_context.pages else await default_context.new_page()
|
self.page = default_context.pages[0] if default_context.pages else await default_context.new_page()
|
||||||
|
|
||||||
|
# Define a handler for page load events to capture URLs
|
||||||
|
async def handle_load(loaded_page: Page):
    """Record the URL of a page once it has finished loading.

    Registered as a "load" event handler. Adds the loaded page's URL to
    ``self.visited_urls`` unless it is empty, excluded, or already recorded.

    Args:
        loaded_page: The page whose load event fired.
    """
    url = loaded_page.url
    if not url:
        return

    # The entries in self.excluded_urls carry no trailing slash, while a
    # browser serializes a bare origin with one ("https://www.google.com/").
    # Compare both forms so excluded home pages are actually skipped.
    if url in self.excluded_urls or url.rstrip("/") in self.excluded_urls:
        return

    if url not in self.visited_urls:
        logger.debug(f"Page loaded: {url}")
        self.visited_urls.add(url)
|
||||||
|
|
||||||
|
# Listen for load events on the main page
|
||||||
|
self.page.on("load", handle_load)
|
||||||
|
|
||||||
# Define a handler for new pages
|
# Define a handler for new pages
|
||||||
async def handle_new_page(new_page: Page):
|
async def handle_new_page(new_page: Page):
|
||||||
# Get the target URL of the new page
|
# Get the target URL of the new page
|
||||||
@@ -1211,4 +1223,7 @@ async def operate_browser(
|
|||||||
if environment and not safety_check_message: # Don't close browser if safety check pending
|
if environment and not safety_check_message: # Don't close browser if safety check pending
|
||||||
await environment.close()
|
await environment.close()
|
||||||
|
|
||||||
yield safety_check_message or response
|
yield {
|
||||||
|
"text": safety_check_message or response,
|
||||||
|
"webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
|
||||||
|
}
|
||||||
|
|||||||
@@ -1233,8 +1233,16 @@ async def chat(
|
|||||||
):
|
):
|
||||||
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
||||||
yield result[ChatEvent.STATUS]
|
yield result[ChatEvent.STATUS]
|
||||||
elif isinstance(result, str):
|
else:
|
||||||
operator_results.append(result)
|
operator_results.append(result["text"])
|
||||||
|
# Add webpages visited while operating browser to references
|
||||||
|
if result.get("webpages"):
|
||||||
|
if not online_results.get(defiltered_query):
|
||||||
|
online_results[defiltered_query] = {"webpages": result["webpages"]}
|
||||||
|
elif not online_results[defiltered_query].get("webpages"):
|
||||||
|
online_results[defiltered_query]["webpages"] = result["webpages"]
|
||||||
|
else:
|
||||||
|
online_results[defiltered_query]["webpages"] += result["webpages"]
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
program_execution_context.append(f"Browser operation error: {e}")
|
program_execution_context.append(f"Browser operation error: {e}")
|
||||||
logger.warning(f"Failed to operate browser with {e}", exc_info=True)
|
logger.warning(f"Failed to operate browser with {e}", exc_info=True)
|
||||||
|
|||||||
@@ -416,9 +416,18 @@ async def execute_information_collection(
|
|||||||
):
|
):
|
||||||
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
||||||
yield result[ChatEvent.STATUS]
|
yield result[ChatEvent.STATUS]
|
||||||
elif isinstance(result, str):
|
else:
|
||||||
operator_results = result # type: ignore
|
operator_results = result["text"] # type: ignore
|
||||||
this_iteration.operatorContext = operator_results
|
this_iteration.operatorContext = operator_results
|
||||||
|
# Add webpages visited while operating browser to references
|
||||||
|
if result.get("webpages"):
|
||||||
|
if not online_results.get(this_iteration.query):
|
||||||
|
online_results[this_iteration.query] = {"webpages": result["webpages"]}
|
||||||
|
elif not online_results[this_iteration.query].get("webpages"):
|
||||||
|
online_results[this_iteration.query]["webpages"] = result["webpages"]
|
||||||
|
else:
|
||||||
|
online_results[this_iteration.query]["webpages"] += result["webpages"]
|
||||||
|
this_iteration.onlineContext = online_results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
this_iteration.warning = f"Error operating browser: {e}"
|
this_iteration.warning = f"Error operating browser: {e}"
|
||||||
logger.error(this_iteration.warning, exc_info=True)
|
logger.error(this_iteration.warning, exc_info=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user