From 98f99fa6f86c613f2ec6974e5684b1f8f5f3ddc1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 12:54:18 -0700 Subject: [PATCH] Allow using Firecrawl to extract web page content Set the FIRECRAWL_TO_EXTRACT environment variable to true to have Firecrawl scrape and extract content from webpage using their LLM This could be faster, not sure about quality as LLM used is obfuscated --- src/khoj/processor/tools/online_search.py | 51 ++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index a9dd2476..2b4cac65 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup from markdownify import markdownify from khoj.database.models import Agent, KhojUser +from khoj.processor.conversation import prompts from khoj.routers.helpers import ( ChatEvent, extract_relevant_info, @@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") +FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true" OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY") OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI" @@ -172,21 +174,26 @@ async def read_webpages( async def read_webpage_and_extract_content( subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None ) -> Tuple[str, Union[None, str], str]: + extracted_info = None try: if is_none_or_empty(content): with timer(f"Reading web page at '{url}' took", logger): if FIRECRAWL_API_KEY: - content = await read_webpage_with_firecrawl(url) + if FIRECRAWL_TO_EXTRACT: + extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent) + else: + content = await read_webpage_with_firecrawl(url) elif OLOSTEP_API_KEY: content = await read_webpage_with_olostep(url) else: content = await read_webpage_with_jina(url) - with timer(f"Extracting relevant information from web page at '{url}' took", logger): - extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) - return subquery, extracted_info, url + if is_none_or_empty(extracted_info): + with timer(f"Extracting relevant information from web page at '{url}' took", logger): + extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) except Exception as e: logger.error(f"Failed to read web page at '{url}' with {e}") - return subquery, None, url + + return subquery, extracted_info, url async def read_webpage_at_url(web_url: str) -> str: @@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str: return response_json["data"]["markdown"] +async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str: + firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} + schema = { + "type": "object", + "properties": { + "relevant_extract": {"type": "string"}, + }, + "required": [ + "relevant_extract", + ], + } + + personality_context = ( + prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else "" + ) + system_prompt = f""" +{prompts.system_prompt_extract_relevant_information} + +{personality_context} +User Query: {query} + +Collate only relevant information from the website to answer the target query and in the provided JSON schema. +""".strip() + + params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}} + + async with aiohttp.ClientSession() as session: + async with session.post(firecrawl_api_url, json=params, headers=headers) as response: + response.raise_for_status() + response_json = await response.json() + return response_json["data"]["extract"]["relevant_extract"] + + async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: encoded_query = urllib.parse.quote(query) jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"