Allow using Firecrawl to extract web page content

Set the FIRECRAWL_TO_EXTRACT environment variable to true to have Firecrawl both scrape the webpage and extract the relevant content from it using their LLM. This could be faster, but the quality is uncertain, as the LLM Firecrawl uses is not disclosed.
2026-04-28 00:19:25 +00:00 · 2024-10-15 12:54:18 -07:00
parent 993fd7cd2b
commit 98f99fa6f8
1 changed file with 46 additions and 5 deletions
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
 from markdownify import markdownify

 from khoj.database.models import Agent, KhojUser
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
    ChatEvent,
    extract_relevant_info,
@@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")

 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
 FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"

 OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
 OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
@@ -172,21 +174,26 @@ async def read_webpages(
 async def read_webpage_and_extract_content(
    subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[str, Union[None, str], str]:
+    extracted_info = None
    try:
        if is_none_or_empty(content):
            with timer(f"Reading web page at '{url}' took", logger):
                if FIRECRAWL_API_KEY:
-                    content = await read_webpage_with_firecrawl(url)
+                    if FIRECRAWL_TO_EXTRACT:
+                        extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent)
+                    else:
+                        content = await read_webpage_with_firecrawl(url)
                elif OLOSTEP_API_KEY:
                    content = await read_webpage_with_olostep(url)
                else:
                    content = await read_webpage_with_jina(url)
-        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-            extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
-        return subquery, extracted_info, url
+        if is_none_or_empty(extracted_info):
+            with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
    except Exception as e:
        logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None, url
+
+    return subquery, extracted_info, url


 async def read_webpage_at_url(web_url: str) -> str:
@@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
            return response_json["data"]["markdown"]


+async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str:
+    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {query}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
    encoded_query = urllib.parse.quote(query)
    jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"