From 98f99fa6f86c613f2ec6974e5684b1f8f5f3ddc1 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 15 Oct 2024 12:54:18 -0700
Subject: [PATCH] Allow using Firecrawl to extract web page content

Set the FIRECRAWL_TO_EXTRACT environment variable to true to have
Firecrawl both scrape the webpage and extract the relevant content
from it using their LLM.

This could be faster, but the extraction quality is uncertain because
the LLM Firecrawl uses is not disclosed.
---
 src/khoj/processor/tools/online_search.py | 51 ++++++++++++++++++++---
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index a9dd2476..2b4cac65 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
 from markdownify import markdownify
 
 from khoj.database.models import Agent, KhojUser
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
@@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
 
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
 FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
 
 OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
 OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
@@ -172,21 +174,26 @@ async def read_webpages(
 async def read_webpage_and_extract_content(
     subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[str, Union[None, str], str]:
+    extracted_info = None
     try:
         if is_none_or_empty(content):
             with timer(f"Reading web page at '{url}' took", logger):
                 if FIRECRAWL_API_KEY:
-                    content = await read_webpage_with_firecrawl(url)
+                    if FIRECRAWL_TO_EXTRACT:
+                        extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent)
+                    else:
+                        content = await read_webpage_with_firecrawl(url)
                 elif OLOSTEP_API_KEY:
                     content = await read_webpage_with_olostep(url)
                 else:
                     content = await read_webpage_with_jina(url)
-        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-            extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
-        return subquery, extracted_info, url
+        if is_none_or_empty(extracted_info):
+            with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
     except Exception as e:
         logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None, url
+
+    return subquery, extracted_info, url
 
 
 async def read_webpage_at_url(web_url: str) -> str:
@@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
             return response_json["data"]["markdown"]
 
 
+async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str:
+    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {query}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"