Allow using Firecrawl to extract web page content

Set the FIRECRAWL_TO_EXTRACT environment variable to true to have
Firecrawl scrape and extract content from webpage using their LLM

This could be faster, not sure about quality as LLM used is obfuscated
This commit is contained in:
Debanjum Singh Solanky
2024-10-15 12:54:18 -07:00
parent 993fd7cd2b
commit 98f99fa6f8

View File

@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
from markdownify import markdownify
from khoj.database.models import Agent, KhojUser
from khoj.processor.conversation import prompts
from khoj.routers.helpers import (
ChatEvent,
extract_relevant_info,
@@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
@@ -172,21 +174,26 @@ async def read_webpages(
async def read_webpage_and_extract_content(
subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
) -> Tuple[str, Union[None, str], str]:
extracted_info = None
try:
if is_none_or_empty(content):
with timer(f"Reading web page at '{url}' took", logger):
if FIRECRAWL_API_KEY:
content = await read_webpage_with_firecrawl(url)
if FIRECRAWL_TO_EXTRACT:
extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent)
else:
content = await read_webpage_with_firecrawl(url)
elif OLOSTEP_API_KEY:
content = await read_webpage_with_olostep(url)
else:
content = await read_webpage_with_jina(url)
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
return subquery, extracted_info, url
if is_none_or_empty(extracted_info):
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
except Exception as e:
logger.error(f"Failed to read web page at '{url}' with {e}")
return subquery, None, url
return subquery, extracted_info, url
async def read_webpage_at_url(web_url: str) -> str:
@@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
return response_json["data"]["markdown"]
async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str:
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
schema = {
"type": "object",
"properties": {
"relevant_extract": {"type": "string"},
},
"required": [
"relevant_extract",
],
}
personality_context = (
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
)
system_prompt = f"""
{prompts.system_prompt_extract_relevant_information}
{personality_context}
User Query: {query}
Collate only relevant information from the website to answer the target query and in the provided JSON schema.
""".strip()
params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
async with aiohttp.ClientSession() as session:
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["data"]["extract"]["relevant_extract"]
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
encoded_query = urllib.parse.quote(query)
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"