mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Allow using Firecrawl to extract web page content
Set the FIRECRAWL_TO_EXTRACT environment variable to true to have Firecrawl scrape and extract content from webpage using their LLM This could be faster, not sure about quality as LLM used is obfuscated
This commit is contained in:
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
|
||||
from markdownify import markdownify
|
||||
|
||||
from khoj.database.models import Agent, KhojUser
|
||||
from khoj.processor.conversation import prompts
|
||||
from khoj.routers.helpers import (
|
||||
ChatEvent,
|
||||
extract_relevant_info,
|
||||
@@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
|
||||
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
||||
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
|
||||
|
||||
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
|
||||
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
|
||||
@@ -172,21 +174,26 @@ async def read_webpages(
|
||||
async def read_webpage_and_extract_content(
|
||||
subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
|
||||
) -> Tuple[str, Union[None, str], str]:
|
||||
extracted_info = None
|
||||
try:
|
||||
if is_none_or_empty(content):
|
||||
with timer(f"Reading web page at '{url}' took", logger):
|
||||
if FIRECRAWL_API_KEY:
|
||||
content = await read_webpage_with_firecrawl(url)
|
||||
if FIRECRAWL_TO_EXTRACT:
|
||||
extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent)
|
||||
else:
|
||||
content = await read_webpage_with_firecrawl(url)
|
||||
elif OLOSTEP_API_KEY:
|
||||
content = await read_webpage_with_olostep(url)
|
||||
else:
|
||||
content = await read_webpage_with_jina(url)
|
||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
|
||||
return subquery, extracted_info, url
|
||||
if is_none_or_empty(extracted_info):
|
||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to read web page at '{url}' with {e}")
|
||||
return subquery, None, url
|
||||
|
||||
return subquery, extracted_info, url
|
||||
|
||||
|
||||
async def read_webpage_at_url(web_url: str) -> str:
|
||||
@@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
|
||||
return response_json["data"]["markdown"]
|
||||
|
||||
|
||||
async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str:
|
||||
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
|
||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"relevant_extract": {"type": "string"},
|
||||
},
|
||||
"required": [
|
||||
"relevant_extract",
|
||||
],
|
||||
}
|
||||
|
||||
personality_context = (
|
||||
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
|
||||
)
|
||||
system_prompt = f"""
|
||||
{prompts.system_prompt_extract_relevant_information}
|
||||
|
||||
{personality_context}
|
||||
User Query: {query}
|
||||
|
||||
Collate only relevant information from the website to answer the target query and in the provided JSON schema.
|
||||
""".strip()
|
||||
|
||||
params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
response_json = await response.json()
|
||||
return response_json["data"]["extract"]["relevant_extract"]
|
||||
|
||||
|
||||
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
||||
encoded_query = urllib.parse.quote(query)
|
||||
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
|
||||
|
||||
Reference in New Issue
Block a user