mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 21:29:08 +00:00
Support using Firecrawl to read webpages
Firecrawl is open-source and self-hostable, with a default hosted service provided, similar to Jina.ai. So it can be: 1. Self-hosted as part of a private Khoj cloud deployment. 2. Used directly by getting an API key from the Firecrawl.dev service. This serves as an alternative to Olostep and Jina.ai for reading webpages.
This commit is contained in:
@@ -29,6 +29,9 @@ JINA_READER_API_URL = "https://r.jina.ai/"
|
||||
JINA_SEARCH_API_URL = "https://s.jina.ai/"
|
||||
JINA_API_KEY = os.getenv("JINA_API_KEY")
|
||||
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
||||
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
|
||||
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
|
||||
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
|
||||
OLOSTEP_QUERY_PARAMS = {
|
||||
@@ -172,7 +175,12 @@ async def read_webpage_and_extract_content(
|
||||
try:
|
||||
if is_none_or_empty(content):
|
||||
with timer(f"Reading web page at '{url}' took", logger):
|
||||
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
|
||||
if FIRECRAWL_API_KEY:
|
||||
content = await read_webpage_with_firecrawl(url)
|
||||
elif OLOSTEP_API_KEY:
|
||||
content = await read_webpage_with_olostep(url)
|
||||
else:
|
||||
content = await read_webpage_with_jina(url)
|
||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
|
||||
return subquery, extracted_info, url
|
||||
@@ -220,6 +228,18 @@ async def read_webpage_with_jina(web_url: str) -> str:
|
||||
return response_json["data"]["content"]
|
||||
|
||||
|
||||
async def read_webpage_with_firecrawl(web_url: str) -> str:
    """Read a webpage as markdown using the Firecrawl scrape endpoint.

    Sends a POST request to ``{FIRECRAWL_API_URL}/v1/scrape`` authenticated
    with ``FIRECRAWL_API_KEY`` as a bearer token, asking for the page in the
    "markdown" format.

    Args:
        web_url: Address of the webpage to read.

    Returns:
        The value found at ``["data"]["markdown"]`` in the JSON response.

    Raises:
        aiohttp.ClientResponseError: If the service answers with an HTTP
            error status (via ``raise_for_status``).
        KeyError: If the JSON response lacks the ``data`` or ``markdown`` keys.
    """
    # FIRECRAWL_API_URL can be overridden through the environment (e.g. for a
    # self-hosted instance). Strip any trailing slash so the joined endpoint
    # does not end up with a double slash ("...//v1/scrape").
    firecrawl_api_url = f"{FIRECRAWL_API_URL.rstrip('/')}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
    # NOTE(review): "excludeTags" presumably tells Firecrawl to drop <script>
    # elements and elements matching the ".ad" selector - confirm against the
    # Firecrawl API documentation.
    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}

    async with aiohttp.ClientSession() as session:
        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["data"]["markdown"]
|
||||
|
||||
|
||||
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
||||
encoded_query = urllib.parse.quote(query)
|
||||
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
|
||||
|
||||
Reference in New Issue
Block a user