mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-10 05:39:11 +00:00
Upgrade Firecrawl web provider to use their v2 API
This commit is contained in:
@@ -267,7 +267,8 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str
|
|||||||
Tuple containing the original query and a dictionary of search results
|
Tuple containing the original query and a dictionary of search results
|
||||||
"""
|
"""
|
||||||
# Set up API endpoint and headers
|
# Set up API endpoint and headers
|
||||||
firecrawl_api_url = "https://api.firecrawl.dev/v1/search"
|
firecrawl_api_base = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||||
|
firecrawl_api_url = f"{firecrawl_api_base}/v2/search"
|
||||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
|
||||||
|
|
||||||
# Prepare request payload
|
# Prepare request payload
|
||||||
@@ -303,7 +304,7 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str
|
|||||||
|
|
||||||
# Transform Firecrawl response to match the expected format
|
# Transform Firecrawl response to match the expected format
|
||||||
organic_results = []
|
organic_results = []
|
||||||
for item in response_json.get("data", []):
|
for item in response_json.get("data", {}).get("web", []):
|
||||||
organic_results.append(
|
organic_results.append(
|
||||||
{
|
{
|
||||||
"title": item["title"],
|
"title": item["title"],
|
||||||
@@ -604,15 +605,14 @@ async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str
|
|||||||
|
|
||||||
|
|
||||||
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
||||||
firecrawl_api_url = f"{api_url}/v1/scrape"
|
firecrawl_api_url = f"{api_url}/v2/scrape"
|
||||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||||
params = {
|
params = {
|
||||||
"url": web_url,
|
"url": web_url,
|
||||||
"formats": ["markdown"],
|
"formats": ["markdown"],
|
||||||
"excludeTags": ["script", ".ad"],
|
"excludeTags": ["script", ".ad"],
|
||||||
"removeBase64Images": True,
|
"removeBase64Images": True,
|
||||||
"proxy": "auto",
|
"maxAge": 86400000, # accept up to 1 day old cached content for speed
|
||||||
"maxAge": 3600000, # accept up to 1 hour old cached content for speed
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
|
|||||||
Reference in New Issue
Block a user