mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-10 05:39:11 +00:00
Enable cache and proxy to improve Firecrawl webpage scrape speed and success rate
This commit is contained in:
@@ -507,7 +507,14 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st
|
|||||||
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
||||||
firecrawl_api_url = f"{api_url}/v1/scrape"
|
firecrawl_api_url = f"{api_url}/v1/scrape"
|
||||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||||
params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"], "removeBase64Images": True}
|
params = {
|
||||||
|
"url": web_url,
|
||||||
|
"formats": ["markdown"],
|
||||||
|
"excludeTags": ["script", ".ad"],
|
||||||
|
"removeBase64Images": True,
|
||||||
|
"proxy": "auto",
|
||||||
|
"maxAge": 3600000, # accept up to 1 hour old cached content for speed
|
||||||
|
}
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
|
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
|
||||||
|
|||||||
Reference in New Issue
Block a user