Enable cache and proxy to improve Firecrawl webpage scrape speed and success rate

2026-03-02 13:18:18 +00:00 · 2025-06-27 16:35:25 -07:00
parent 1566e3c74d
commit a33580d560
1 changed files with 8 additions and 1 deletions
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -507,7 +507,14 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st
 async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
    firecrawl_api_url = f"{api_url}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
-    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"], "removeBase64Images": True}
+    params = {
+        "url": web_url,
+        "formats": ["markdown"],
+        "excludeTags": ["script", ".ad"],
+        "removeBase64Images": True,
+        "proxy": "auto",
+        "maxAge": 3600000,  # accept up to 1 hour old cached content for speed
+    }

    async with aiohttp.ClientSession() as session:
        async with session.post(firecrawl_api_url, json=params, headers=headers) as response: