From a33580d560d67b581548c598f626453cac5b5873 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 27 Jun 2025 16:35:25 -0700 Subject: [PATCH] Enable cache, proxy to improve firecrawl webpage scrape speed, success --- src/khoj/processor/tools/online_search.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 8951bc46..78a4117f 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -507,7 +507,14 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: firecrawl_api_url = f"{api_url}/v1/scrape" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} - params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"], "removeBase64Images": True} + params = { + "url": web_url, + "formats": ["markdown"], + "excludeTags": ["script", ".ad"], + "removeBase64Images": True, + "proxy": "auto", + "maxAge": 3600000, # accept up to 1 hour old cached content for speed + } async with aiohttp.ClientSession() as session: async with session.post(firecrawl_api_url, json=params, headers=headers) as response: