From 5b5efe463d73de9cef5c5832152b86c65a358013 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 2 May 2025 14:06:24 -0600 Subject: [PATCH] Remove inline base64 images from webpages read with Firecrawl --- src/khoj/processor/tools/online_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 0564b65c..a99ac811 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -499,7 +499,7 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: firecrawl_api_url = f"{api_url}/v1/scrape" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} - params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]} + params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"], "removeBase64Images": True} async with aiohttp.ClientSession() as session: async with session.post(firecrawl_api_url, json=params, headers=headers) as response: