Add a timeout to web search and webpage read requests sent to providers

2026-04-19 17:14:35 +00:00 · 2025-07-09 15:51:05 -07:00
parent 1988a8d023
commit b763dbfb2b
1 changed files with 25 additions and 10 deletions
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -47,6 +47,9 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
 FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")

+# Timeout for web search and webpage read HTTP requests
+WEBPAGE_REQUEST_TIMEOUT = 60  # seconds
+
 OLOSTEP_QUERY_PARAMS = {
    "timeout": 35,  # seconds
    "waitBeforeScraping": 0,  # seconds
@@ -215,7 +218,9 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str

    async with aiohttp.ClientSession() as session:
        try:
-            async with session.post(firecrawl_api_url, headers=headers, json=payload) as response:
+            async with session.post(
+                firecrawl_api_url, headers=headers, json=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
+            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"Firecrawl search failed: {error_text}")
@@ -257,7 +262,7 @@ async def search_with_searxng(query: str, location: LocationData) -> Tuple[str,

    async with aiohttp.ClientSession() as session:
        try:
-            async with session.get(search_url, params=params) as response:
+            async with session.get(search_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
                if response.status != 200:
                    logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}")
                    return query, {}
@@ -299,7 +304,7 @@ async def search_with_google(query: str, location: LocationData) -> Tuple[str, D
    }

    async with aiohttp.ClientSession() as session:
-        async with session.get(base_url, params=params) as response:
+        async with session.get(base_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
            if response.status != 200:
                logger.error(await response.text())
                return query, {}
@@ -350,7 +355,9 @@ async def search_with_serper(query: str, location: LocationData) -> Tuple[str, D
    payload = json.dumps({"q": query, "gl": country_code})

    async with aiohttp.ClientSession() as session:
-        async with session.post(SERPER_DEV_URL, headers=headers, data=payload) as response:
+        async with session.post(
+            SERPER_DEV_URL, headers=headers, data=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
+        ) as response:
            if response.status != 200:
                logger.error(await response.text())
                return query, {}
@@ -489,7 +496,7 @@ async def read_webpage_at_url(web_url: str) -> str:
    }

    async with aiohttp.ClientSession() as session:
-        async with session.get(web_url, headers=headers, timeout=30) as response:
+        async with session.get(web_url, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            html = await response.text()
            parsed_html = BeautifulSoup(html, "html.parser")
@@ -503,7 +510,9 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
    web_scraping_params["url"] = web_url

    async with aiohttp.ClientSession() as session:
-        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
+        async with session.get(
+            api_url, params=web_scraping_params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
+        ) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["markdown_content"]
@@ -516,7 +525,7 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st
        headers["Authorization"] = f"Bearer {api_key}"

    async with aiohttp.ClientSession() as session:
-        async with session.post(api_url, json=data, headers=headers) as response:
+        async with session.post(api_url, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            content = await response.text()
            return content
@@ -535,7 +544,9 @@ async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str)
    }

    async with aiohttp.ClientSession() as session:
-        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+        async with session.post(
+            firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
+        ) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["data"]["markdown"]
@@ -571,7 +582,9 @@ Collate only relevant information from the website to answer the target query an
    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}

    async with aiohttp.ClientSession() as session:
-        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+        async with session.post(
+            firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
+        ) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["data"]["extract"]["relevant_extract"]
@@ -600,7 +613,9 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
        headers["Authorization"] = f"Bearer {api_key}"

    async with aiohttp.ClientSession() as session:
-        async with session.post(JINA_SEARCH_API_URL, json=data, headers=headers) as response:
+        async with session.post(
+            JINA_SEARCH_API_URL, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
+        ) as response:
            if response.status != 200:
                error_text = await response.text()
                logger.error(f"Jina search failed: {error_text}")