diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 0c89c5ce..ef66d489 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -47,6 +47,9 @@ JINA_API_KEY = os.getenv("JINA_API_KEY") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT") +# Timeout for web search and webpage read HTTP requests +WEBPAGE_REQUEST_TIMEOUT = 60 # seconds + OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds "waitBeforeScraping": 0, # seconds @@ -215,7 +218,9 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str async with aiohttp.ClientSession() as session: try: - async with session.post(firecrawl_api_url, headers=headers, json=payload) as response: + async with session.post( + firecrawl_api_url, headers=headers, json=payload, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: if response.status != 200: error_text = await response.text() logger.error(f"Firecrawl search failed: {error_text}") @@ -257,7 +262,7 @@ async def search_with_searxng(query: str, location: LocationData) -> Tuple[str, async with aiohttp.ClientSession() as session: try: - async with session.get(search_url, params=params) as response: + async with session.get(search_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: if response.status != 200: logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}") return query, {} @@ -299,7 +304,7 @@ async def search_with_google(query: str, location: LocationData) -> Tuple[str, D } async with aiohttp.ClientSession() as session: - async with session.get(base_url, params=params) as response: + async with session.get(base_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: if response.status != 200: logger.error(await response.text()) return query, {} @@ -350,7 +355,9 @@ async def search_with_serper(query: str, location: LocationData) -> Tuple[str, D payload = json.dumps({"q": query, "gl": country_code}) async with aiohttp.ClientSession() as session: - async with session.post(SERPER_DEV_URL, headers=headers, data=payload) as response: + async with session.post( + SERPER_DEV_URL, headers=headers, data=payload, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: if response.status != 200: logger.error(await response.text()) return query, {} @@ -489,7 +496,7 @@ async def read_webpage_at_url(web_url: str) -> str: } async with aiohttp.ClientSession() as session: - async with session.get(web_url, headers=headers, timeout=30) as response: + async with session.get(web_url, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: response.raise_for_status() html = await response.text() parsed_html = BeautifulSoup(html, "html.parser") @@ -503,7 +510,9 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> web_scraping_params["url"] = web_url async with aiohttp.ClientSession() as session: - async with session.get(api_url, params=web_scraping_params, headers=headers) as response: + async with session.get( + api_url, params=web_scraping_params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: response.raise_for_status() response_json = await response.json() return response_json["markdown_content"] @@ -516,7 +525,7 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st headers["Authorization"] = f"Bearer {api_key}" async with aiohttp.ClientSession() as session: - async with session.post(api_url, json=data, headers=headers) as response: + async with session.post(api_url, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: response.raise_for_status() content = await response.text() return content @@ -535,7 +544,9 @@ async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) } async with aiohttp.ClientSession() as session: - async with session.post(firecrawl_api_url, json=params, headers=headers) as response: + async with session.post( + firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: response.raise_for_status() response_json = await response.json() return response_json["data"]["markdown"] @@ -571,7 +582,9 @@ Collate only relevant information from the website to answer the target query an params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}} async with aiohttp.ClientSession() as session: - async with session.post(firecrawl_api_url, json=params, headers=headers) as response: + async with session.post( + firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: response.raise_for_status() response_json = await response.json() return response_json["data"]["extract"]["relevant_extract"] @@ -600,7 +613,9 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic headers["Authorization"] = f"Bearer {api_key}" async with aiohttp.ClientSession() as session: - async with session.post(JINA_SEARCH_API_URL, json=data, headers=headers) as response: + async with session.post( + JINA_SEARCH_API_URL, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT + ) as response: if response.status != 200: error_text = await response.text() logger.error(f"Jina search failed: {error_text}")