Add a timeout to web search and webpage read requests to providers

This commit is contained in:
Debanjum
2025-07-09 15:51:05 -07:00
parent 1988a8d023
commit b763dbfb2b

View File

@@ -47,6 +47,9 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
# Timeout for web search and webpage read HTTP requests
WEBPAGE_REQUEST_TIMEOUT = 60 # seconds
OLOSTEP_QUERY_PARAMS = {
"timeout": 35, # seconds
"waitBeforeScraping": 0, # seconds
@@ -215,7 +218,9 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str
async with aiohttp.ClientSession() as session:
try:
async with session.post(firecrawl_api_url, headers=headers, json=payload) as response:
async with session.post(
firecrawl_api_url, headers=headers, json=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Firecrawl search failed: {error_text}")
@@ -257,7 +262,7 @@ async def search_with_searxng(query: str, location: LocationData) -> Tuple[str,
async with aiohttp.ClientSession() as session:
try:
async with session.get(search_url, params=params) as response:
async with session.get(search_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
if response.status != 200:
logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}")
return query, {}
@@ -299,7 +304,7 @@ async def search_with_google(query: str, location: LocationData) -> Tuple[str, D
}
async with aiohttp.ClientSession() as session:
async with session.get(base_url, params=params) as response:
async with session.get(base_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
if response.status != 200:
logger.error(await response.text())
return query, {}
@@ -350,7 +355,9 @@ async def search_with_serper(query: str, location: LocationData) -> Tuple[str, D
payload = json.dumps({"q": query, "gl": country_code})
async with aiohttp.ClientSession() as session:
async with session.post(SERPER_DEV_URL, headers=headers, data=payload) as response:
async with session.post(
SERPER_DEV_URL, headers=headers, data=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
if response.status != 200:
logger.error(await response.text())
return query, {}
@@ -489,7 +496,7 @@ async def read_webpage_at_url(web_url: str) -> str:
}
async with aiohttp.ClientSession() as session:
async with session.get(web_url, headers=headers, timeout=30) as response:
async with session.get(web_url, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
response.raise_for_status()
html = await response.text()
parsed_html = BeautifulSoup(html, "html.parser")
@@ -503,7 +510,9 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
web_scraping_params["url"] = web_url
async with aiohttp.ClientSession() as session:
async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
async with session.get(
api_url, params=web_scraping_params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["markdown_content"]
@@ -516,7 +525,7 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st
headers["Authorization"] = f"Bearer {api_key}"
async with aiohttp.ClientSession() as session:
async with session.post(api_url, json=data, headers=headers) as response:
async with session.post(api_url, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
response.raise_for_status()
content = await response.text()
return content
@@ -535,7 +544,9 @@ async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str)
}
async with aiohttp.ClientSession() as session:
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
async with session.post(
firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["data"]["markdown"]
@@ -571,7 +582,9 @@ Collate only relevant information from the website to answer the target query an
params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
async with aiohttp.ClientSession() as session:
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
async with session.post(
firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["data"]["extract"]["relevant_extract"]
@@ -600,7 +613,9 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
headers["Authorization"] = f"Bearer {api_key}"
async with aiohttp.ClientSession() as session:
async with session.post(JINA_SEARCH_API_URL, json=data, headers=headers) as response:
async with session.post(
JINA_SEARCH_API_URL, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Jina search failed: {error_text}")