mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-04 05:39:06 +00:00
Timeout web search and webpage read requests to providers
This commit is contained in:
@@ -47,6 +47,9 @@ JINA_API_KEY = os.getenv("JINA_API_KEY")
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
||||
FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
|
||||
|
||||
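# Total time allowed, in seconds, for each outbound web search or webpage read HTTP request.
# Passed as `timeout=` to the aiohttp session.get/session.post calls to search and scraping providers.
# NOTE(review): aiohttp documents `aiohttp.ClientTimeout` as the preferred type for `timeout=`; confirm a bare number is still accepted by the pinned aiohttp version.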
|
||||
WEBPAGE_REQUEST_TIMEOUT = 60 # seconds
|
||||
|
||||
OLOSTEP_QUERY_PARAMS = {
|
||||
"timeout": 35, # seconds
|
||||
"waitBeforeScraping": 0, # seconds
|
||||
@@ -215,7 +218,9 @@ async def search_with_firecrawl(query: str, location: LocationData) -> Tuple[str
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
try:
|
||||
async with session.post(firecrawl_api_url, headers=headers, json=payload) as response:
|
||||
async with session.post(
|
||||
firecrawl_api_url, headers=headers, json=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Firecrawl search failed: {error_text}")
|
||||
@@ -257,7 +262,7 @@ async def search_with_searxng(query: str, location: LocationData) -> Tuple[str,
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
try:
|
||||
async with session.get(search_url, params=params) as response:
|
||||
async with session.get(search_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
|
||||
if response.status != 200:
|
||||
logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}")
|
||||
return query, {}
|
||||
@@ -299,7 +304,7 @@ async def search_with_google(query: str, location: LocationData) -> Tuple[str, D
|
||||
}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(base_url, params=params) as response:
|
||||
async with session.get(base_url, params=params, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
|
||||
if response.status != 200:
|
||||
logger.error(await response.text())
|
||||
return query, {}
|
||||
@@ -350,7 +355,9 @@ async def search_with_serper(query: str, location: LocationData) -> Tuple[str, D
|
||||
payload = json.dumps({"q": query, "gl": country_code})
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(SERPER_DEV_URL, headers=headers, data=payload) as response:
|
||||
async with session.post(
|
||||
SERPER_DEV_URL, headers=headers, data=payload, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
logger.error(await response.text())
|
||||
return query, {}
|
||||
@@ -489,7 +496,7 @@ async def read_webpage_at_url(web_url: str) -> str:
|
||||
}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(web_url, headers=headers, timeout=30) as response:
|
||||
async with session.get(web_url, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
|
||||
response.raise_for_status()
|
||||
html = await response.text()
|
||||
parsed_html = BeautifulSoup(html, "html.parser")
|
||||
@@ -503,7 +510,9 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
|
||||
web_scraping_params["url"] = web_url
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
|
||||
async with session.get(
|
||||
api_url, params=web_scraping_params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
response_json = await response.json()
|
||||
return response_json["markdown_content"]
|
||||
@@ -516,7 +525,7 @@ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> st
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(api_url, json=data, headers=headers) as response:
|
||||
async with session.post(api_url, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response:
|
||||
response.raise_for_status()
|
||||
content = await response.text()
|
||||
return content
|
||||
@@ -535,7 +544,9 @@ async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str)
|
||||
}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
|
||||
async with session.post(
|
||||
firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
response_json = await response.json()
|
||||
return response_json["data"]["markdown"]
|
||||
@@ -571,7 +582,9 @@ Collate only relevant information from the website to answer the target query an
|
||||
params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
|
||||
async with session.post(
|
||||
firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
response_json = await response.json()
|
||||
return response_json["data"]["extract"]["relevant_extract"]
|
||||
@@ -600,7 +613,9 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(JINA_SEARCH_API_URL, json=data, headers=headers) as response:
|
||||
async with session.post(
|
||||
JINA_SEARCH_API_URL, json=data, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Jina search failed: {error_text}")
|
||||
|
||||
Reference in New Issue
Block a user