Parallelize simple webpage read and extractor

Similar to what is being done with search_online with olostep
This commit is contained in:
Debanjum Singh Solanky
2024-03-14 16:34:04 +05:30
parent 1167f6ddf9
commit 71b6905008

View File

@@ -100,12 +100,16 @@ def search_with_google(subquery: str):
 async def read_webpages(query: str, conversation_history: dict, location: LocationData):
     "Infer web pages to read from the query and extract relevant information from them"
+    logger.info(f"Inferring web pages to read")
     urls = await infer_webpage_urls(query, conversation_history, location)
-    results: Dict[str, Dict[str, str]] = defaultdict(dict)
-    for url in urls:
-        _, result = await read_webpage_and_extract_content(query, url)
-        results[url]["webpages"] = result
-    return results
+    logger.info(f"Reading web pages at: {urls}")
+    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
+    results = await asyncio.gather(*tasks)
+
+    response: Dict[str, Dict[str, str]] = defaultdict(dict)
+    response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None]
+    return response

 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: