From 71b6905008514cd2cf376f2322de9fc41fa4d04d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Thu, 14 Mar 2024 16:34:04 +0530
Subject: [PATCH] Parallelize simple webpage reader and extractor

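Read and extract content from the inferred web pages concurrently
instead of one page at a time, similar to what is already being done
in search_online with olostep.

The response is now keyed by the query rather than by each URL, with
the non-empty extracts from all pages collected in a list under the
"webpages" key.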
---
 src/khoj/processor/tools/online_search.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 45ccf111..b250fe1a 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -100,12 +100,16 @@ def search_with_google(subquery: str):
 
 async def read_webpages(query: str, conversation_history: dict, location: LocationData):
     "Infer web pages to read from the query and extract relevant information from them"
+    logger.info(f"Inferring web pages to read")
     urls = await infer_webpage_urls(query, conversation_history, location)
-    results: Dict[str, Dict[str, str]] = defaultdict(dict)
-    for url in urls:
-        _, result = await read_webpage_and_extract_content(query, url)
-        results[url]["webpages"] = result
-    return results
+
+    logger.info(f"Reading web pages at: {urls}")
+    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
+    results = await asyncio.gather(*tasks)
+
+    response: Dict[str, Dict[str, str]] = defaultdict(dict)
+    response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None]
+    return response
 
 
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: