From 71b6905008514cd2cf376f2322de9fc41fa4d04d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Thu, 14 Mar 2024 16:34:04 +0530
Subject: [PATCH] Parallelize simple webpage read and extractor

Similar to what is being done with search_online with olostep
---
 src/khoj/processor/tools/online_search.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 45ccf111..b250fe1a 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -100,12 +100,16 @@ def search_with_google(subquery: str):
 
 async def read_webpages(query: str, conversation_history: dict, location: LocationData):
     "Infer web pages to read from the query and extract relevant information from them"
+    logger.info(f"Inferring web pages to read")
     urls = await infer_webpage_urls(query, conversation_history, location)
-    results: Dict[str, Dict[str, str]] = defaultdict(dict)
-    for url in urls:
-        _, result = await read_webpage_and_extract_content(query, url)
-        results[url]["webpages"] = result
-    return results
+
+    logger.info(f"Reading web pages at: {urls}")
+    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
+    results = await asyncio.gather(*tasks)
+
+    response: Dict[str, Dict[str, str]] = defaultdict(dict)
+    response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None]
+    return response
 
 
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: