From a2e79c94bea740abff10be8335e3a9c2773b943a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 16:50:31 +0530 Subject: [PATCH] Pass multiple webpages with their urls in online results context Previously even if MAX_WEBPAGES_TO_READ was > 1, only 1 extracted content would ever be passed. URL of the extracted webpage content wasn't passed to clients in online results context. This limited them from being rendered --- src/khoj/processor/tools/online_search.py | 27 +++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index b250fe1a..c9745dc9 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -57,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: Locati # Gather distinct web pages from organic search results of each subquery without an instant answer webpage_links = { - result["link"] + organic["link"]: subquery for subquery in response_dict - for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] + for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] if "answerBox" not in response_dict[subquery] } # Read, extract relevant info from the retrieved web pages - tasks = [] - for webpage_link in webpage_links: - logger.info(f"Reading web page at '{webpage_link}'") - task = read_webpage_and_extract_content(subquery, webpage_link) - tasks.append(task) + logger.info(f"Reading web pages at: {webpage_links.keys()}") + tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()] results = await asyncio.gather(*tasks) # Collect extracted info from the retrieved web pages - for subquery, webpage_extract in results: + for subquery, webpage_extract, url in results: if webpage_extract is not None: - response_dict[subquery]["webpages"] = webpage_extract + response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract} return response_dict @@ -107,21 +104,23 @@ async def read_webpages(query: str, conversation_history: dict, location: Locati tasks = [read_webpage_and_extract_content(query, url) for url in urls] results = await asyncio.gather(*tasks) - response: Dict[str, Dict[str, str]] = defaultdict(dict) - response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None] + response: Dict[str, Dict] = defaultdict(dict) + response[query]["webpages"] = [ + {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None + ] return response -async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: +async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]: try: with timer(f"Reading web page at '{url}' took", logger): content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url) with timer(f"Extracting relevant information from web page at '{url}' took", logger): extracted_info = await extract_relevant_info(subquery, content) - return subquery, extracted_info + return subquery, extracted_info, url except Exception as e: logger.error(f"Failed to read web page at '{url}' with {e}") - return subquery, None + return subquery, None, url async def read_webpage_at_url(web_url: str) -> str: