Pass multiple webpages with their URLs in online results context

Previously, even if MAX_WEBPAGES_TO_READ was > 1, only one extracted
webpage's content would ever be passed.

The URL of the extracted webpage content also wasn't passed to clients in
the online results context. This prevented clients from rendering the
source webpages.
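
For context, a minimal sketch of the per-subquery shape clients now receive
(the subquery string and values are hypothetical sample data; the "webpages"
structure matches the diff below):

    # Hypothetical online results context entry after this change
    response_dict = {
        "best pizza in nyc": {
            "organic": [{"link": "https://example.com/pizza"}],  # raw search results
            "webpages": {
                "link": "https://example.com/pizza",  # URL now included, so clients can render the source
                "snippet": "Extracted info relevant to the subquery...",
            },
        },
    }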
Author: Debanjum Singh Solanky
Date:   2024-03-14 16:50:31 +05:30
parent 71b6905008
commit a2e79c94be


@@ -57,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: Locati
     # Gather distinct web pages from organic search results of each subquery without an instant answer
     webpage_links = {
-        result["link"]
+        organic["link"]: subquery
         for subquery in response_dict
-        for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
         if "answerBox" not in response_dict[subquery]
     }
 
     # Read, extract relevant info from the retrieved web pages
-    tasks = []
-    for webpage_link in webpage_links:
-        logger.info(f"Reading web page at '{webpage_link}'")
-        task = read_webpage_and_extract_content(subquery, webpage_link)
-        tasks.append(task)
+    logger.info(f"Reading web pages at: {webpage_links.keys()}")
+    tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()]
     results = await asyncio.gather(*tasks)
 
     # Collect extracted info from the retrieved web pages
-    for subquery, webpage_extract in results:
+    for subquery, webpage_extract, url in results:
         if webpage_extract is not None:
-            response_dict[subquery]["webpages"] = webpage_extract
+            response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     return response_dict
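
Worth noting: keying the comprehension by link means a URL surfaced by several
subqueries is read only once, attributed to the last subquery that produced it,
and subqueries with an instant answer are skipped entirely. A standalone sketch
of that behavior (the sample data is hypothetical):

    # Standalone sketch of the webpage_links dedup behavior (sample data is hypothetical)
    MAX_WEBPAGES_TO_READ = 1

    response_dict = {
        "subquery one": {"organic": [{"link": "https://example.com/a"}]},
        "subquery two": {"organic": [{"link": "https://example.com/a"}]},  # duplicate link
        "subquery three": {"answerBox": {"answer": "42"}, "organic": [{"link": "https://example.com/b"}]},
    }

    webpage_links = {
        organic["link"]: subquery
        for subquery in response_dict
        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
        if "answerBox" not in response_dict[subquery]
    }
    print(webpage_links)  # {'https://example.com/a': 'subquery two'}; link read once, answerBox subquery skipped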
@@ -107,21 +104,23 @@ async def read_webpages(query: str, conversation_history: dict, location: Locati
     tasks = [read_webpage_and_extract_content(query, url) for url in urls]
     results = await asyncio.gather(*tasks)
 
-    response: Dict[str, Dict[str, str]] = defaultdict(dict)
-    response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None]
+    response: Dict[str, Dict] = defaultdict(dict)
+    response[query]["webpages"] = [
+        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
+    ]
     return response
 
 
-async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
+async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
             content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
-        return subquery, extracted_info
+        return subquery, extracted_info, url
     except Exception as e:
         logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None
+        return subquery, None, url
 
 
 async def read_webpage_at_url(web_url: str) -> str:
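
Since read_webpage_and_extract_content now returns a (subquery, extract, url)
triple even on failure, callers can pair every extract with its source URL
without extra bookkeeping. A minimal sketch of consuming those triples,
mirroring the read_webpages change above (the results values are hypothetical):

    # Hypothetical results from asyncio.gather over read_webpage_and_extract_content tasks
    results = [
        ("subquery one", "Relevant extract...", "https://example.com/a"),
        ("subquery two", None, "https://example.com/b"),  # failed read still carries its URL
    ]

    # Failed reads (extract is None) are dropped; successful ones keep their source link
    webpages = [
        {"query": q, "link": url, "snippet": extract}
        for q, extract, url in results
        if extract is not None
    ]
    print(webpages)  # [{'query': 'subquery one', 'link': 'https://example.com/a', 'snippet': 'Relevant extract...'}]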