From 4085c9b991df9a243b1adaf690f15376366ce793 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 26 Mar 2025 10:54:41 +0530 Subject: [PATCH] Fix infer webpage url step actor to request upto specified max urls Previously we'd always request up to 3 webpage urls via the prompt but read only one of the requested webpage urls. This would degrade the quality of research and default mode, as the model may request reading up to 3 webpage links but get only one of the requested webpages read. This change passes the number of webpages to read down to the AI model dynamically via the updated prompt, so the number of webpages requested to be read should mostly be the same as the number of webpages actually read. Note: For now, the max webpages to read is kept the same as before, at 1. --- src/khoj/processor/conversation/prompts.py | 2 +- src/khoj/processor/tools/online_search.py | 11 +++-------- src/khoj/routers/api_chat.py | 1 + src/khoj/routers/helpers.py | 6 ++++-- src/khoj/routers/research.py | 1 + 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index e6975a0a..20c3ce0d 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -855,7 +855,7 @@ Khoj: infer_webpages_to_read = PromptTemplate.from_template( """ -You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question. +You are Khoj, an advanced web page reading assistant. You are to construct **up to {max_webpages}, valid** webpage urls to read before answering the user's question. - You will receive the conversation history as context. - Add as much context from the previous questions and answers as required to construct the webpage urls. - Use multiple web page urls if required to retrieve the relevant information. 
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 33cc3056..4234b32c 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -55,9 +55,6 @@ OLOSTEP_QUERY_PARAMS = { "expandHtml": "False", } -DEFAULT_MAX_WEBPAGES_TO_READ = 1 -MAX_WEBPAGES_TO_INFER = 10 - async def search_online( query: str, @@ -66,7 +63,7 @@ async def search_online( user: KhojUser, send_status_func: Optional[Callable] = None, custom_filters: List[str] = [], - max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ, + max_webpages_to_read: int = 1, query_images: List[str] = None, previous_subqueries: Set = set(), agent: Agent = None, @@ -282,7 +279,7 @@ async def read_webpages( send_status_func: Optional[Callable] = None, query_images: List[str] = None, agent: Agent = None, - max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ, + max_webpages_to_read: int = 1, query_files: str = None, tracer: dict = {}, ): @@ -290,6 +287,7 @@ async def read_webpages( logger.info(f"Inferring web pages to read") urls = await infer_webpage_urls( query, + max_webpages_to_read, conversation_history, location, user, @@ -299,9 +297,6 @@ async def read_webpages( tracer=tracer, ) - # Get the top 10 web pages to read - urls = urls[:max_webpages_to_read] - logger.info(f"Reading web pages at: {urls}") if send_status_func: webpage_links_str = "\n- " + "\n- ".join(list(urls)) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index e1961b96..2f709ffb 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -1106,6 +1106,7 @@ async def chat( location, user, partial(send_event, ChatEvent.STATUS), + max_webpages_to_read=1, query_images=uploaded_images, agent=agent, query_files=attached_file_context, diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index c7e71b09..acca60b0 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -450,6 
+450,7 @@ async def aget_data_sources_and_output_format( async def infer_webpage_urls( q: str, + max_webpages: int, conversation_history: dict, location_data: LocationData, user: KhojUser, @@ -471,9 +472,10 @@ async def infer_webpage_urls( ) online_queries_prompt = prompts.infer_webpages_to_read.format( - current_date=utc_date, query=q, + max_webpages=max_webpages, chat_history=chat_history, + current_date=utc_date, location=location, username=username, personality_context=personality_context, @@ -502,7 +504,7 @@ async def infer_webpage_urls( if len(valid_unique_urls) == 0: logger.error(f"No valid URLs found in response: {response}") return [] - return list(valid_unique_urls) + return list(valid_unique_urls)[:max_webpages] except Exception: raise ValueError(f"Invalid list of urls: {response}") diff --git a/src/khoj/routers/research.py b/src/khoj/routers/research.py index d534e812..b662dca9 100644 --- a/src/khoj/routers/research.py +++ b/src/khoj/routers/research.py @@ -321,6 +321,7 @@ async def execute_information_collection( location, user, send_status_func, + max_webpages_to_read=1, query_images=query_images, agent=agent, tracer=tracer,