mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Fix infer webpage url step actor to request up to the specified max urls
Previously we'd always request up to 3 webpage urls via the prompt but read only one of the requested webpage urls. This would degrade the quality of research and default mode, as the model may request reading up to 3 webpage links but get only one of the requested webpages read. This change passes the number of webpages to read down to the AI model dynamically via the updated prompt, so the number of webpages requested to be read should mostly match the number of webpages actually read. Note: for now, the max webpages to read is kept the same as before, at 1.
This commit is contained in:
@@ -855,7 +855,7 @@ Khoj:
|
||||
|
||||
infer_webpages_to_read = PromptTemplate.from_template(
|
||||
"""
|
||||
You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
|
||||
You are Khoj, an advanced web page reading assistant. You are to construct **up to {max_webpages}, valid** webpage urls to read before answering the user's question.
|
||||
- You will receive the conversation history as context.
|
||||
- Add as much context from the previous questions and answers as required to construct the webpage urls.
|
||||
- Use multiple web page urls if required to retrieve the relevant information.
|
||||
|
||||
@@ -55,9 +55,6 @@ OLOSTEP_QUERY_PARAMS = {
|
||||
"expandHtml": "False",
|
||||
}
|
||||
|
||||
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
||||
MAX_WEBPAGES_TO_INFER = 10
|
||||
|
||||
|
||||
async def search_online(
|
||||
query: str,
|
||||
@@ -66,7 +63,7 @@ async def search_online(
|
||||
user: KhojUser,
|
||||
send_status_func: Optional[Callable] = None,
|
||||
custom_filters: List[str] = [],
|
||||
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
||||
max_webpages_to_read: int = 1,
|
||||
query_images: List[str] = None,
|
||||
previous_subqueries: Set = set(),
|
||||
agent: Agent = None,
|
||||
@@ -282,7 +279,7 @@ async def read_webpages(
|
||||
send_status_func: Optional[Callable] = None,
|
||||
query_images: List[str] = None,
|
||||
agent: Agent = None,
|
||||
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
||||
max_webpages_to_read: int = 1,
|
||||
query_files: str = None,
|
||||
tracer: dict = {},
|
||||
):
|
||||
@@ -290,6 +287,7 @@ async def read_webpages(
|
||||
logger.info(f"Inferring web pages to read")
|
||||
urls = await infer_webpage_urls(
|
||||
query,
|
||||
max_webpages_to_read,
|
||||
conversation_history,
|
||||
location,
|
||||
user,
|
||||
@@ -299,9 +297,6 @@ async def read_webpages(
|
||||
tracer=tracer,
|
||||
)
|
||||
|
||||
# Get the top 10 web pages to read
|
||||
urls = urls[:max_webpages_to_read]
|
||||
|
||||
logger.info(f"Reading web pages at: {urls}")
|
||||
if send_status_func:
|
||||
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
||||
|
||||
@@ -1106,6 +1106,7 @@ async def chat(
|
||||
location,
|
||||
user,
|
||||
partial(send_event, ChatEvent.STATUS),
|
||||
max_webpages_to_read=1,
|
||||
query_images=uploaded_images,
|
||||
agent=agent,
|
||||
query_files=attached_file_context,
|
||||
|
||||
@@ -450,6 +450,7 @@ async def aget_data_sources_and_output_format(
|
||||
|
||||
async def infer_webpage_urls(
|
||||
q: str,
|
||||
max_webpages: int,
|
||||
conversation_history: dict,
|
||||
location_data: LocationData,
|
||||
user: KhojUser,
|
||||
@@ -471,9 +472,10 @@ async def infer_webpage_urls(
|
||||
)
|
||||
|
||||
online_queries_prompt = prompts.infer_webpages_to_read.format(
|
||||
current_date=utc_date,
|
||||
query=q,
|
||||
max_webpages=max_webpages,
|
||||
chat_history=chat_history,
|
||||
current_date=utc_date,
|
||||
location=location,
|
||||
username=username,
|
||||
personality_context=personality_context,
|
||||
@@ -502,7 +504,7 @@ async def infer_webpage_urls(
|
||||
if len(valid_unique_urls) == 0:
|
||||
logger.error(f"No valid URLs found in response: {response}")
|
||||
return []
|
||||
return list(valid_unique_urls)
|
||||
return list(valid_unique_urls)[:max_webpages]
|
||||
except Exception:
|
||||
raise ValueError(f"Invalid list of urls: {response}")
|
||||
|
||||
|
||||
@@ -321,6 +321,7 @@ async def execute_information_collection(
|
||||
location,
|
||||
user,
|
||||
send_status_func,
|
||||
max_webpages_to_read=1,
|
||||
query_images=query_images,
|
||||
agent=agent,
|
||||
tracer=tracer,
|
||||
|
||||
Reference in New Issue
Block a user