From 6118d1ff579745e24da762c222a249eb3aaf0f2e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 13 Mar 2024 15:22:57 +0530 Subject: [PATCH 01/10] Create chat actor for directly reading webpages based on user message - Add prompt for the read webpages chat actor to extract, infer webpage links - Make chat actor infer or extract webpage to read directly from user message - Rename previous read_webpage function to more narrow read_webpage_at_url function --- src/khoj/processor/conversation/prompts.py | 44 ++++++++++++++++++++++ src/khoj/processor/tools/online_search.py | 27 +++++++++++-- src/khoj/routers/helpers.py | 30 +++++++++++++++ src/khoj/utils/helpers.py | 10 +++++ tests/test_helpers.py | 7 +++- 5 files changed, 112 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index a4256525..5fa75bca 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -380,6 +380,50 @@ Khoj: """.strip() ) +infer_webpages_to_read = PromptTemplate.from_template( + """ +You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question. +- You will receive the conversation history as context. +- Add as much context from the previous questions and answers as required to construct the webpage urls. +- Use multiple web page urls if required to retrieve the relevant information. +- You have access to the the whole internet to retrieve information. + +Which webpages will you need to read to answer the user's question? +Provide web page links as a list of strings in a JSON object. +Current Date: {current_date} +User's Location: {location} + +Here are some examples: +History: +User: I like to use Hacker News to get my tech news. +AI: Hacker News is an online forum for sharing and discussing the latest tech news. 
It is a great place to learn about new technologies and startups. + +Q: Summarize this post about vector database on Hacker News, https://news.ycombinator.com/item?id=12345 +Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}} + +History: +User: I'm currently living in New York but I'm thinking about moving to San Francisco. +AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene. + +Q: What is the climate like in those cities? +Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}} + +History: +User: Hey, how is it going? +AI: Not too bad. How can I help you today? + +Q: What's the latest news on r/worldnews? +Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}} + +Now it's your turn to share actual webpage urls you'd like to read to answer the user's question. +History: +{chat_history} + +Q: {query} +Khoj: +""".strip() +) + online_search_conversation_subqueries = PromptTemplate.from_template( """ You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question. 
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 597f394e..84ca7bac 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -2,6 +2,7 @@ import asyncio import json import logging import os +from collections import defaultdict from typing import Dict, Tuple, Union import aiohttp @@ -9,7 +10,11 @@ import requests from bs4 import BeautifulSoup from markdownify import markdownify -from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries +from khoj.routers.helpers import ( + extract_relevant_info, + generate_online_subqueries, + infer_webpage_urls, +) from khoj.utils.helpers import is_none_or_empty, timer from khoj.utils.rawconfig import LocationData @@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1 async def search_online(query: str, conversation_history: dict, location: LocationData): - if SERPER_DEV_API_KEY is None: + if not online_search_enabled(): logger.warn("SERPER_DEV_API_KEY is not set") return {} @@ -93,10 +98,20 @@ def search_with_google(subquery: str): return extracted_search_result +async def read_webpages(query: str, conversation_history: dict, location: LocationData): + "Infer web pages to read from the query and extract relevant information from them" + urls = await infer_webpage_urls(query, conversation_history, location) + results: Dict[str, Dict[str, str]] = defaultdict(dict) + for url in urls: + _, result = await read_webpage_and_extract_content(query, url) + results[url]["extracted_content"] = result + return results + + async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: try: with timer(f"Reading web page at '{url}' took", logger): - content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url) + content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url) with timer(f"Extracting relevant information from 
web page at '{url}' took", logger): extracted_info = await extract_relevant_info(subquery, content) return subquery, extracted_info @@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str return subquery, None -async def read_webpage(web_url: str) -> str: +async def read_webpage_at_url(web_url: str) -> str: headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", } @@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str: response.raise_for_status() response_json = await response.json() return response_json["markdown_content"] + + +def online_search_enabled(): + return SERPER_DEV_API_KEY is not None diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 724d640a..a9dd5fb3 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel from khoj.utils.helpers import ( ConversationCommand, is_none_or_empty, + is_valid_url, log_telemetry, mode_descriptions_for_llm, timer, @@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict): return ConversationCommand.Default +async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]: + """ + Infer webpage links from the given query + """ + location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown" + chat_history = construct_chat_history(conversation_history) + + utc_date = datetime.utcnow().strftime("%Y-%m-%d") + online_queries_prompt = prompts.infer_webpages_to_read.format( + current_date=utc_date, + query=q, + chat_history=chat_history, + location=location, + ) + + response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") + + # Validate that the response is a non-empty, JSON-serializable 
list of URLs + try: + response = response.strip() + urls = json.loads(response) + valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)} + if is_none_or_empty(valid_unique_urls): + raise ValueError(f"Invalid list of urls: {response}") + return list(valid_unique_urls) + except Exception: + raise ValueError(f"Invalid list of urls: {response}") + + async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]: """ Generate subqueries from the given query diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 150398ee..4023722e 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -15,6 +15,7 @@ from os import path from pathlib import Path from time import perf_counter from typing import TYPE_CHECKING, Optional, Union +from urllib.parse import urlparse import torch from asgiref.sync import sync_to_async @@ -340,3 +341,12 @@ def in_debug_mode(): """Check if Khoj is running in debug mode. 
Set KHOJ_DEBUG environment variable to true to enable debug mode.""" return is_env_var_true("KHOJ_DEBUG") + + +def is_valid_url(url: str) -> bool: + """Check if a string is a valid URL""" + try: + result = urlparse(url.strip()) + return all([result.scheme, result.netloc]) + except: + return False diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 086e4895..131c3553 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -7,7 +7,10 @@ import pytest from scipy.stats import linregress from khoj.processor.embeddings import EmbeddingsModel -from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep +from khoj.processor.tools.online_search import ( + read_webpage_at_url, + read_webpage_with_olostep, +) from khoj.utils import helpers @@ -90,7 +93,7 @@ async def test_reading_webpage(): website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire" # Act - response = await read_webpage(website) + response = await read_webpage_at_url(website) # Assert assert ( From a6b74328378d1be39188df43d0b282344ede8d9a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 13 Mar 2024 15:27:56 +0530 Subject: [PATCH 02/10] Add webpage chat command for read web pages requested by user Update auto chat command inference prompt to show example of when to use webpage chat command (i.e when url is directly provided in link) --- src/khoj/processor/conversation/prompts.py | 8 ++++++++ src/khoj/utils/helpers.py | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 5fa75bca..1ce0cd49 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -362,6 +362,14 @@ AI: Good morning! How can I help you today? Q: How can I share my files with Khoj? Khoj: {{"source": ["default", "online"]}} +Example: +Chat History: +User: What is the first element in the periodic table? 
+AI: The first element in the periodic table is Hydrogen. + +Q: Summarize this article https://en.wikipedia.org/wiki/Hydrogen +Khoj: {{"source": ["webpage"]}} + Example: Chat History: User: I want to start a new hobby. I'm thinking of learning to play the guitar. diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 4023722e..d713c335 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -271,6 +271,7 @@ class ConversationCommand(str, Enum): Notes = "notes" Help = "help" Online = "online" + Webpage = "webpage" Image = "image" @@ -279,15 +280,17 @@ command_descriptions = { ConversationCommand.Notes: "Only talk about information that is available in your knowledge base.", ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.", ConversationCommand.Online: "Search for information on the internet.", + ConversationCommand.Webpage: "Get information from webpage links provided by you.", ConversationCommand.Image: "Generate images by describing your imagination in words.", ConversationCommand.Help: "Display a help message with all available commands and other metadata.", } tool_descriptions_for_llm = { ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.", - ConversationCommand.General: "Use this when you can answer the question without any outside information or personal knowledge", + ConversationCommand.General: "To use when you can answer the question without any outside information or personal knowledge", ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.", ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. 
Note: **Questions about Khoj should always use this data source**", + ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.", } mode_descriptions_for_llm = { From ad6f6bb0ed206c0f115761d0ee21020a867395cc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 13 Mar 2024 15:30:24 +0530 Subject: [PATCH 03/10] Support webpage command in chat API - Fallback to use webpage when SERPER not setup and online command was attempted - Do not stop responding if can't retrieve online results. Try to respond without the online context --- src/khoj/processor/conversation/openai/gpt.py | 6 ++-- src/khoj/routers/api_chat.py | 28 ++++++++++++++----- src/khoj/routers/helpers.py | 2 +- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py index dd01efd4..c3edf334 100644 --- a/src/khoj/processor/conversation/openai/gpt.py +++ b/src/khoj/processor/conversation/openai/gpt.py @@ -1,7 +1,7 @@ import json import logging from datetime import datetime, timedelta -from typing import Optional +from typing import Dict, Optional from langchain.schema import ChatMessage @@ -103,7 +103,7 @@ def send_message_to_model(messages, api_key, model, response_type="text"): def converse( references, user_query, - online_results: Optional[dict] = None, + online_results: Optional[Dict[str, Dict]] = None, conversation_log={}, model: str = "gpt-3.5-turbo", api_key: Optional[str] = None, @@ -141,7 +141,7 @@ def converse( completion_func(chat_response=prompts.no_online_results_found.format()) return iter([prompts.no_online_results_found.format()]) - if ConversationCommand.Online in conversation_commands: + if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands: conversation_primer = ( 
f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}" ) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 7a99869c..77888ce5 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -14,7 +14,11 @@ from khoj.database.adapters import ConversationAdapters, EntryAdapters, aget_use from khoj.database.models import KhojUser from khoj.processor.conversation.prompts import help_message, no_entries_found from khoj.processor.conversation.utils import save_to_conversation_log -from khoj.processor.tools.online_search import search_online +from khoj.processor.tools.online_search import ( + online_search_enabled, + read_webpages, + search_online, +) from khoj.routers.api import extract_references_and_questions from khoj.routers.helpers import ( ApiUserRateLimiter, @@ -274,7 +278,7 @@ async def chat( compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions( request, common, meta_log, q, (n or 5), (d or math.inf), conversation_commands, location ) - online_results: Dict = dict() + online_results: Dict[str, Dict] = {} if conversation_commands == [ConversationCommand.Notes] and not await EntryAdapters.auser_has_entries(user): no_entries_found_format = no_entries_found.format() @@ -288,13 +292,23 @@ async def chat( conversation_commands.remove(ConversationCommand.Notes) if ConversationCommand.Online in conversation_commands: + if not online_search_enabled(): + conversation_commands.remove(ConversationCommand.Online) + # If online search is not enabled, try to read webpages directly + if ConversationCommand.Webpage not in conversation_commands: + conversation_commands.append(ConversationCommand.Webpage) + else: + try: + online_results = await search_online(defiltered_query, meta_log, location) + except ValueError as e: + logger.warning(f"Error searching online: {e}. 
Attempting to respond without online results") + + if ConversationCommand.Webpage in conversation_commands: try: - online_results = await search_online(defiltered_query, meta_log, location) + online_results = await read_webpages(defiltered_query, meta_log, location) except ValueError as e: - return StreamingResponse( - iter(["Please set your SERPER_DEV_API_KEY to get started with online searches 🌐"]), - media_type="text/event-stream", - status_code=200, + logger.warning( + f"Error directly reading webpages: {e}. Attempting to respond without online results", exc_info=True ) if ConversationCommand.Image in conversation_commands: diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index a9dd5fb3..fb2a5df5 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -395,7 +395,7 @@ def generate_chat_response( q: str, meta_log: dict, compiled_references: List[str] = [], - online_results: Dict[str, Any] = {}, + online_results: Dict[str, Dict] = {}, inferred_queries: List[str] = [], conversation_commands: List[ConversationCommand] = [ConversationCommand.Default], user: KhojUser = None, From 85c62efca1114dfeb186d2fb2bbc2bffadc084bd Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 13 Mar 2024 17:18:47 +0530 Subject: [PATCH 04/10] Test select webpage as data source and extract web urls chat actors --- tests/test_openai_chat_actors.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_openai_chat_actors.py b/tests/test_openai_chat_actors.py index 8db577e9..e7c4f895 100644 --- a/tests/test_openai_chat_actors.py +++ b/tests/test_openai_chat_actors.py @@ -11,6 +11,7 @@ from khoj.routers.helpers import ( aget_relevant_information_sources, aget_relevant_output_modes, generate_online_subqueries, + infer_webpage_urls, ) from khoj.utils.helpers import ConversationCommand @@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client): assert 
ConversationCommand.Online in conversation_commands +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.anyio +@pytest.mark.django_db(transaction=True) +async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client): + # Arrange + user_query = "Summarize the wikipedia page on the history of the internet" + + # Act + conversation_commands = await aget_relevant_information_sources(user_query, {}) + + # Assert + assert ConversationCommand.Webpage in conversation_commands + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.anyio +@pytest.mark.django_db(transaction=True) +async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client): + # Arrange + user_query = "Summarize the wikipedia page on the history of the internet" + + # Act + urls = await infer_webpage_urls(user_query, {}, None) + + # Assert + assert "https://en.wikipedia.org/wiki/History_of_the_Internet" in urls + + # Helpers # ---------------------------------------------------------------------------------------------------- def populate_chat_history(message_list): From b22a7dae5d70000d0579b7be7e775296ce292f2c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 16:52:28 +0530 Subject: [PATCH 05/10] Tweak prompts to extract information from webpages, online results - Show more of the truncated messages for debugging context - Update Khoj personality prompt to encourage it to remember it's capabilities --- src/khoj/processor/conversation/openai/gpt.py | 2 +- src/khoj/processor/conversation/prompts.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py index c3edf334..178a9e44 100644 --- a/src/khoj/processor/conversation/openai/gpt.py +++ b/src/khoj/processor/conversation/openai/gpt.py @@ -157,7 +157,7 @@ def 
converse( max_prompt_size, tokenizer_name, ) - truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages}) + truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages}) logger.debug(f"Conversation Context for GPT: {truncated_messages}") # Get Response from GPT diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 1ce0cd49..1f4a0cb9 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -10,7 +10,7 @@ You were created by Khoj Inc. with the following capabilities: - You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you. - Users can share files and other information with you using the Khoj Desktop, Obsidian or Emacs app. They can also drag and drop their files into the chat window. -- You can generate images, look-up information from the internet, and answer questions based on the user's notes. +- You *CAN* generate images, look-up real-time information from the internet, and answer questions based on the user's notes. - You cannot set reminders. - Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question. - Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations. @@ -146,7 +146,8 @@ online_search_conversation = PromptTemplate.from_template( Use this up-to-date information from the internet to inform your response. Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the online data or past conversations. -Information from the internet: {online_results} +Information from the internet: +{online_results} """.strip() ) @@ -280,7 +281,7 @@ Target Query: {query} Web Pages: {corpus} -Collate the relevant information from the website to answer the target query. 
+Collate only relevant information from the website to answer the target query. """.strip() ) From 1167f6ddf9b0661264136940ce517bc15930e4dc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 15:52:21 +0530 Subject: [PATCH 06/10] Rename extract_content online results field to webpages --- src/khoj/processor/conversation/offline/chat_model.py | 4 ++-- src/khoj/processor/tools/online_search.py | 8 ++++---- src/khoj/routers/helpers.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py index 437bdd3d..d1469ecf 100644 --- a/src/khoj/processor/conversation/offline/chat_model.py +++ b/src/khoj/processor/conversation/offline/chat_model.py @@ -177,8 +177,8 @@ def converse_offline( if ConversationCommand.Online in conversation_commands: simplified_online_results = online_results.copy() for result in online_results: - if online_results[result].get("extracted_content"): - simplified_online_results[result] = online_results[result]["extracted_content"] + if online_results[result].get("webpages"): + simplified_online_results[result] = online_results[result]["webpages"] conversation_primer = f"{prompts.online_search_conversation.format(online_results=str(simplified_online_results))}\n{conversation_primer}" if not is_none_or_empty(compiled_references_message): diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 84ca7bac..45ccf111 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -72,9 +72,9 @@ async def search_online(query: str, conversation_history: dict, location: Locati results = await asyncio.gather(*tasks) # Collect extracted info from the retrieved web pages - for subquery, extracted_webpage_content in results: - if extracted_webpage_content is not None: - response_dict[subquery]["extracted_content"] = 
extracted_webpage_content + for subquery, webpage_extract in results: + if webpage_extract is not None: + response_dict[subquery]["webpages"] = webpage_extract return response_dict @@ -104,7 +104,7 @@ async def read_webpages(query: str, conversation_history: dict, location: Locati results: Dict[str, Dict[str, str]] = defaultdict(dict) for url in urls: _, result = await read_webpage_and_extract_content(query, url) - results[url]["extracted_content"] = result + results[url]["webpages"] = result return results diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index fb2a5df5..e16f443d 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -334,8 +334,8 @@ async def generate_better_image_prompt( for result in online_results: if online_results[result].get("answerBox"): simplified_online_results[result] = online_results[result]["answerBox"] - elif online_results[result].get("extracted_content"): - simplified_online_results[result] = online_results[result]["extracted_content"] + elif online_results[result].get("webpages"): + simplified_online_results[result] = online_results[result]["webpages"] image_prompt = prompts.image_generation_improve_prompt.format( query=q, From 71b6905008514cd2cf376f2322de9fc41fa4d04d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 16:34:04 +0530 Subject: [PATCH 07/10] Parallelize simple webpage read and extractor Similar to what is being done with search_online with olostep --- src/khoj/processor/tools/online_search.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 45ccf111..b250fe1a 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -100,12 +100,16 @@ def search_with_google(subquery: str): async def read_webpages(query: str, conversation_history: dict, location: LocationData): "Infer web pages to 
read from the query and extract relevant information from them" + logger.info(f"Inferring web pages to read") urls = await infer_webpage_urls(query, conversation_history, location) - results: Dict[str, Dict[str, str]] = defaultdict(dict) - for url in urls: - _, result = await read_webpage_and_extract_content(query, url) - results[url]["webpages"] = result - return results + + logger.info(f"Reading web pages at: {urls}") + tasks = [read_webpage_and_extract_content(query, url) for url in urls] + results = await asyncio.gather(*tasks) + + response: Dict[str, Dict[str, str]] = defaultdict(dict) + response[query]["webpages"] = [web_extract for _, web_extract in results if web_extract is not None] + return response async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: From a2e79c94bea740abff10be8335e3a9c2773b943a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 16:50:31 +0530 Subject: [PATCH 08/10] Pass multiple webpages with their urls in online results context Previously even if MAX_WEBPAGES_TO_READ was > 1, only 1 extracted content would ever be passed. URL of the extracted webpage content wasn't passed to clients in online results context. 
This limited them from being rendered --- src/khoj/processor/tools/online_search.py | 27 +++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index b250fe1a..c9745dc9 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -57,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: Locati # Gather distinct web pages from organic search results of each subquery without an instant answer webpage_links = { - result["link"] + organic["link"]: subquery for subquery in response_dict - for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] + for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] if "answerBox" not in response_dict[subquery] } # Read, extract relevant info from the retrieved web pages - tasks = [] - for webpage_link in webpage_links: - logger.info(f"Reading web page at '{webpage_link}'") - task = read_webpage_and_extract_content(subquery, webpage_link) - tasks.append(task) + logger.info(f"Reading web pages at: {webpage_links.keys()}") + tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()] results = await asyncio.gather(*tasks) # Collect extracted info from the retrieved web pages - for subquery, webpage_extract in results: + for subquery, webpage_extract, url in results: if webpage_extract is not None: - response_dict[subquery]["webpages"] = webpage_extract + response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract} return response_dict @@ -107,21 +104,23 @@ async def read_webpages(query: str, conversation_history: dict, location: Locati tasks = [read_webpage_and_extract_content(query, url) for url in urls] results = await asyncio.gather(*tasks) - response: Dict[str, Dict[str, str]] = defaultdict(dict) - response[query]["webpages"] = 
[web_extract for _, web_extract in results if web_extract is not None] + response: Dict[str, Dict] = defaultdict(dict) + response[query]["webpages"] = [ + {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None + ] return response -async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: +async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]: try: with timer(f"Reading web page at '{url}' took", logger): content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url) with timer(f"Extracting relevant information from web page at '{url}' took", logger): extracted_info = await extract_relevant_info(subquery, content) - return subquery, extracted_info + return subquery, extracted_info, url except Exception as e: logger.error(f"Failed to read web page at '{url}' with {e}") - return subquery, None + return subquery, None, url async def read_webpage_at_url(web_url: str) -> str: From dabf71bc3ceca3d881d036baa58058445b9f1dcf Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Mar 2024 18:25:54 +0530 Subject: [PATCH 09/10] Render webpage read in chat response references on Web, Desktop apps --- src/interface/desktop/chat.html | 11 ++++++++++- src/khoj/interface/web/chat.html | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html index cc081da7..02ccf3b4 100644 --- a/src/interface/desktop/chat.html +++ b/src/interface/desktop/chat.html @@ -87,7 +87,7 @@ function generateOnlineReference(reference, index) { // Generate HTML for Chat Reference - let title = reference.title; + let title = reference.title || reference.link; let link = reference.link; let snippet = reference.snippet; let question = reference.question; @@ -191,6 +191,15 @@ referenceSection.appendChild(polishedReference); } } + 
+ if (onlineReference.webpages && onlineReference.webpages.length > 0) { + numOnlineReferences += onlineReference.webpages.length; + for (let index in onlineReference.webpages) { + let reference = onlineReference.webpages[index]; + let polishedReference = generateOnlineReference(reference, index); + referenceSection.appendChild(polishedReference); + } + } } return numOnlineReferences; diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index c251bff2..73b4af0e 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -101,7 +101,7 @@ To get started, just start typing below. You can also type / to see a list of co function generateOnlineReference(reference, index) { // Generate HTML for Chat Reference - let title = reference.title; + let title = reference.title || reference.link; let link = reference.link; let snippet = reference.snippet; let question = reference.question; @@ -205,6 +205,15 @@ To get started, just start typing below. 
You can also type / to see a list of co
                 referenceSection.appendChild(polishedReference);
             }
         }
+
+        if (onlineReference.webpages && onlineReference.webpages.length > 0) {
+            numOnlineReferences += onlineReference.webpages.length;
+            for (let index in onlineReference.webpages) {
+                let reference = onlineReference.webpages[index];
+                let polishedReference = generateOnlineReference(reference, index);
+                referenceSection.appendChild(polishedReference);
+            }
+        }
     }

     return numOnlineReferences;

From 9e52ae9e9835a0b42125aa233bff0257b0134c7d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 24 Mar 2024 15:44:52 +0530
Subject: [PATCH 10/10] Time chat actor responses & chat api request start for
 perf analysis

---
 src/khoj/routers/api_chat.py |  1 +
 src/khoj/routers/helpers.py  | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py
index 77888ce5..48951565 100644
--- a/src/khoj/routers/api_chat.py
+++ b/src/khoj/routers/api_chat.py
@@ -242,6 +242,7 @@ async def chat(
 ) -> Response:
     user: KhojUser = request.user.object
     q = unquote(q)
+    logger.info(f"Chat request by {user.username}: {q}")

     await is_ready_to_chat(user)
     conversation_commands = [get_conversation_command(query=q, any_references=True)]
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index e16f443d..7a736663 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -168,7 +168,8 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
         chat_history=chat_history,
     )

-    response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")
+    with timer("Chat actor: Infer information sources to refer", logger):
+        response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")

     try:
         response = response.strip()
@@ -212,7 +213,8 @@ async def aget_relevant_output_modes(query: str, conversation_history: 
dict): chat_history=chat_history, ) - response = await send_message_to_model_wrapper(relevant_mode_prompt) + with timer("Chat actor: Infer output mode for chat response", logger): + response = await send_message_to_model_wrapper(relevant_mode_prompt) try: response = response.strip() @@ -245,7 +247,8 @@ async def infer_webpage_urls(q: str, conversation_history: dict, location_data: location=location, ) - response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") + with timer("Chat actor: Infer webpage urls to read", logger): + response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") # Validate that the response is a non-empty, JSON-serializable list of URLs try: @@ -274,7 +277,8 @@ async def generate_online_subqueries(q: str, conversation_history: dict, locatio location=location, ) - response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") + with timer("Chat actor: Generate online search subqueries", logger): + response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") # Validate that the response is a non-empty, JSON-serializable list try: @@ -303,9 +307,10 @@ async def extract_relevant_info(q: str, corpus: str) -> Union[str, None]: corpus=corpus.strip(), ) - response = await send_message_to_model_wrapper( - extract_relevant_information, prompts.system_prompt_extract_relevant_information - ) + with timer("Chat actor: Extract relevant information from data", logger): + response = await send_message_to_model_wrapper( + extract_relevant_information, prompts.system_prompt_extract_relevant_information + ) return response.strip() @@ -346,7 +351,8 @@ async def generate_better_image_prompt( online_results=simplified_online_results, ) - response = await send_message_to_model_wrapper(image_prompt) + with timer("Chat actor: Generate contextual image prompt", logger): + response = await 
send_message_to_model_wrapper(image_prompt) return response.strip()