diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 94cde782..f37ae562 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -87,7 +87,7 @@
     function generateOnlineReference(reference, index) {
         // Generate HTML for Chat Reference
-        let title = reference.title;
+        let title = reference.title || reference.link;
         let link = reference.link;
         let snippet = reference.snippet;
         let question = reference.question;
@@ -191,6 +191,15 @@
                     referenceSection.appendChild(polishedReference);
                 }
             }
+
+            if (onlineReference.webpages && onlineReference.webpages.length > 0) {
+                numOnlineReferences += onlineReference.webpages.length;
+                for (let index in onlineReference.webpages) {
+                    let reference = onlineReference.webpages[index];
+                    let polishedReference = generateOnlineReference(reference, index);
+                    referenceSection.appendChild(polishedReference);
+                }
+            }
         }

         return numOnlineReferences;
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 35047c31..438763f2 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -101,7 +101,7 @@ To get started, just start typing below. You can also type / to see a list of co
     function generateOnlineReference(reference, index) {
         // Generate HTML for Chat Reference
-        let title = reference.title;
+        let title = reference.title || reference.link;
         let link = reference.link;
         let snippet = reference.snippet;
         let question = reference.question;
@@ -205,6 +205,15 @@ To get started, just start typing below. You can also type / to see a list of co
                     referenceSection.appendChild(polishedReference);
                 }
             }
+
+            if (onlineReference.webpages && onlineReference.webpages.length > 0) {
+                numOnlineReferences += onlineReference.webpages.length;
+                for (let index in onlineReference.webpages) {
+                    let reference = onlineReference.webpages[index];
+                    let polishedReference = generateOnlineReference(reference, index);
+                    referenceSection.appendChild(polishedReference);
+                }
+            }
         }

         return numOnlineReferences;
diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
index 437bdd3d..d1469ecf 100644
--- a/src/khoj/processor/conversation/offline/chat_model.py
+++ b/src/khoj/processor/conversation/offline/chat_model.py
@@ -177,8 +177,8 @@ def converse_offline(
     if ConversationCommand.Online in conversation_commands:
         simplified_online_results = online_results.copy()
         for result in online_results:
-            if online_results[result].get("extracted_content"):
-                simplified_online_results[result] = online_results[result]["extracted_content"]
+            if online_results[result].get("webpages"):
+                simplified_online_results[result] = online_results[result]["webpages"]

         conversation_primer = f"{prompts.online_search_conversation.format(online_results=str(simplified_online_results))}\n{conversation_primer}"
     if not is_none_or_empty(compiled_references_message):
diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py
index 644bb961..8a2059af 100644
--- a/src/khoj/processor/conversation/openai/gpt.py
+++ b/src/khoj/processor/conversation/openai/gpt.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from datetime import datetime, timedelta
-from typing import Optional
+from typing import Dict, Optional

 from langchain.schema import ChatMessage

@@ -104,7 +104,7 @@ def send_message_to_model(messages, api_key, model, response_type="text"):
 def converse(
     references,
     user_query,
-    online_results: Optional[dict] = None,
+    online_results: Optional[Dict[str, Dict]] = None,
     conversation_log={},
     model: str = "gpt-3.5-turbo",
     api_key: Optional[str] = None,
@@ -142,7 +142,7 @@ def converse(
         completion_func(chat_response=prompts.no_online_results_found.format())
         return iter([prompts.no_online_results_found.format()])

-    if ConversationCommand.Online in conversation_commands:
+    if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands:
         conversation_primer = (
             f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}"
         )
@@ -158,7 +158,7 @@ def converse(
         max_prompt_size,
         tokenizer_name,
     )
-    truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
+    truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
     logger.debug(f"Conversation Context for GPT: {truncated_messages}")

     # Get Response from GPT
diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index b6465ca5..35cdac2d 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -10,7 +10,7 @@
 You were created by Khoj Inc. with the following capabilities:
 - You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you.
 - Users can share files and other information with you using the Khoj Desktop, Obsidian or Emacs app. They can also drag and drop their files into the chat window.
-- You can generate images, look-up information from the internet, and answer questions based on the user's notes.
+- You *CAN* generate images, look-up real-time information from the internet, and answer questions based on the user's notes.
 - You cannot set reminders.
 - Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question.
 - Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations.
@@ -146,7 +146,8 @@ online_search_conversation = PromptTemplate.from_template(
 Use this up-to-date information from the internet to inform your response.
 Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the online data or past conversations.

-Information from the internet: {online_results}
+Information from the internet:
+{online_results}
 """.strip()
 )

@@ -280,7 +281,7 @@
 Target Query: {query}

 Web Pages: {corpus}

-Collate the relevant information from the website to answer the target query.
+Collate only relevant information from the website to answer the target query.
 """.strip()
 )

@@ -362,6 +363,14 @@
 AI: Good morning! How can I help you today?

 Q: How can I share my files with Khoj?
 Khoj: {{"source": ["default", "online"]}}

+Example:
+Chat History:
+User: What is the first element in the periodic table?
+AI: The first element in the periodic table is Hydrogen.
+
+Q: Summarize this article https://en.wikipedia.org/wiki/Hydrogen
+Khoj: {{"source": ["webpage"]}}
+
 Example:
 Chat History:
 User: I want to start a new hobby. I'm thinking of learning to play the guitar.
@@ -380,6 +389,50 @@
 Khoj:
 """.strip()
 )

+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
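Aside on the templates above: the doubled braces (e.g. {{"source": ["webpage"]}} and {{"links": [...]}}) are format-string escapes, not part of the text the model sees. langchain's PromptTemplate uses Python str.format-style templating, so {{ and }} render as single literal braces while fields like {query} are substituted. A minimal standalone sketch with a hypothetical template, not code from this patch:

    from langchain.prompts import PromptTemplate

    # "{url}" and "{query}" are substituted; "{{" and "}}" become literal braces.
    template = PromptTemplate.from_template('Q: {query}\nKhoj: {{"links": ["{url}"]}}')
    print(template.format(query="Summarize this post", url="https://example.org"))
    # Prints:
    # Q: Summarize this post
    # Khoj: {"links": ["https://example.org"]}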
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 597f394e..c9745dc9 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union

 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData

@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1


 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}

@@ -52,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: Locati
     # Gather distinct web pages from organic search results of each subquery without an instant answer
     webpage_links = {
-        result["link"]
+        organic["link"]: subquery
         for subquery in response_dict
-        for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
         if "answerBox" not in response_dict[subquery]
     }

     # Read, extract relevant info from the retrieved web pages
-    tasks = []
-    for webpage_link in webpage_links:
-        logger.info(f"Reading web page at '{webpage_link}'")
-        task = read_webpage_and_extract_content(subquery, webpage_link)
-        tasks.append(task)
+    logger.info(f"Reading web pages at: {webpage_links.keys()}")
+    tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()]
     results = await asyncio.gather(*tasks)

     # Collect extracted info from the retrieved web pages
-    for subquery, extracted_webpage_content in results:
-        if extracted_webpage_content is not None:
-            response_dict[subquery]["extracted_content"] = extracted_webpage_content
+    for subquery, webpage_extract, url in results:
+        if webpage_extract is not None:
+            response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}

     return response_dict

@@ -93,19 +95,35 @@ def search_with_google(subquery: str):
     return extracted_search_result


-async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    logger.info("Inferring web pages to read")
+    urls = await infer_webpage_urls(query, conversation_history, location)
+
+    logger.info(f"Reading web pages at: {urls}")
+    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
+    results = await asyncio.gather(*tasks)
+
+    response: Dict[str, Dict] = defaultdict(dict)
+    response[query]["webpages"] = [
+        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
+    ]
+    return response
+
+
+async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
-        return subquery, extracted_info
+        return subquery, extracted_info, url
     except Exception as e:
         logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None
+        return subquery, None, url


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +147,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
         response.raise_for_status()
         response_json = await response.json()
         return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py
index ff83b95e..94642490 100644
--- a/src/khoj/routers/api_chat.py
+++ b/src/khoj/routers/api_chat.py
@@ -14,7 +14,11 @@ from khoj.database.adapters import ConversationAdapters, EntryAdapters, aget_use
 from khoj.database.models import KhojUser
 from khoj.processor.conversation.prompts import help_message, no_entries_found
 from khoj.processor.conversation.utils import save_to_conversation_log
-from khoj.processor.tools.online_search import search_online
+from khoj.processor.tools.online_search import (
+    online_search_enabled,
+    read_webpages,
+    search_online,
+)
 from khoj.routers.api import extract_references_and_questions
 from khoj.routers.helpers import (
     ApiUserRateLimiter,
@@ -238,6 +242,7 @@ async def chat(
 ) -> Response:
     user: KhojUser = request.user.object
     q = unquote(q)
+    logger.info(f"Chat request by {user.username}: {q}")
     await is_ready_to_chat(user)
     conversation_commands = [get_conversation_command(query=q, any_references=True)]
@@ -280,7 +285,7 @@ async def chat(
     compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
         request, common, meta_log, q, (n or 5), (d or math.inf), conversation_commands, location
     )
-    online_results: Dict = dict()
+    online_results: Dict[str, Dict] = {}

     if conversation_commands == [ConversationCommand.Notes] and not await EntryAdapters.auser_has_entries(user):
         no_entries_found_format = no_entries_found.format()
@@ -294,13 +299,23 @@
         conversation_commands.remove(ConversationCommand.Notes)

     if ConversationCommand.Online in conversation_commands:
+        if not online_search_enabled():
+            conversation_commands.remove(ConversationCommand.Online)
+            # If online search is not enabled, try to read webpages directly
+            if ConversationCommand.Webpage not in conversation_commands:
+                conversation_commands.append(ConversationCommand.Webpage)
+        else:
+            try:
+                online_results = await search_online(defiltered_query, meta_log, location)
+            except ValueError as e:
+                logger.warning(f"Error searching online: {e}. Attempting to respond without online results")
+
+    if ConversationCommand.Webpage in conversation_commands:
         try:
-            online_results = await search_online(defiltered_query, meta_log, location)
+            online_results = await read_webpages(defiltered_query, meta_log, location)
         except ValueError as e:
-            return StreamingResponse(
-                iter(["Please set your SERPER_DEV_API_KEY to get started with online searches 🌐"]),
-                media_type="text/event-stream",
-                status_code=200,
-            )
+            logger.warning(
+                f"Error directly reading webpages: {e}. Attempting to respond without online results", exc_info=True
+            )

     if ConversationCommand.Image in conversation_commands:
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index 724d640a..7a736663 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -167,7 +168,8 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
         chat_history=chat_history,
     )

-    response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")
+    with timer("Chat actor: Infer information sources to refer", logger):
+        response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")

     try:
         response = response.strip()
@@ -211,7 +213,8 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
         chat_history=chat_history,
     )

-    response = await send_message_to_model_wrapper(relevant_mode_prompt)
+    with timer("Chat actor: Infer output mode for chat response", logger):
+        response = await send_message_to_model_wrapper(relevant_mode_prompt)

     try:
         response = response.strip()
@@ -229,6 +232,36 @@
     return ConversationCommand.Default


+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    with timer("Chat actor: Infer webpage urls to read", logger):
+        response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
@@ -244,7 +277,8 @@
         location=location,
     )

-    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+    with timer("Chat actor: Generate online search subqueries", logger):
+        response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")

     # Validate that the response is a non-empty, JSON-serializable list
     try:
@@ -273,9 +307,10 @@ async def extract_relevant_info(q: str, corpus: str) -> Union[str, None]:
         corpus=corpus.strip(),
     )

-    response = await send_message_to_model_wrapper(
-        extract_relevant_information, prompts.system_prompt_extract_relevant_information
-    )
+    with timer("Chat actor: Extract relevant information from data", logger):
+        response = await send_message_to_model_wrapper(
+            extract_relevant_information, prompts.system_prompt_extract_relevant_information
+        )

     return response.strip()

@@ -304,8 +339,8 @@ async def generate_better_image_prompt(
     for result in online_results:
         if online_results[result].get("answerBox"):
             simplified_online_results[result] = online_results[result]["answerBox"]
-        elif online_results[result].get("extracted_content"):
-            simplified_online_results[result] = online_results[result]["extracted_content"]
+        elif online_results[result].get("webpages"):
+            simplified_online_results[result] = online_results[result]["webpages"]

     image_prompt = prompts.image_generation_improve_prompt.format(
         query=q,
@@ -316,7 +351,8 @@
         online_results=simplified_online_results,
     )

-    response = await send_message_to_model_wrapper(image_prompt)
+    with timer("Chat actor: Generate contextual image prompt", logger):
+        response = await send_message_to_model_wrapper(image_prompt)

     return response.strip()

@@ -365,7 +401,7 @@ def generate_chat_response(
     q: str,
     meta_log: dict,
     compiled_references: List[str] = [],
-    online_results: Dict[str, Any] = {},
+    online_results: Dict[str, Dict] = {},
     inferred_queries: List[str] = [],
     conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
     user: KhojUser = None,
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index 150398ee..d713c335 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse

 import torch
 from asgiref.sync import sync_to_async
@@ -270,6 +271,7 @@ class ConversationCommand(str, Enum):
     Notes = "notes"
     Help = "help"
     Online = "online"
+    Webpage = "webpage"
     Image = "image"

@@ -278,15 +280,17 @@ command_descriptions = {
     ConversationCommand.Notes: "Only talk about information that is available in your knowledge base.",
     ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.",
     ConversationCommand.Online: "Search for information on the internet.",
+    ConversationCommand.Webpage: "Get information from the webpage links that you provide.",
     ConversationCommand.Image: "Generate images by describing your imagination in words.",
     ConversationCommand.Help: "Display a help message with all available commands and other metadata.",
 }

 tool_descriptions_for_llm = {
     ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.",
-    ConversationCommand.General: "Use this when you can answer the question without any outside information or personal knowledge",
+    ConversationCommand.General: "To use when you can answer the question without any outside information or personal knowledge",
     ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.",
     ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. Note: **Questions about Khoj should always use this data source**",
+    ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.",
 }

 mode_descriptions_for_llm = {
@@ -340,3 +344,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except Exception:
+        return False
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 086e4895..131c3553 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress

 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers

@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

     # Assert
     assert (
diff --git a/tests/test_openai_chat_actors.py b/tests/test_openai_chat_actors.py
index 8db577e9..e7c4f895 100644
--- a/tests/test_openai_chat_actors.py
+++ b/tests/test_openai_chat_actors.py
@@ -11,6 +11,7 @@ from khoj.routers.helpers import (
     aget_relevant_information_sources,
     aget_relevant_output_modes,
     generate_online_subqueries,
+    infer_webpage_urls,
 )
 from khoj.utils.helpers import ConversationCommand

@@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client):
     assert ConversationCommand.Online in conversation_commands


+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.anyio
+@pytest.mark.django_db(transaction=True)
+async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client):
+    # Arrange
+    user_query = "Summarize the wikipedia page on the history of the internet"
+
+    # Act
+    conversation_commands = await aget_relevant_information_sources(user_query, {})
+
+    # Assert
+    assert ConversationCommand.Webpage in conversation_commands
+
+
+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.anyio
+@pytest.mark.django_db(transaction=True)
+async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client):
+    # Arrange
+    user_query = "Summarize the wikipedia page on the history of the internet"
+
+    # Act
+    urls = await infer_webpage_urls(user_query, {}, None)
+
+    # Assert
+    assert "https://en.wikipedia.org/wiki/History_of_the_Internet" in urls
+
+
 # Helpers
 # ----------------------------------------------------------------------------------------------------
 def populate_chat_history(message_list):
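For reference, a sketch of the two online_results shapes the new code paths produce, as inferred from search_online() and read_webpages() above; all keys and values below are illustrative placeholders, not real Serper or scraped output:

    # Sketch: search_online() keys results by subquery; "webpages" holds a single dict,
    # since at most MAX_WEBPAGES_TO_READ extracts are attached per subquery.
    search_results = {
        "san francisco climate": {
            "organic": [{"title": "Climate of San Francisco", "link": "https://example.org/sf-climate"}],
            "webpages": {"link": "https://example.org/sf-climate", "snippet": "relevant extract..."},
        }
    }

    # Sketch: read_webpages() keys results by the original query; "webpages" holds a list.
    webpage_results = {
        "summarize https://en.wikipedia.org/wiki/Hydrogen": {
            "webpages": [
                {
                    "query": "summarize https://en.wikipedia.org/wiki/Hydrogen",
                    "link": "https://en.wikipedia.org/wiki/Hydrogen",
                    "snippet": "relevant extract...",
                }
            ]
        }
    }

Note the asymmetry: the reference renderers added to both chat.html files check onlineReference.webpages.length, so only the list-valued shape from read_webpages() is rendered by that block, while the dict-valued shape from search_online() is skipped.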