Allow directly reading web pages, even when SERP not enabled (#676)

### Overview
Khoj can now read websites directly without needing to go through the search step first

### Details
- Parallelize simple webpage read and extractor
- Rename extract_content online results field to web pages
- Tweak prompts to extract information from webpages, online results
- Test select webpage as data source and extract web urls chat actors

- Render webpage read in chat response references on Web, Desktop apps
- Pass multiple webpages with their urls in online results context

- Support webpage command in chat API
- Add webpage chat command to read web pages requested by user
- Create chat actor for directly reading webpages based on user message
This commit is contained in:
Debanjum
2024-03-24 16:25:25 +05:30
committed by GitHub
11 changed files with 237 additions and 48 deletions

View File

@@ -87,7 +87,7 @@
function generateOnlineReference(reference, index) { function generateOnlineReference(reference, index) {
// Generate HTML for Chat Reference // Generate HTML for Chat Reference
let title = reference.title; let title = reference.title || reference.link;
let link = reference.link; let link = reference.link;
let snippet = reference.snippet; let snippet = reference.snippet;
let question = reference.question; let question = reference.question;
@@ -191,6 +191,15 @@
referenceSection.appendChild(polishedReference); referenceSection.appendChild(polishedReference);
} }
} }
if (onlineReference.webpages && onlineReference.webpages.length > 0) {
numOnlineReferences += onlineReference.webpages.length;
for (let index in onlineReference.webpages) {
let reference = onlineReference.webpages[index];
let polishedReference = generateOnlineReference(reference, index);
referenceSection.appendChild(polishedReference);
}
}
} }
return numOnlineReferences; return numOnlineReferences;

View File

@@ -101,7 +101,7 @@ To get started, just start typing below. You can also type / to see a list of co
function generateOnlineReference(reference, index) { function generateOnlineReference(reference, index) {
// Generate HTML for Chat Reference // Generate HTML for Chat Reference
let title = reference.title; let title = reference.title || reference.link;
let link = reference.link; let link = reference.link;
let snippet = reference.snippet; let snippet = reference.snippet;
let question = reference.question; let question = reference.question;
@@ -205,6 +205,15 @@ To get started, just start typing below. You can also type / to see a list of co
referenceSection.appendChild(polishedReference); referenceSection.appendChild(polishedReference);
} }
} }
if (onlineReference.webpages && onlineReference.webpages.length > 0) {
numOnlineReferences += onlineReference.webpages.length;
for (let index in onlineReference.webpages) {
let reference = onlineReference.webpages[index];
let polishedReference = generateOnlineReference(reference, index);
referenceSection.appendChild(polishedReference);
}
}
} }
return numOnlineReferences; return numOnlineReferences;

View File

@@ -177,8 +177,8 @@ def converse_offline(
if ConversationCommand.Online in conversation_commands: if ConversationCommand.Online in conversation_commands:
simplified_online_results = online_results.copy() simplified_online_results = online_results.copy()
for result in online_results: for result in online_results:
if online_results[result].get("extracted_content"): if online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["extracted_content"] simplified_online_results[result] = online_results[result]["webpages"]
conversation_primer = f"{prompts.online_search_conversation.format(online_results=str(simplified_online_results))}\n{conversation_primer}" conversation_primer = f"{prompts.online_search_conversation.format(online_results=str(simplified_online_results))}\n{conversation_primer}"
if not is_none_or_empty(compiled_references_message): if not is_none_or_empty(compiled_references_message):

View File

@@ -1,7 +1,7 @@
import json import json
import logging import logging
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Optional from typing import Dict, Optional
from langchain.schema import ChatMessage from langchain.schema import ChatMessage
@@ -104,7 +104,7 @@ def send_message_to_model(messages, api_key, model, response_type="text"):
def converse( def converse(
references, references,
user_query, user_query,
online_results: Optional[dict] = None, online_results: Optional[Dict[str, Dict]] = None,
conversation_log={}, conversation_log={},
model: str = "gpt-3.5-turbo", model: str = "gpt-3.5-turbo",
api_key: Optional[str] = None, api_key: Optional[str] = None,
@@ -142,7 +142,7 @@ def converse(
completion_func(chat_response=prompts.no_online_results_found.format()) completion_func(chat_response=prompts.no_online_results_found.format())
return iter([prompts.no_online_results_found.format()]) return iter([prompts.no_online_results_found.format()])
if ConversationCommand.Online in conversation_commands: if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands:
conversation_primer = ( conversation_primer = (
f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}" f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}"
) )
@@ -158,7 +158,7 @@ def converse(
max_prompt_size, max_prompt_size,
tokenizer_name, tokenizer_name,
) )
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages}) truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
logger.debug(f"Conversation Context for GPT: {truncated_messages}") logger.debug(f"Conversation Context for GPT: {truncated_messages}")
# Get Response from GPT # Get Response from GPT

View File

@@ -10,7 +10,7 @@ You were created by Khoj Inc. with the following capabilities:
- You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you. - You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you.
- Users can share files and other information with you using the Khoj Desktop, Obsidian or Emacs app. They can also drag and drop their files into the chat window. - Users can share files and other information with you using the Khoj Desktop, Obsidian or Emacs app. They can also drag and drop their files into the chat window.
- You can generate images, look-up information from the internet, and answer questions based on the user's notes. - You *CAN* generate images, look-up real-time information from the internet, and answer questions based on the user's notes.
- You cannot set reminders. - You cannot set reminders.
- Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question. - Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question.
- Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations. - Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations.
@@ -146,7 +146,8 @@ online_search_conversation = PromptTemplate.from_template(
Use this up-to-date information from the internet to inform your response. Use this up-to-date information from the internet to inform your response.
Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the online data or past conversations. Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the online data or past conversations.
Information from the internet: {online_results} Information from the internet:
{online_results}
""".strip() """.strip()
) )
@@ -280,7 +281,7 @@ Target Query: {query}
Web Pages: Web Pages:
{corpus} {corpus}
Collate the relevant information from the website to answer the target query. Collate only relevant information from the website to answer the target query.
""".strip() """.strip()
) )
@@ -362,6 +363,14 @@ AI: Good morning! How can I help you today?
Q: How can I share my files with Khoj? Q: How can I share my files with Khoj?
Khoj: {{"source": ["default", "online"]}} Khoj: {{"source": ["default", "online"]}}
Example:
Chat History:
User: What is the first element in the periodic table?
AI: The first element in the periodic table is Hydrogen.
Q: Summarize this article https://en.wikipedia.org/wiki/Hydrogen
Khoj: {{"source": ["webpage"]}}
Example: Example:
Chat History: Chat History:
User: I want to start a new hobby. I'm thinking of learning to play the guitar. User: I want to start a new hobby. I'm thinking of learning to play the guitar.
@@ -380,6 +389,50 @@ Khoj:
""".strip() """.strip()
) )
infer_webpages_to_read = PromptTemplate.from_template(
"""
You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
- You will receive the conversation history as context.
- Add as much context from the previous questions and answers as required to construct the webpage urls.
- Use multiple web page urls if required to retrieve the relevant information.
- You have access to the the whole internet to retrieve information.
Which webpages will you need to read to answer the user's question?
Provide web page links as a list of strings in a JSON object.
Current Date: {current_date}
User's Location: {location}
Here are some examples:
History:
User: I like to use Hacker News to get my tech news.
AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
Q: Summarize this post about vector database on Hacker News, https://news.ycombinator.com/item?id=12345
Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
History:
User: I'm currently living in New York but I'm thinking about moving to San Francisco.
AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
Q: What is the climate like in those cities?
Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
History:
User: Hey, how is it going?
AI: Not too bad. How can I help you today?
Q: What's the latest news on r/worldnews?
Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
History:
{chat_history}
Q: {query}
Khoj:
""".strip()
)
online_search_conversation_subqueries = PromptTemplate.from_template( online_search_conversation_subqueries = PromptTemplate.from_template(
""" """
You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question. You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.

View File

@@ -2,6 +2,7 @@ import asyncio
import json import json
import logging import logging
import os import os
from collections import defaultdict
from typing import Dict, Tuple, Union from typing import Dict, Tuple, Union
import aiohttp import aiohttp
@@ -9,7 +10,11 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from markdownify import markdownify from markdownify import markdownify
from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries from khoj.routers.helpers import (
extract_relevant_info,
generate_online_subqueries,
infer_webpage_urls,
)
from khoj.utils.helpers import is_none_or_empty, timer from khoj.utils.helpers import is_none_or_empty, timer
from khoj.utils.rawconfig import LocationData from khoj.utils.rawconfig import LocationData
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1
async def search_online(query: str, conversation_history: dict, location: LocationData): async def search_online(query: str, conversation_history: dict, location: LocationData):
if SERPER_DEV_API_KEY is None: if not online_search_enabled():
logger.warn("SERPER_DEV_API_KEY is not set") logger.warn("SERPER_DEV_API_KEY is not set")
return {} return {}
@@ -52,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: Locati
# Gather distinct web pages from organic search results of each subquery without an instant answer # Gather distinct web pages from organic search results of each subquery without an instant answer
webpage_links = { webpage_links = {
result["link"] organic["link"]: subquery
for subquery in response_dict for subquery in response_dict
for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
if "answerBox" not in response_dict[subquery] if "answerBox" not in response_dict[subquery]
} }
# Read, extract relevant info from the retrieved web pages # Read, extract relevant info from the retrieved web pages
tasks = [] logger.info(f"Reading web pages at: {webpage_links.keys()}")
for webpage_link in webpage_links: tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()]
logger.info(f"Reading web page at '{webpage_link}'")
task = read_webpage_and_extract_content(subquery, webpage_link)
tasks.append(task)
results = await asyncio.gather(*tasks) results = await asyncio.gather(*tasks)
# Collect extracted info from the retrieved web pages # Collect extracted info from the retrieved web pages
for subquery, extracted_webpage_content in results: for subquery, webpage_extract, url in results:
if extracted_webpage_content is not None: if webpage_extract is not None:
response_dict[subquery]["extracted_content"] = extracted_webpage_content response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
return response_dict return response_dict
@@ -93,19 +95,35 @@ def search_with_google(subquery: str):
return extracted_search_result return extracted_search_result
async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]: async def read_webpages(query: str, conversation_history: dict, location: LocationData):
"Infer web pages to read from the query and extract relevant information from them"
logger.info(f"Inferring web pages to read")
urls = await infer_webpage_urls(query, conversation_history, location)
logger.info(f"Reading web pages at: {urls}")
tasks = [read_webpage_and_extract_content(query, url) for url in urls]
results = await asyncio.gather(*tasks)
response: Dict[str, Dict] = defaultdict(dict)
response[query]["webpages"] = [
{"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
]
return response
async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]:
try: try:
with timer(f"Reading web page at '{url}' took", logger): with timer(f"Reading web page at '{url}' took", logger):
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url) content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
with timer(f"Extracting relevant information from web page at '{url}' took", logger): with timer(f"Extracting relevant information from web page at '{url}' took", logger):
extracted_info = await extract_relevant_info(subquery, content) extracted_info = await extract_relevant_info(subquery, content)
return subquery, extracted_info return subquery, extracted_info, url
except Exception as e: except Exception as e:
logger.error(f"Failed to read web page at '{url}' with {e}") logger.error(f"Failed to read web page at '{url}' with {e}")
return subquery, None return subquery, None, url
async def read_webpage(web_url: str) -> str: async def read_webpage_at_url(web_url: str) -> str:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
} }
@@ -129,3 +147,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
response.raise_for_status() response.raise_for_status()
response_json = await response.json() response_json = await response.json()
return response_json["markdown_content"] return response_json["markdown_content"]
def online_search_enabled():
return SERPER_DEV_API_KEY is not None

View File

@@ -14,7 +14,11 @@ from khoj.database.adapters import ConversationAdapters, EntryAdapters, aget_use
from khoj.database.models import KhojUser from khoj.database.models import KhojUser
from khoj.processor.conversation.prompts import help_message, no_entries_found from khoj.processor.conversation.prompts import help_message, no_entries_found
from khoj.processor.conversation.utils import save_to_conversation_log from khoj.processor.conversation.utils import save_to_conversation_log
from khoj.processor.tools.online_search import search_online from khoj.processor.tools.online_search import (
online_search_enabled,
read_webpages,
search_online,
)
from khoj.routers.api import extract_references_and_questions from khoj.routers.api import extract_references_and_questions
from khoj.routers.helpers import ( from khoj.routers.helpers import (
ApiUserRateLimiter, ApiUserRateLimiter,
@@ -238,6 +242,7 @@ async def chat(
) -> Response: ) -> Response:
user: KhojUser = request.user.object user: KhojUser = request.user.object
q = unquote(q) q = unquote(q)
logger.info("Chat request by {user.username}: {q}")
await is_ready_to_chat(user) await is_ready_to_chat(user)
conversation_commands = [get_conversation_command(query=q, any_references=True)] conversation_commands = [get_conversation_command(query=q, any_references=True)]
@@ -280,7 +285,7 @@ async def chat(
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions( compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
request, common, meta_log, q, (n or 5), (d or math.inf), conversation_commands, location request, common, meta_log, q, (n or 5), (d or math.inf), conversation_commands, location
) )
online_results: Dict = dict() online_results: Dict[str, Dict] = {}
if conversation_commands == [ConversationCommand.Notes] and not await EntryAdapters.auser_has_entries(user): if conversation_commands == [ConversationCommand.Notes] and not await EntryAdapters.auser_has_entries(user):
no_entries_found_format = no_entries_found.format() no_entries_found_format = no_entries_found.format()
@@ -294,13 +299,23 @@ async def chat(
conversation_commands.remove(ConversationCommand.Notes) conversation_commands.remove(ConversationCommand.Notes)
if ConversationCommand.Online in conversation_commands: if ConversationCommand.Online in conversation_commands:
if not online_search_enabled():
conversation_commands.remove(ConversationCommand.Online)
# If online search is not enabled, try to read webpages directly
if ConversationCommand.Webpage not in conversation_commands:
conversation_commands.append(ConversationCommand.Webpage)
else:
try: try:
online_results = await search_online(defiltered_query, meta_log, location) online_results = await search_online(defiltered_query, meta_log, location)
except ValueError as e: except ValueError as e:
return StreamingResponse( logger.warning(f"Error searching online: {e}. Attempting to respond without online results")
iter(["Please set your SERPER_DEV_API_KEY to get started with online searches 🌐"]),
media_type="text/event-stream", if ConversationCommand.Webpage in conversation_commands:
status_code=200, try:
online_results = await read_webpages(defiltered_query, meta_log, location)
except ValueError as e:
logger.warning(
f"Error directly reading webpages: {e}. Attempting to respond without online results", exc_info=True
) )
if ConversationCommand.Image in conversation_commands: if ConversationCommand.Image in conversation_commands:

View File

@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
from khoj.utils.helpers import ( from khoj.utils.helpers import (
ConversationCommand, ConversationCommand,
is_none_or_empty, is_none_or_empty,
is_valid_url,
log_telemetry, log_telemetry,
mode_descriptions_for_llm, mode_descriptions_for_llm,
timer, timer,
@@ -167,6 +168,7 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
chat_history=chat_history, chat_history=chat_history,
) )
with timer("Chat actor: Infer information sources to refer", logger):
response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object") response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")
try: try:
@@ -211,6 +213,7 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
chat_history=chat_history, chat_history=chat_history,
) )
with timer("Chat actor: Infer output mode for chat response", logger):
response = await send_message_to_model_wrapper(relevant_mode_prompt) response = await send_message_to_model_wrapper(relevant_mode_prompt)
try: try:
@@ -229,6 +232,36 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
return ConversationCommand.Default return ConversationCommand.Default
async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
"""
Infer webpage links from the given query
"""
location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
chat_history = construct_chat_history(conversation_history)
utc_date = datetime.utcnow().strftime("%Y-%m-%d")
online_queries_prompt = prompts.infer_webpages_to_read.format(
current_date=utc_date,
query=q,
chat_history=chat_history,
location=location,
)
with timer("Chat actor: Infer webpage urls to read", logger):
response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
# Validate that the response is a non-empty, JSON-serializable list of URLs
try:
response = response.strip()
urls = json.loads(response)
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
if is_none_or_empty(valid_unique_urls):
raise ValueError(f"Invalid list of urls: {response}")
return list(valid_unique_urls)
except Exception:
raise ValueError(f"Invalid list of urls: {response}")
async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]: async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
""" """
Generate subqueries from the given query Generate subqueries from the given query
@@ -244,6 +277,7 @@ async def generate_online_subqueries(q: str, conversation_history: dict, locatio
location=location, location=location,
) )
with timer("Chat actor: Generate online search subqueries", logger):
response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object") response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
# Validate that the response is a non-empty, JSON-serializable list # Validate that the response is a non-empty, JSON-serializable list
@@ -273,6 +307,7 @@ async def extract_relevant_info(q: str, corpus: str) -> Union[str, None]:
corpus=corpus.strip(), corpus=corpus.strip(),
) )
with timer("Chat actor: Extract relevant information from data", logger):
response = await send_message_to_model_wrapper( response = await send_message_to_model_wrapper(
extract_relevant_information, prompts.system_prompt_extract_relevant_information extract_relevant_information, prompts.system_prompt_extract_relevant_information
) )
@@ -304,8 +339,8 @@ async def generate_better_image_prompt(
for result in online_results: for result in online_results:
if online_results[result].get("answerBox"): if online_results[result].get("answerBox"):
simplified_online_results[result] = online_results[result]["answerBox"] simplified_online_results[result] = online_results[result]["answerBox"]
elif online_results[result].get("extracted_content"): elif online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["extracted_content"] simplified_online_results[result] = online_results[result]["webpages"]
image_prompt = prompts.image_generation_improve_prompt.format( image_prompt = prompts.image_generation_improve_prompt.format(
query=q, query=q,
@@ -316,6 +351,7 @@ async def generate_better_image_prompt(
online_results=simplified_online_results, online_results=simplified_online_results,
) )
with timer("Chat actor: Generate contextual image prompt", logger):
response = await send_message_to_model_wrapper(image_prompt) response = await send_message_to_model_wrapper(image_prompt)
return response.strip() return response.strip()
@@ -365,7 +401,7 @@ def generate_chat_response(
q: str, q: str,
meta_log: dict, meta_log: dict,
compiled_references: List[str] = [], compiled_references: List[str] = [],
online_results: Dict[str, Any] = {}, online_results: Dict[str, Dict] = {},
inferred_queries: List[str] = [], inferred_queries: List[str] = [],
conversation_commands: List[ConversationCommand] = [ConversationCommand.Default], conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
user: KhojUser = None, user: KhojUser = None,

View File

@@ -15,6 +15,7 @@ from os import path
from pathlib import Path from pathlib import Path
from time import perf_counter from time import perf_counter
from typing import TYPE_CHECKING, Optional, Union from typing import TYPE_CHECKING, Optional, Union
from urllib.parse import urlparse
import torch import torch
from asgiref.sync import sync_to_async from asgiref.sync import sync_to_async
@@ -270,6 +271,7 @@ class ConversationCommand(str, Enum):
Notes = "notes" Notes = "notes"
Help = "help" Help = "help"
Online = "online" Online = "online"
Webpage = "webpage"
Image = "image" Image = "image"
@@ -278,15 +280,17 @@ command_descriptions = {
ConversationCommand.Notes: "Only talk about information that is available in your knowledge base.", ConversationCommand.Notes: "Only talk about information that is available in your knowledge base.",
ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.", ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.",
ConversationCommand.Online: "Search for information on the internet.", ConversationCommand.Online: "Search for information on the internet.",
ConversationCommand.Webpage: "Get information from webpage links provided by you.",
ConversationCommand.Image: "Generate images by describing your imagination in words.", ConversationCommand.Image: "Generate images by describing your imagination in words.",
ConversationCommand.Help: "Display a help message with all available commands and other metadata.", ConversationCommand.Help: "Display a help message with all available commands and other metadata.",
} }
tool_descriptions_for_llm = { tool_descriptions_for_llm = {
ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.", ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.",
ConversationCommand.General: "Use this when you can answer the question without any outside information or personal knowledge", ConversationCommand.General: "To use when you can answer the question without any outside information or personal knowledge",
ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.", ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.",
ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. Note: **Questions about Khoj should always use this data source**", ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. Note: **Questions about Khoj should always use this data source**",
ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.",
} }
mode_descriptions_for_llm = { mode_descriptions_for_llm = {
@@ -340,3 +344,12 @@ def in_debug_mode():
"""Check if Khoj is running in debug mode. """Check if Khoj is running in debug mode.
Set KHOJ_DEBUG environment variable to true to enable debug mode.""" Set KHOJ_DEBUG environment variable to true to enable debug mode."""
return is_env_var_true("KHOJ_DEBUG") return is_env_var_true("KHOJ_DEBUG")
def is_valid_url(url: str) -> bool:
"""Check if a string is a valid URL"""
try:
result = urlparse(url.strip())
return all([result.scheme, result.netloc])
except:
return False

View File

@@ -7,7 +7,10 @@ import pytest
from scipy.stats import linregress from scipy.stats import linregress
from khoj.processor.embeddings import EmbeddingsModel from khoj.processor.embeddings import EmbeddingsModel
from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep from khoj.processor.tools.online_search import (
read_webpage_at_url,
read_webpage_with_olostep,
)
from khoj.utils import helpers from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire" website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
# Act # Act
response = await read_webpage(website) response = await read_webpage_at_url(website)
# Assert # Assert
assert ( assert (

View File

@@ -11,6 +11,7 @@ from khoj.routers.helpers import (
aget_relevant_information_sources, aget_relevant_information_sources,
aget_relevant_output_modes, aget_relevant_output_modes,
generate_online_subqueries, generate_online_subqueries,
infer_webpage_urls,
) )
from khoj.utils.helpers import ConversationCommand from khoj.utils.helpers import ConversationCommand
@@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client):
assert ConversationCommand.Online in conversation_commands assert ConversationCommand.Online in conversation_commands
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client):
# Arrange
user_query = "Summarize the wikipedia page on the history of the internet"
# Act
conversation_commands = await aget_relevant_information_sources(user_query, {})
# Assert
assert ConversationCommand.Webpage in conversation_commands
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client):
# Arrange
user_query = "Summarize the wikipedia page on the history of the internet"
# Act
urls = await infer_webpage_urls(user_query, {}, None)
# Assert
assert "https://en.wikipedia.org/wiki/History_of_the_Internet" in urls
# Helpers # Helpers
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def populate_chat_history(message_list): def populate_chat_history(message_list):