Allow directly reading web pages, even when SERP not enabled (#676)

### Overview
Khoj can now read web pages directly, without needing to go through the search step first

### Details
- Parallelize simple webpage read and extractor
- Rename extract_content online results field to web pages
- Tweak prompts to extract information from webpages, online results
- Test the chat actors that select webpage as a data source and extract web URLs

- Render webpage read in chat response references on Web, Desktop apps
- Pass multiple webpages with their urls in online results context

- Support webpage command in chat API
- Add webpage chat command to read web pages requested by user
- Create chat actor for directly reading webpages based on user message
This commit is contained in:
Debanjum
2024-03-24 16:25:25 +05:30
committed by GitHub
11 changed files with 237 additions and 48 deletions

View File

@@ -7,7 +7,10 @@ import pytest
from scipy.stats import linregress
from khoj.processor.embeddings import EmbeddingsModel
from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
from khoj.processor.tools.online_search import (
read_webpage_at_url,
read_webpage_with_olostep,
)
from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
# Act
response = await read_webpage(website)
response = await read_webpage_at_url(website)
# Assert
assert (

View File

@@ -11,6 +11,7 @@ from khoj.routers.helpers import (
aget_relevant_information_sources,
aget_relevant_output_modes,
generate_online_subqueries,
infer_webpage_urls,
)
from khoj.utils.helpers import ConversationCommand
@@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client):
assert ConversationCommand.Online in conversation_commands
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client):
# Arrange
user_query = "Summarize the wikipedia page on the history of the internet"
# Act
conversation_commands = await aget_relevant_information_sources(user_query, {})
# Assert
assert ConversationCommand.Webpage in conversation_commands
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client):
# Arrange
user_query = "Summarize the wikipedia page on the history of the internet"
# Act
urls = await infer_webpage_urls(user_query, {}, None)
# Assert
assert "https://en.wikipedia.org/wiki/History_of_the_Internet" in urls
# Helpers
# ----------------------------------------------------------------------------------------------------
def populate_chat_history(message_list):