mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 21:29:12 +00:00
Allow directly reading web pages, even when SERP not enabled (#676)
### Overview
Khoj can now read webpages directly, without needing to go through the search step first.

### Details
- Parallelize simple webpage read and extractor
- Rename extract_content online results field to web pages
- Tweak prompts to extract information from webpages, online results
- Test the select webpage as data source and extract web URLs chat actors
- Render webpage read in chat response references on Web, Desktop apps
- Pass multiple webpages with their URLs in online results context
- Support webpage command in chat API
- Add webpage chat command to read web pages requested by the user
- Create chat actor for directly reading webpages based on user message
This commit is contained in:
@@ -7,7 +7,10 @@ import pytest
|
||||
from scipy.stats import linregress
|
||||
|
||||
from khoj.processor.embeddings import EmbeddingsModel
|
||||
from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
|
||||
from khoj.processor.tools.online_search import (
|
||||
read_webpage_at_url,
|
||||
read_webpage_with_olostep,
|
||||
)
|
||||
from khoj.utils import helpers
|
||||
|
||||
|
||||
@@ -90,7 +93,7 @@ async def test_reading_webpage():
|
||||
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
|
||||
|
||||
# Act
|
||||
response = await read_webpage(website)
|
||||
response = await read_webpage_at_url(website)
|
||||
|
||||
# Assert
|
||||
assert (
|
||||
|
||||
@@ -11,6 +11,7 @@ from khoj.routers.helpers import (
|
||||
aget_relevant_information_sources,
|
||||
aget_relevant_output_modes,
|
||||
generate_online_subqueries,
|
||||
infer_webpage_urls,
|
||||
)
|
||||
from khoj.utils.helpers import ConversationCommand
|
||||
|
||||
@@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client):
|
||||
assert ConversationCommand.Online in conversation_commands
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client):
    """Check that the data source selector includes the webpage source for a page-summary request."""
    # Arrange
    query = "Summarize the wikipedia page on the history of the internet"
    # NOTE(review): the second argument is presumably the conversation history; an empty dict is passed here.
    empty_history = {}

    # Act
    selected_sources = await aget_relevant_information_sources(query, empty_history)

    # Assert
    assert ConversationCommand.Webpage in selected_sources
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client):
    """Check that the URL inference actor returns the expected Wikipedia link for a page-summary request."""
    # Arrange
    query = "Summarize the wikipedia page on the history of the internet"
    expected_url = "https://en.wikipedia.org/wiki/History_of_the_Internet"

    # Act
    # NOTE(review): the empty dict and None are passed positionally as in the original call;
    # their meaning is defined by infer_webpage_urls, which is not visible here.
    inferred_urls = await infer_webpage_urls(query, {}, None)

    # Assert
    assert expected_url in inferred_urls
|
||||
|
||||
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def populate_chat_history(message_list):
|
||||
|
||||
Reference in New Issue
Block a user