mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Add Ability to Summarize Documents (#800)
* Uses entire file text and summarizer model to generate document summary. * Uses the contents of the user's query to create a tailored summary. * Integrates with File Filters #788 for a better UX.
This commit is contained in:
@@ -19,6 +19,7 @@ from khoj.database.models import (
|
||||
KhojUser,
|
||||
LocalMarkdownConfig,
|
||||
LocalOrgConfig,
|
||||
LocalPdfConfig,
|
||||
LocalPlaintextConfig,
|
||||
)
|
||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||
@@ -415,6 +416,18 @@ def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
|
||||
return LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def pdf_configured_user1(default_user: KhojUser):
|
||||
LocalPdfConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/pdf/singlepage.pdf"],
|
||||
user=default_user,
|
||||
)
|
||||
# Index Markdown Content for Search
|
||||
all_files = fs_syncer.collect_files(user=default_user)
|
||||
success = configure_content(all_files, user=default_user)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def sample_org_data():
|
||||
return get_sample_data("org")
|
||||
|
||||
@@ -51,7 +51,9 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
|
||||
tokenizer = None
|
||||
chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
|
||||
model_type = "offline"
|
||||
openai_config = factory.SubFactory(OpenAIProcessorConversationConfigFactory)
|
||||
openai_config = factory.LazyAttribute(
|
||||
lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None
|
||||
)
|
||||
|
||||
|
||||
class UserConversationProcessorConfigFactory(factory.django.DjangoModelFactory):
|
||||
|
||||
@@ -23,13 +23,14 @@ def test_extract_markdown_with_no_headings(tmp_path):
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||
assert not entries[0].raw.startswith("#")
|
||||
assert not entries[1][0].raw.startswith("#")
|
||||
# Ensure compiled entry has filename prepended as top level heading
|
||||
assert entries[0].compiled.startswith(expected_heading)
|
||||
assert entries[1][0].compiled.startswith(expected_heading)
|
||||
# Ensure compiled entry also includes the file name
|
||||
assert str(tmp_path) in entries[0].compiled
|
||||
assert str(tmp_path) in entries[1][0].compiled
|
||||
|
||||
|
||||
def test_extract_single_markdown_entry(tmp_path):
|
||||
@@ -48,7 +49,8 @@ def test_extract_single_markdown_entry(tmp_path):
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
|
||||
|
||||
def test_extract_multiple_markdown_entries(tmp_path):
|
||||
@@ -72,8 +74,9 @@ def test_extract_multiple_markdown_entries(tmp_path):
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 2
|
||||
# Ensure entry compiled strings include the markdown files they originate from
|
||||
assert all([tmp_path.stem in entry.compiled for entry in entries])
|
||||
assert all([tmp_path.stem in entry.compiled for entry in entries[1]])
|
||||
|
||||
|
||||
def test_extract_entries_with_different_level_headings(tmp_path):
|
||||
@@ -94,8 +97,9 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
assert entries[0].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory"
|
||||
assert entries[1].raw == "# Heading 2\n"
|
||||
assert len(entries[1]) == 2
|
||||
assert entries[1][0].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory"
|
||||
assert entries[1][1].raw == "# Heading 2\n"
|
||||
|
||||
|
||||
def test_extract_entries_with_non_incremental_heading_levels(tmp_path):
|
||||
@@ -116,10 +120,11 @@ def test_extract_entries_with_non_incremental_heading_levels(tmp_path):
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 3
|
||||
assert entries[0].raw == "# Heading 1\n#### Sub-Heading 1.1", "Ensure entry includes heading ancestory"
|
||||
assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.2", "Ensure entry includes heading ancestory"
|
||||
assert entries[2].raw == "# Heading 2\n"
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 3
|
||||
assert entries[1][0].raw == "# Heading 1\n#### Sub-Heading 1.1", "Ensure entry includes heading ancestory"
|
||||
assert entries[1][1].raw == "# Heading 1\n## Sub-Heading 1.2", "Ensure entry includes heading ancestory"
|
||||
assert entries[1][2].raw == "# Heading 2\n"
|
||||
|
||||
|
||||
def test_extract_entries_with_text_before_headings(tmp_path):
|
||||
@@ -141,10 +146,13 @@ body line 2
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 3
|
||||
assert entries[0].raw == "\nText before headings"
|
||||
assert entries[1].raw == "# Heading 1\nbody line 1"
|
||||
assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2\n", "Ensure raw entry includes heading ancestory"
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 3
|
||||
assert entries[1][0].raw == "\nText before headings"
|
||||
assert entries[1][1].raw == "# Heading 1\nbody line 1"
|
||||
assert (
|
||||
entries[1][2].raw == "# Heading 1\n## Heading 2\nbody line 2\n"
|
||||
), "Ensure raw entry includes heading ancestory"
|
||||
|
||||
|
||||
def test_parse_markdown_file_into_single_entry_if_small(tmp_path):
|
||||
@@ -165,8 +173,9 @@ body line 1.1
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].raw == entry
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
assert entries[1][0].raw == entry
|
||||
|
||||
|
||||
def test_parse_markdown_entry_with_children_as_single_entry_if_small(tmp_path):
|
||||
@@ -191,13 +200,14 @@ longer body line 2.1
|
||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 3
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 3
|
||||
assert (
|
||||
entries[0].raw == "# Heading 1\nbody line 1\n## Subheading 1.1\nbody line 1.1"
|
||||
entries[1][0].raw == "# Heading 1\nbody line 1\n## Subheading 1.1\nbody line 1.1"
|
||||
), "First entry includes children headings"
|
||||
assert entries[1].raw == "# Heading 2\nbody line 2", "Second entry does not include children headings"
|
||||
assert entries[1][1].raw == "# Heading 2\nbody line 2", "Second entry does not include children headings"
|
||||
assert (
|
||||
entries[2].raw == "# Heading 2\n## Subheading 2.1\nlonger body line 2.1\n"
|
||||
entries[1][2].raw == "# Heading 2\n## Subheading 2.1\nlonger body line 2.1\n"
|
||||
), "Third entry is second entries child heading"
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import pytest
|
||||
from faker import Faker
|
||||
from freezegun import freeze_time
|
||||
|
||||
from khoj.database.models import Agent, KhojUser
|
||||
from khoj.database.models import Agent, Entry, KhojUser
|
||||
from khoj.processor.conversation import prompts
|
||||
from khoj.processor.conversation.utils import message_to_log
|
||||
from khoj.routers.helpers import aget_relevant_information_sources
|
||||
@@ -305,6 +305,200 @@ def test_answer_not_known_using_notes_command(client_offline_chat, default_user2
|
||||
assert response_message == prompts.no_notes_found.format()
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_one_file(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
# pick the file that has "Xi Li.markdown" in the name
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_extra_text(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
# pick the file that has "Xi Li.markdown" in the name
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize tell me about Xiu")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_multiple_files(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters", json={"filename": file_list[0], "conversation_id": str(conversation.id)}
|
||||
)
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters", json={"filename": file_list[1], "conversation_id": str(conversation.id)}
|
||||
)
|
||||
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message == "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_no_files(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_different_conversation(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation1 = create_conversation(message_list, default_user2)
|
||||
conversation2 = create_conversation(message_list, default_user2)
|
||||
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
# add file filter to conversation 1.
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation1.id)},
|
||||
)
|
||||
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation2.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
# now make sure that the file filter is still in conversation 1
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation1.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_nonexistant_file(client_offline_chat, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "imaginary.markdown" file to the file filters
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": "imaginary.markdown", "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_diff_user_file(
|
||||
client_offline_chat, default_user: KhojUser, pdf_configured_user1, default_user2: KhojUser
|
||||
):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# Get the pdf file called singlepage.pdf
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "singlepage.pdf" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
# add singlepage.pdf to the file filters
|
||||
response = client_offline_chat.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@@ -522,7 +716,7 @@ async def test_get_correct_tools_online(client_offline_chat):
|
||||
user_query = "What's the weather in Patagonia this week?"
|
||||
|
||||
# Act
|
||||
tools = await aget_relevant_information_sources(user_query, {})
|
||||
tools = await aget_relevant_information_sources(user_query, {}, is_task=False)
|
||||
|
||||
# Assert
|
||||
tools = [tool.value for tool in tools]
|
||||
@@ -537,7 +731,7 @@ async def test_get_correct_tools_notes(client_offline_chat):
|
||||
user_query = "Where did I go for my first battleship training?"
|
||||
|
||||
# Act
|
||||
tools = await aget_relevant_information_sources(user_query, {})
|
||||
tools = await aget_relevant_information_sources(user_query, {}, is_task=False)
|
||||
|
||||
# Assert
|
||||
tools = [tool.value for tool in tools]
|
||||
@@ -552,7 +746,7 @@ async def test_get_correct_tools_online_or_general_and_notes(client_offline_chat
|
||||
user_query = "What's the highest point in Patagonia and have I been there?"
|
||||
|
||||
# Act
|
||||
tools = await aget_relevant_information_sources(user_query, {})
|
||||
tools = await aget_relevant_information_sources(user_query, {}, is_task=False)
|
||||
|
||||
# Assert
|
||||
tools = [tool.value for tool in tools]
|
||||
@@ -569,7 +763,7 @@ async def test_get_correct_tools_general(client_offline_chat):
|
||||
user_query = "How many noble gases are there?"
|
||||
|
||||
# Act
|
||||
tools = await aget_relevant_information_sources(user_query, {})
|
||||
tools = await aget_relevant_information_sources(user_query, {}, is_task=False)
|
||||
|
||||
# Assert
|
||||
tools = [tool.value for tool in tools]
|
||||
@@ -593,7 +787,7 @@ async def test_get_correct_tools_with_chat_history(client_offline_chat, default_
|
||||
chat_history = create_conversation(chat_log, default_user2)
|
||||
|
||||
# Act
|
||||
tools = await aget_relevant_information_sources(user_query, chat_history)
|
||||
tools = await aget_relevant_information_sources(user_query, chat_history, is_task=False)
|
||||
|
||||
# Assert
|
||||
tools = [tool.value for tool in tools]
|
||||
|
||||
@@ -5,7 +5,7 @@ from urllib.parse import quote
|
||||
import pytest
|
||||
from freezegun import freeze_time
|
||||
|
||||
from khoj.database.models import Agent, KhojUser
|
||||
from khoj.database.models import Agent, Entry, KhojUser, LocalPdfConfig
|
||||
from khoj.processor.conversation import prompts
|
||||
from khoj.processor.conversation.utils import message_to_log
|
||||
from khoj.routers.helpers import aget_relevant_information_sources
|
||||
@@ -289,6 +289,198 @@ def test_answer_not_known_using_notes_command(chat_client_no_background, default
|
||||
assert response_message == prompts.no_entries_found.format()
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_one_file(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
# pick the file that has "Xi Li.markdown" in the name
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_extra_text(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
# pick the file that has "Xi Li.markdown" in the name
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize tell me about Xiu")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_multiple_files(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "Xi Li.markdown" file to the file filters
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters", json={"filename": file_list[0], "conversation_id": str(conversation.id)}
|
||||
)
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters", json={"filename": file_list[1], "conversation_id": str(conversation.id)}
|
||||
)
|
||||
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message == "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_no_files(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_different_conversation(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation1 = create_conversation(message_list, default_user2)
|
||||
conversation2 = create_conversation(message_list, default_user2)
|
||||
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user2, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "Birthday Gift for Xiu turning 4.markdown" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
|
||||
# add file filter to conversation 1.
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation1.id)},
|
||||
)
|
||||
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation2.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
# now make sure that the file filter is still in conversation 1
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation1.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
# Assert
|
||||
assert response_message != ""
|
||||
assert response_message != "No files selected for summarization. Please add files using the section on the left."
|
||||
assert response_message != "Only one file can be selected for summarization."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_nonexistant_file(chat_client, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# post "imaginary.markdown" file to the file filters
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": "imaginary.markdown", "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
def test_summarize_diff_user_file(chat_client, default_user: KhojUser, pdf_configured_user1, default_user2: KhojUser):
|
||||
message_list = []
|
||||
conversation = create_conversation(message_list, default_user2)
|
||||
# Get the pdf file called singlepage.pdf
|
||||
file_list = (
|
||||
Entry.objects.filter(user=default_user, file_source="computer")
|
||||
.distinct("file_path")
|
||||
.values_list("file_path", flat=True)
|
||||
)
|
||||
summarization_file = ""
|
||||
for file in file_list:
|
||||
if "singlepage.pdf" in file:
|
||||
summarization_file = file
|
||||
break
|
||||
assert summarization_file != ""
|
||||
# add singlepage.pdf to the file filters
|
||||
response = chat_client.post(
|
||||
"api/chat/conversation/file-filters",
|
||||
json={"filename": summarization_file, "conversation_id": str(conversation.id)},
|
||||
)
|
||||
query = urllib.parse.quote("/summarize")
|
||||
response = chat_client.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
# Assert
|
||||
assert response_message == "No files selected for summarization. Please add files using the section on the left."
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering time aware questions yet")
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
|
||||
@@ -33,10 +33,12 @@ def test_configure_indexing_heading_only_entries(tmp_path):
|
||||
# Assert
|
||||
if index_heading_entries:
|
||||
# Entry with empty body indexed when index_heading_entries set to True
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
else:
|
||||
# Entry with empty body ignored when index_heading_entries set to False
|
||||
assert is_none_or_empty(entries)
|
||||
assert len(entries) == 2
|
||||
assert is_none_or_empty(entries[1])
|
||||
|
||||
|
||||
def test_entry_split_when_exceeds_max_tokens():
|
||||
@@ -55,9 +57,9 @@ def test_entry_split_when_exceeds_max_tokens():
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
assert len(entries) == 2
|
||||
# Split each entry from specified Org files by max tokens
|
||||
entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=5)
|
||||
entries = TextToEntries.split_entries_by_max_tokens(entries[1], max_tokens=5)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
@@ -114,11 +116,12 @@ body line 1.1
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=12)
|
||||
for entry in extracted_entries:
|
||||
assert len(extracted_entries) == 2
|
||||
for entry in extracted_entries[1]:
|
||||
entry.raw = clean(entry.raw)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_entries) == 1
|
||||
assert len(extracted_entries[1]) == 1
|
||||
assert entry.raw == expected_entry
|
||||
|
||||
|
||||
@@ -165,10 +168,11 @@ longer body line 2.1
|
||||
extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=12)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_entries) == 3
|
||||
assert extracted_entries[0].compiled == first_expected_entry, "First entry includes children headings"
|
||||
assert extracted_entries[1].compiled == second_expected_entry, "Second entry does not include children headings"
|
||||
assert extracted_entries[2].compiled == third_expected_entry, "Third entry is second entries child heading"
|
||||
assert len(extracted_entries) == 2
|
||||
assert len(extracted_entries[1]) == 3
|
||||
assert extracted_entries[1][0].compiled == first_expected_entry, "First entry includes children headings"
|
||||
assert extracted_entries[1][1].compiled == second_expected_entry, "Second entry does not include children headings"
|
||||
assert extracted_entries[1][2].compiled == third_expected_entry, "Third entry is second entries child heading"
|
||||
|
||||
|
||||
def test_separate_sibling_org_entries_if_all_cannot_fit_in_token_limit(tmp_path):
|
||||
@@ -226,10 +230,11 @@ body line 3.1
|
||||
extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=30)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_entries) == 3
|
||||
assert extracted_entries[0].compiled == first_expected_entry, "First entry includes children headings"
|
||||
assert extracted_entries[1].compiled == second_expected_entry, "Second entry includes children headings"
|
||||
assert extracted_entries[2].compiled == third_expected_entry, "Third entry includes children headings"
|
||||
assert len(extracted_entries) == 2
|
||||
assert len(extracted_entries[1]) == 3
|
||||
assert extracted_entries[1][0].compiled == first_expected_entry, "First entry includes children headings"
|
||||
assert extracted_entries[1][1].compiled == second_expected_entry, "Second entry includes children headings"
|
||||
assert extracted_entries[1][2].compiled == third_expected_entry, "Third entry includes children headings"
|
||||
|
||||
|
||||
def test_entry_with_body_to_entry(tmp_path):
|
||||
@@ -251,7 +256,8 @@ def test_entry_with_body_to_entry(tmp_path):
|
||||
entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
|
||||
|
||||
def test_file_with_entry_after_intro_text_to_entry(tmp_path):
|
||||
@@ -273,6 +279,7 @@ Intro text
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 2
|
||||
|
||||
|
||||
def test_file_with_no_headings_to_entry(tmp_path):
|
||||
@@ -291,7 +298,8 @@ def test_file_with_no_headings_to_entry(tmp_path):
|
||||
entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=3)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
|
||||
|
||||
def test_get_org_files(tmp_path):
|
||||
@@ -349,13 +357,14 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True, max_tokens=3)
|
||||
for entry in entries:
|
||||
assert len(entries) == 2
|
||||
for entry in entries[1]:
|
||||
entry.raw = clean(f"{entry.raw}")
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
assert entries[0].raw == "* Heading 1\n** Sub-Heading 1.1\n", "Ensure entry includes heading ancestory"
|
||||
assert entries[1].raw == "* Heading 2\n"
|
||||
assert len(entries[1]) == 2
|
||||
assert entries[1][0].raw == "* Heading 1\n** Sub-Heading 1.1\n", "Ensure entry includes heading ancestory"
|
||||
assert entries[1][1].raw == "* Heading 2\n"
|
||||
|
||||
|
||||
# Helper Functions
|
||||
|
||||
@@ -17,7 +17,8 @@ def test_single_page_pdf_to_jsonl():
|
||||
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
|
||||
|
||||
def test_multi_page_pdf_to_jsonl():
|
||||
@@ -31,7 +32,8 @@ def test_multi_page_pdf_to_jsonl():
|
||||
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 6
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 6
|
||||
|
||||
|
||||
def test_ocr_page_pdf_to_jsonl():
|
||||
@@ -43,9 +45,9 @@ def test_ocr_page_pdf_to_jsonl():
|
||||
|
||||
data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
|
||||
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
assert len(entries) == 1
|
||||
assert "playing on a strip of marsh" in entries[0].raw
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
assert "playing on a strip of marsh" in entries[1][0].raw
|
||||
|
||||
|
||||
def test_get_pdf_files(tmp_path):
|
||||
|
||||
@@ -25,15 +25,16 @@ def test_plaintext_file(tmp_path):
|
||||
entries = PlaintextToEntries.extract_plaintext_entries(data)
|
||||
|
||||
# Convert each entry.file to absolute path to make them JSON serializable
|
||||
for entry in entries:
|
||||
for entry in entries[1]:
|
||||
entry.file = str(Path(entry.file).absolute())
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||
assert not entries[0].raw.startswith("#")
|
||||
assert not entries[1][0].raw.startswith("#")
|
||||
# Ensure compiled entry has filename prepended as top level heading
|
||||
assert entries[0].compiled == f"{plaintextfile}\n{raw_entry}"
|
||||
assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
|
||||
|
||||
|
||||
def test_get_plaintext_files(tmp_path):
|
||||
@@ -94,8 +95,9 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
|
||||
entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert "<div>" not in entries[0].raw
|
||||
assert len(entries) == 2
|
||||
assert len(entries[1]) == 1
|
||||
assert "<div>" not in entries[1][0].raw
|
||||
|
||||
|
||||
def test_large_plaintext_file_split_into_multiple_entries(tmp_path):
|
||||
@@ -113,13 +115,20 @@ def test_large_plaintext_file_split_into_multiple_entries(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified plaintext files
|
||||
normal_entries = PlaintextToEntries.extract_plaintext_entries(normal_data)
|
||||
large_entries = PlaintextToEntries.extract_plaintext_entries(large_data)
|
||||
|
||||
# assert
|
||||
assert len(normal_entries) == 2
|
||||
assert len(large_entries) == 2
|
||||
|
||||
normal_entries = PlaintextToEntries.split_entries_by_max_tokens(
|
||||
PlaintextToEntries.extract_plaintext_entries(normal_data),
|
||||
normal_entries[1],
|
||||
max_tokens=max_tokens,
|
||||
raw_is_compiled=True,
|
||||
)
|
||||
large_entries = PlaintextToEntries.split_entries_by_max_tokens(
|
||||
PlaintextToEntries.extract_plaintext_entries(large_data), max_tokens=max_tokens, raw_is_compiled=True
|
||||
large_entries[1], max_tokens=max_tokens, raw_is_compiled=True
|
||||
)
|
||||
|
||||
# Assert
|
||||
|
||||
Reference in New Issue
Block a user