Add Ability to Summarize Documents (#800)

* Uses entire file text and summarizer model to generate document summary. * Uses the contents of the user's query to create a tailored summary. * Integrates with File Filters #788 for a better UX.
2026-03-02 21:19:12 +00:00 · 2024-06-18 10:01:07 -04:00
parent 677d49d438
commit d4e5c95711
21 changed files with 791 additions and 85 deletions
--- a/tests/test_offline_chat_director.py
+++ b/tests/test_offline_chat_director.py
@@ -6,7 +6,7 @@ import pytest
 from faker import Faker
 from freezegun import freeze_time

-from khoj.database.models import Agent, KhojUser
+from khoj.database.models import Agent, Entry, KhojUser
 from khoj.processor.conversation import prompts
 from khoj.processor.conversation.utils import message_to_log
 from khoj.routers.helpers import aget_relevant_information_sources
@@ -305,6 +305,200 @@ def test_answer_not_known_using_notes_command(client_offline_chat, default_user2
    assert response_message == prompts.no_notes_found.format()


+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_one_file(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    # post "Xi Li.markdown" file to the file filters
+    file_list = (
+        Entry.objects.filter(user=default_user2, file_source="computer")
+        .distinct("file_path")
+        .values_list("file_path", flat=True)
+    )
+    # pick the file that has "Xi Li.markdown" in the name
+    summarization_file = ""
+    for file in file_list:
+        if "Birthday Gift for Xiu turning 4.markdown" in file:
+            summarization_file = file
+            break
+    assert summarization_file != ""
+
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters",
+        json={"filename": summarization_file, "conversation_id": str(conversation.id)},
+    )
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+    # Assert
+    assert response_message != ""
+    assert response_message != "No files selected for summarization. Please add files using the section on the left."
+    assert response_message != "Only one file can be selected for summarization."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_extra_text(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    # post "Xi Li.markdown" file to the file filters
+    file_list = (
+        Entry.objects.filter(user=default_user2, file_source="computer")
+        .distinct("file_path")
+        .values_list("file_path", flat=True)
+    )
+    # pick the file that has "Xi Li.markdown" in the name
+    summarization_file = ""
+    for file in file_list:
+        if "Birthday Gift for Xiu turning 4.markdown" in file:
+            summarization_file = file
+            break
+    assert summarization_file != ""
+
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters",
+        json={"filename": summarization_file, "conversation_id": str(conversation.id)},
+    )
+    query = urllib.parse.quote("/summarize tell me about Xiu")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+    # Assert
+    assert response_message != ""
+    assert response_message != "No files selected for summarization. Please add files using the section on the left."
+    assert response_message != "Only one file can be selected for summarization."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_multiple_files(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    # post "Xi Li.markdown" file to the file filters
+    file_list = (
+        Entry.objects.filter(user=default_user2, file_source="computer")
+        .distinct("file_path")
+        .values_list("file_path", flat=True)
+    )
+
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters", json={"filename": file_list[0], "conversation_id": str(conversation.id)}
+    )
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters", json={"filename": file_list[1], "conversation_id": str(conversation.id)}
+    )
+
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+
+    # Assert
+    assert response_message == "Only one file can be selected for summarization."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_no_files(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+    # Assert
+    assert response_message == "No files selected for summarization. Please add files using the section on the left."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_different_conversation(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation1 = create_conversation(message_list, default_user2)
+    conversation2 = create_conversation(message_list, default_user2)
+
+    file_list = (
+        Entry.objects.filter(user=default_user2, file_source="computer")
+        .distinct("file_path")
+        .values_list("file_path", flat=True)
+    )
+    summarization_file = ""
+    for file in file_list:
+        if "Birthday Gift for Xiu turning 4.markdown" in file:
+            summarization_file = file
+            break
+    assert summarization_file != ""
+
+    # add file filter to conversation 1.
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters",
+        json={"filename": summarization_file, "conversation_id": str(conversation1.id)},
+    )
+
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation2.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+
+    # Assert
+    assert response_message == "No files selected for summarization. Please add files using the section on the left."
+
+    # now make sure that the file filter is still in conversation 1
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation1.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+
+    # Assert
+    assert response_message != ""
+    assert response_message != "No files selected for summarization. Please add files using the section on the left."
+    assert response_message != "Only one file can be selected for summarization."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_nonexistant_file(client_offline_chat, default_user2: KhojUser):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    # post "imaginary.markdown" file to the file filters
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters",
+        json={"filename": "imaginary.markdown", "conversation_id": str(conversation.id)},
+    )
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+    # Assert
+    assert response_message == "No files selected for summarization. Please add files using the section on the left."
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.chatquality
+def test_summarize_diff_user_file(
+    client_offline_chat, default_user: KhojUser, pdf_configured_user1, default_user2: KhojUser
+):
+    message_list = []
+    conversation = create_conversation(message_list, default_user2)
+    # Get the pdf file called singlepage.pdf
+    file_list = (
+        Entry.objects.filter(user=default_user, file_source="computer")
+        .distinct("file_path")
+        .values_list("file_path", flat=True)
+    )
+    summarization_file = ""
+    for file in file_list:
+        if "singlepage.pdf" in file:
+            summarization_file = file
+            break
+    assert summarization_file != ""
+    # add singlepage.pdf to the file filters
+    response = client_offline_chat.post(
+        "api/chat/conversation/file-filters",
+        json={"filename": summarization_file, "conversation_id": str(conversation.id)},
+    )
+    query = urllib.parse.quote("/summarize")
+    response = client_offline_chat.get(f"/api/chat?q={query}&conversation_id={conversation.id}&stream=true")
+    response_message = response.content.decode("utf-8")
+    # Assert
+    assert response_message == "No files selected for summarization. Please add files using the section on the left."
+
+
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.chatquality
@pytest.mark.django_db(transaction=True)
@@ -522,7 +716,7 @@ async def test_get_correct_tools_online(client_offline_chat):
    user_query = "What's the weather in Patagonia this week?"

    # Act
-    tools = await aget_relevant_information_sources(user_query, {})
+    tools = await aget_relevant_information_sources(user_query, {}, is_task=False)

    # Assert
    tools = [tool.value for tool in tools]
@@ -537,7 +731,7 @@ async def test_get_correct_tools_notes(client_offline_chat):
    user_query = "Where did I go for my first battleship training?"

    # Act
-    tools = await aget_relevant_information_sources(user_query, {})
+    tools = await aget_relevant_information_sources(user_query, {}, is_task=False)

    # Assert
    tools = [tool.value for tool in tools]
@@ -552,7 +746,7 @@ async def test_get_correct_tools_online_or_general_and_notes(client_offline_chat
    user_query = "What's the highest point in Patagonia and have I been there?"

    # Act
-    tools = await aget_relevant_information_sources(user_query, {})
+    tools = await aget_relevant_information_sources(user_query, {}, is_task=False)

    # Assert
    tools = [tool.value for tool in tools]
@@ -569,7 +763,7 @@ async def test_get_correct_tools_general(client_offline_chat):
    user_query = "How many noble gases are there?"

    # Act
-    tools = await aget_relevant_information_sources(user_query, {})
+    tools = await aget_relevant_information_sources(user_query, {}, is_task=False)

    # Assert
    tools = [tool.value for tool in tools]
@@ -593,7 +787,7 @@ async def test_get_correct_tools_with_chat_history(client_offline_chat, default_
    chat_history = create_conversation(chat_log, default_user2)

    # Act
-    tools = await aget_relevant_information_sources(user_query, chat_history)
+    tools = await aget_relevant_information_sources(user_query, chat_history, is_task=False)

    # Assert
    tools = [tool.value for tool in tools]