Merge branch 'master' into support-incremental-updates-of-embeddings

2026-03-06 05:39:12 +00:00 · 2022-09-10 22:11:43 +03:00
parent 030fab9bb2 ed8d432fdd
commit ebd5039bd1
20 changed files with 225 additions and 70 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@ import pytest
 # Internal Packages
 from src.search_type import image_search, text_search
 from src.utils.config import SearchType
+from src.utils.helpers import resolve_absolute_path
 from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.search_filter.date_filter import DateFilter
@@ -12,41 +13,41 @@ from src.search_filter.file_filter import FileFilter


@pytest.fixture(scope='session')
-def search_config(tmp_path_factory) -> SearchConfig:
-    model_dir = tmp_path_factory.mktemp('data')
-
+def search_config() -> SearchConfig:
+    model_dir = resolve_absolute_path('~/.khoj/search')
+    model_dir.mkdir(parents=True, exist_ok=True)
    search_config = SearchConfig()

    search_config.symmetric = TextSearchConfig(
        encoder = "sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'symmetric/'
    )

    search_config.asymmetric = TextSearchConfig(
        encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'asymmetric/'
    )

    search_config.image = ImageSearchConfig(
        encoder = "sentence-transformers/clip-ViT-B-32",
-        model_directory = model_dir
+        model_directory = model_dir / 'image/'
    )

    return search_config


@pytest.fixture(scope='session')
-def model_dir(search_config: SearchConfig):
-    model_dir = search_config.asymmetric.model_directory
+def content_config(tmp_path_factory, search_config: SearchConfig):
+    content_dir = tmp_path_factory.mktemp('content')

    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    content_config.image = ImageContentConfig(
        input_directories = ['tests/data/images'],
-        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
-        batch_size = 10,
+        embeddings_file = content_dir.joinpath('image_embeddings.pt'),
+        batch_size = 1,
        use_xmp_metadata = False)

    image_search.setup(content_config.image, search_config.image, regenerate=False)
@@ -55,28 +56,10 @@ def model_dir(search_config: SearchConfig):
    content_config.org = TextContentConfig(
        input_files = None,
        input_filter = 'tests/data/org/*.org',
-        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
+        compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'),
+        embeddings_file = content_dir.joinpath('note_embeddings.pt'))

    filters = [DateFilter(), WordFilter(), FileFilter()]
    text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)

-    return model_dir
-
-
-@pytest.fixture(scope='session')
-def content_config(model_dir) -> ContentConfig:
-    content_config = ContentConfig()
-    content_config.org = TextContentConfig(
-        input_files = None,
-        input_filter = 'tests/data/org/*.org',
-        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
-
-    content_config.image = ImageContentConfig(
-        input_directories = ['tests/data/images'],
-        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
-        batch_size = 1,
-        use_xmp_metadata = False)
-
-    return content_config
+    return content_config
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,9 +2,6 @@
 from pathlib import Path
 from random import random

-# External Modules
-import pytest
-
 # Internal Packages
 from src.utils.cli import cli
 from src.utils.helpers import resolve_absolute_path
--- a/tests/test_image_search.py
+++ b/tests/test_image_search.py
@@ -48,8 +48,13 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
            image_files_url='/static/images',
            count=1)

-        actual_image = Image.open(output_directory.joinpath(Path(results[0]["entry"]).name))
+        actual_image_path = output_directory.joinpath(Path(results[0]["entry"]).name)
+        actual_image = Image.open(actual_image_path)
        expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))

        # Assert
        assert expected_image == actual_image
+
+    # Cleanup
+    # Delete the image files copied to results directory
+    actual_image_path.unlink()
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -1,6 +1,5 @@
 # Standard Packages
 import json
-from posixpath import split

 # Internal Packages
 from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries
@@ -15,7 +14,7 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
    :PROPERTIES:
    :ID:       42-42-42
    :END:
-    \t\r\n 
+    \t\r 
    '''
    orgfile = create_file(tmp_path, entry)

@@ -38,7 +37,29 @@ def test_entry_with_body_to_jsonl(tmp_path):
    :PROPERTIES:
    :ID:       42-42-42
    :END:
-    \t\r\nBody Line 1\n
+    \t\r
+    Body Line 1
+    '''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Org files
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+
+
+def test_file_with_no_headings_to_jsonl(tmp_path):
+    "Ensure files with no heading, only body text are loaded."
+    # Arrange
+    entry = f'''
+    - Bullet point 1
+    - Bullet point 2
    '''
    orgfile = create_file(tmp_path, entry)

--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@@ -8,6 +8,28 @@ from src.processor.org_mode import orgnode


 # Test
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_no_headings(tmp_path):
+    "Test parsing of file with body text but no headings; file path is used as heading"
+    # Arrange
+    entry = f'''Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == f'{orgfile}'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_parse_minimal_entry(tmp_path):
    "Test parsing of entry with minimal fields"
@@ -81,18 +103,17 @@ Body Line 1
 Body Line 2
 '''
    orgfile = create_file(tmp_path, entry)
-    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'

    # Act
    entries = orgnode.makelist(orgfile)

    # Assert
    # SOURCE link rendered with Heading
-    assert f':SOURCE: [[file:{normalized_orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
+    assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
    # ID link rendered with ID
    assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}'
    # LINE link rendered with line number
-    assert f':LINE: file:{normalized_orgfile}::2' in f'{entries[0]}'
+    assert f':LINE: file:{orgfile}::2' in f'{entries[0]}'


 # ----------------------------------------------------------------------------------------------------
@@ -115,8 +136,7 @@ Body Line 1'''
    # parsed heading from entry
    assert entries[0].Heading() == "Heading[1]"
    # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
-    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'
-    escaped_orgfile = f'{normalized_orgfile}'.replace("[1]", "\\[1\\]")
+    escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]")
    assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}'


@@ -168,10 +188,80 @@ Body 2
        assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]


+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_empty_title(tmp_path):
+    "Test parsing of file with an empty #+TITLE and no headings; file path is used as heading"
+    # Arrange
+    entry = f'''#+TITLE: 
+Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == f'{orgfile}'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_title_and_no_headings(tmp_path):
+    "Test parsing of file with a #+TITLE and no headings; title is used as heading"
+    # Arrange
+    entry = f'''#+TITLE: test
+Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == 'test'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
+    "Test parsing of file with multiple #+TITLE lines and no headings; titles are joined into the heading"
+    # Arrange
+    entry = f'''#+TITLE: title1 
+Body Line 1
+#+TITLE:  title2  '''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == 'title1 title2'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1\n"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
 # Helper Functions
 def create_file(tmp_path, entry, filename="test.org"):
    org_file = tmp_path / f"notes/{filename}"
    org_file.parent.mkdir()
    org_file.touch()
    org_file.write_text(entry)
-    return org_file
+    return org_file
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -1,6 +1,10 @@
 # System Packages
+from copy import deepcopy
 from pathlib import Path

+# External Packages
+import pytest
+
 # Internal Packages
 from src.utils.state import model
 from src.search_type import text_search
@@ -9,6 +13,25 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl


 # Test
+# ----------------------------------------------------------------------------------------------------
+def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig):
+    # Arrange: create an empty org file next to the configured input filter and index only that file
+    file_to_index = Path(content_config.org.input_filter).parent / "new_file_to_index.org"
+    file_to_index.touch()
+    new_org_content_config = deepcopy(content_config.org)
+    new_org_content_config.input_files = [f'{file_to_index}']
+    new_org_content_config.input_filter = None
+
+    # Act
+    # Generate notes embeddings during asymmetric setup; the empty file is expected to raise ValueError
+    with pytest.raises(ValueError, match=r'^No valid entries found*'):  # NOTE(review): '*' repeats only the final 'd'
+        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+
+    # Cleanup
+    # delete created test file; NOTE(review): not reached if the assertion above fails, so the file may be left behind
+    file_to_index.unlink()
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
    # Act
@@ -23,7 +46,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange
-    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
    query = "How to git install application?"

    # Act