Move to a push-first model for retrieving embeddings from local files (#457)
* Initial version: set up a file-push architecture for generating embeddings with Khoj
* Update unit tests to work with the new application design
* Allow the configure server to be called without regenerating the index; regenerating at configure time no longer works because the file-indexing API is not up in time for the server to send itself a request
* Use state.host and state.port to configure the URL for the indexer
* On application startup, load embeddings from configuration files rather than regenerating the corpus from the file system
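The tests in this diff exercise the new push-based indexing endpoint: clients POST a JSON map of file type to {file path: file content} at `/indexer/batch`, authenticated with an `x-api-key` header. A minimal client-side sketch of that flow follows; the endpoint path, header, and body shape are taken from the tests below, while the server URL and API key values are assumed placeholders, not part of this commit:

```python
# Hypothetical client sketch; not part of this commit.
import requests  # assumes the requests package is installed

KHOJ_URL = "http://localhost:42110"  # assumed server address, built from state.host and state.port
API_KEY = "secret"  # placeholder key; the tests send this value via the x-api-key header


def push_files_for_indexing(org_files: dict) -> None:
    """Push a map of org file path -> file content to the indexing endpoint."""
    request_body = {"org": org_files}
    response = requests.post(
        f"{KHOJ_URL}/indexer/batch",
        json=request_body,
        headers={"x-api-key": API_KEY},
    )
    response.raise_for_status()


push_files_for_indexing({"path/to/filename.org": "* practicing piano"})
```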
tests/conftest.py
@@ -9,6 +9,7 @@ import pytest
 from khoj.main import app
 from khoj.configure import configure_processor, configure_routes, configure_search_types
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils.config import SearchModels
 from khoj.utils.helpers import resolve_absolute_path
@@ -97,7 +98,12 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:

     filters = [DateFilter(), WordFilter(), FileFilter()]
     text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
+        OrgToJsonl,
+        get_sample_data("org"),
+        content_config.org,
+        search_models.text_search.bi_encoder,
+        regenerate=False,
+        filters=filters,
     )

     content_config.plugins = {
@@ -109,6 +115,20 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
         )
     }

+    if os.getenv("GITHUB_PAT_TOKEN"):
+        content_config.github = GithubContentConfig(
+            pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
+            repos=[
+                GithubRepoConfig(
+                    owner="khoj-ai",
+                    name="lantern",
+                    branch="master",
+                )
+            ],
+            compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
+            embeddings_file=content_dir.joinpath("github_embeddings.pt"),
+        )
+
     content_config.plaintext = TextContentConfig(
         input_files=None,
         input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
@@ -132,6 +152,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
     filters = [DateFilter(), WordFilter(), FileFilter()]
     text_search.setup(
         JsonlToJsonl,
+        None,
         content_config.plugins["plugin1"],
         search_models.text_search.bi_encoder,
         regenerate=False,
@@ -203,6 +224,7 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
     state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     state.content_index.markdown = text_search.setup(
         MarkdownToJsonl,
+        get_sample_data("markdown"),
         md_content_config.markdown,
         state.search_models.text_search.bi_encoder,
         regenerate=False,
@@ -226,11 +248,22 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
     state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     state.search_models.image_search = image_search.initialize_model(search_config.image)
     state.content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl,
+        get_sample_data("org"),
+        content_config.org,
+        state.search_models.text_search.bi_encoder,
+        regenerate=False,
     )
     state.content_index.image = image_search.setup(
         content_config.image, state.search_models.image_search, regenerate=False
     )
+    state.content_index.plaintext = text_search.setup(
+        PlaintextToJsonl,
+        get_sample_data("plaintext"),
+        content_config.plaintext,
+        state.search_models.text_search.bi_encoder,
+        regenerate=False,
+    )

     state.processor_config = configure_processor(processor_config)

@@ -250,8 +283,21 @@ def client_offline_chat(
     # Index Markdown Content for Search
     filters = [DateFilter(), WordFilter(), FileFilter()]
     state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
+    state.search_models.image_search = image_search.initialize_model(search_config.image)
+    state.content_index.org = text_search.setup(
+        OrgToJsonl,
+        get_sample_data("org"),
+        content_config.org,
+        state.search_models.text_search.bi_encoder,
+        regenerate=False,
+    )
+    state.content_index.image = image_search.setup(
+        content_config.image, state.search_models.image_search, regenerate=False
+    )
+
     state.content_index.markdown = text_search.setup(
         MarkdownToJsonl,
+        get_sample_data("markdown"),
         md_content_config.markdown,
         state.search_models.text_search.bi_encoder,
         regenerate=False,
@@ -284,3 +330,69 @@ def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: P
     new_org_config.input_files = [f"{new_org_file}"]
     new_org_config.input_filter = None
     return new_org_config
+
+
+@pytest.fixture(scope="function")
+def sample_org_data():
+    return get_sample_data("org")
+
+
+def get_sample_data(type):
+    sample_data = {
+        "org": {
+            "readme.org": """
+* Khoj
+/Allow natural language search on user content like notes, images using transformer based models/
+
+All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
+
+** Dependencies
+- Python3
+- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
+
+** Install
+#+begin_src shell
+git clone https://github.com/khoj-ai/khoj && cd khoj
+conda env create -f environment.yml
+conda activate khoj
+#+end_src"""
+        },
+        "markdown": {
+            "readme.markdown": """
+# Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
+
+## Dependencies
+- Python3
+- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
+
+## Install
+```shell
+git clone
+conda env create -f environment.yml
+conda activate khoj
+```
+"""
+        },
+        "plaintext": {
+            "readme.txt": """
+Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
+
+Dependencies
+- Python3
+- Miniconda
+
+Install
+git clone
+conda env create -f environment.yml
+conda activate khoj
+"""
+        },
+    }
+
+    return sample_data[type]
tests/test_client.py
@@ -11,7 +11,6 @@ from fastapi.testclient import TestClient
 from khoj.main import app
 from khoj.configure import configure_routes, configure_search_types
 from khoj.utils import state
-from khoj.utils.config import SearchModels
 from khoj.utils.state import search_models, content_index, config
 from khoj.search_type import text_search, image_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
@@ -51,28 +50,6 @@ def test_update_with_invalid_content_type(client):
     assert response.status_code == 422


-# ----------------------------------------------------------------------------------------------------
-def test_update_with_valid_content_type(client):
-    for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
-        # Act
-        response = client.get(f"/api/update?t={content_type}")
-        # Assert
-        assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
-
-
-# ----------------------------------------------------------------------------------------------------
-def test_update_with_github_fails_without_pat(client):
-    # Act
-    response = client.get(f"/api/update?t=github")
-
-    # Assert
-    assert response.status_code == 500, f"Returned status: {response.status_code} for content type: github"
-    assert (
-        response.json()["detail"]
-        == "🚨 Failed to update server via API: Github PAT token is not set. Skipping github content"
-    )
-
-
 # ----------------------------------------------------------------------------------------------------
 def test_regenerate_with_invalid_content_type(client):
     # Act
@@ -82,11 +59,29 @@ def test_regenerate_with_invalid_content_type(client):
     assert response.status_code == 422


+# ----------------------------------------------------------------------------------------------------
+def test_index_batch(client):
+    # Arrange
+    request_body = get_sample_files_data()
+    headers = {"x-api-key": "secret"}
+
+    # Act
+    response = client.post("/indexer/batch", json=request_body, headers=headers)
+
+    # Assert
+    assert response.status_code == 200
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_regenerate_with_valid_content_type(client):
     for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
+        # Arrange
+        request_body = get_sample_files_data()
+
+        headers = {"x-api-key": "secret"}
+
         # Act
-        response = client.get(f"/api/update?force=true&t={content_type}")
+        response = client.post(f"/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
         # Assert
         assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"

@@ -96,12 +91,15 @@ def test_regenerate_with_github_fails_without_pat(client):
-    # Act
-    response = client.get(f"/api/update?force=true&t=github")
-
+    # Arrange
+    request_body = get_sample_files_data()
+
+    headers = {"x-api-key": "secret"}
+
+    # Act
+    response = client.post(f"/indexer/batch?search_type=github", json=request_body, headers=headers)
     # Assert
-    assert response.status_code == 500, f"Returned status: {response.status_code} for content type: github"
-    assert (
-        response.json()["detail"]
-        == "🚨 Failed to update server via API: Github PAT token is not set. Skipping github content"
-    )
+    assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"


 # ----------------------------------------------------------------------------------------------------
@@ -111,7 +109,7 @@ def test_get_configured_types_via_api(client):

     # Assert
     assert response.status_code == 200
-    assert response.json() == ["all", "org", "image", "plugin1"]
+    assert response.json() == ["all", "org", "image", "plaintext", "plugin1"]


 # ----------------------------------------------------------------------------------------------------
@@ -194,11 +192,11 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear


 # ----------------------------------------------------------------------------------------------------
-def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig):
+def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data):
     # Arrange
     search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl, sample_org_data, content_config.org, search_models.text_search.bi_encoder, regenerate=False
     )
     user_query = quote("How to git install application?")

@@ -213,12 +211,19 @@ def test_notes_search(client, content_config: ContentConfig, search_config: Sear


 # ----------------------------------------------------------------------------------------------------
-def test_notes_search_with_only_filters(client, content_config: ContentConfig, search_config: SearchConfig):
+def test_notes_search_with_only_filters(
+    client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
+):
     # Arrange
     filters = [WordFilter(), FileFilter()]
     search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
+        OrgToJsonl,
+        sample_org_data,
+        content_config.org,
+        search_models.text_search.bi_encoder,
+        regenerate=False,
+        filters=filters,
     )
     user_query = quote('+"Emacs" file:"*.org"')

@@ -233,12 +238,14 @@ def test_notes_search_with_only_filters(client, content_config: ContentConfig, s


 # ----------------------------------------------------------------------------------------------------
-def test_notes_search_with_include_filter(client, content_config: ContentConfig, search_config: SearchConfig):
+def test_notes_search_with_include_filter(
+    client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
+):
     # Arrange
     filters = [WordFilter()]
     search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search, regenerate=False, filters=filters
+        OrgToJsonl, sample_org_data, content_config.org, search_models.text_search, regenerate=False, filters=filters
     )
     user_query = quote('How to git install application? +"Emacs"')

@@ -253,12 +260,19 @@ def test_notes_search_with_include_filter(client, content_config: ContentConfig,


 # ----------------------------------------------------------------------------------------------------
-def test_notes_search_with_exclude_filter(client, content_config: ContentConfig, search_config: SearchConfig):
+def test_notes_search_with_exclude_filter(
+    client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
+):
     # Arrange
     filters = [WordFilter()]
     search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
+        OrgToJsonl,
+        sample_org_data,
+        content_config.org,
+        search_models.text_search.bi_encoder,
+        regenerate=False,
+        filters=filters,
     )
     user_query = quote('How to git install application? -"clone"')

@@ -270,3 +284,28 @@ def test_notes_search_with_exclude_filter(client, content_config: ContentConfig,
     # assert actual_data does not contains word "clone"
     search_result = response.json()[0]["entry"]
     assert "clone" not in search_result
+
+
+def get_sample_files_data():
+    return {
+        "org": {
+            "path/to/filename.org": "* practicing piano",
+            "path/to/filename1.org": "** top 3 reasons why I moved to SF",
+            "path/to/filename2.org": "* how to build a search engine",
+        },
+        "pdf": {
+            "path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
+            "path/to/filename1.pdf": "The sun is a ball of helium",
+            "path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
+        },
+        "plaintext": {
+            "path/to/filename.txt": "data,column,value",
+            "path/to/filename1.txt": "<html>my first web page</html>",
+            "path/to/filename2.txt": "2021-02-02 Journal Entry",
+        },
+        "markdown": {
+            "path/to/filename.md": "# Notes from client call",
+            "path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
+            "path/to/filename2.md": "**Understanding science through the lens of art**",
+        },
+    }
tests/test_markdown_to_jsonl.py
@@ -1,9 +1,12 @@
 # Standard Packages
 import json
 from pathlib import Path
+import os

 # Internal Packages
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.utils.fs_syncer import get_markdown_files
+from khoj.utils.rawconfig import TextContentConfig


 def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
@@ -13,12 +16,14 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
     - Bullet point 1
     - Bullet point 2
     """
-    markdownfile = create_file(tmp_path, entry)
-    expected_heading = "# " + markdownfile.stem
+    data = {
+        f"{tmp_path}": entry,
+    }
+    expected_heading = f"# {tmp_path.stem}"

     # Act
     # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
+    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
     jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
@@ -41,11 +46,13 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
     \t\r
     Body Line 1
     """
-    markdownfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
+    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
     jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
@@ -68,11 +75,13 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
     \t\r
     Heading 2 Body Line 2
     """
-    markdownfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Markdown files
-    entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
+    entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
     entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)

     # Process Each Entry from All Notes Files
@@ -82,7 +91,7 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
     # Assert
     assert len(jsonl_data) == 2
     # Ensure entry compiled strings include the markdown files they originate from
-    assert all([markdownfile.stem in entry.compiled for entry in entries])
+    assert all([tmp_path.stem in entry.compiled for entry in entries])


 def test_get_markdown_files(tmp_path):
@@ -99,18 +108,27 @@ def test_get_markdown_files(tmp_path):
     create_file(tmp_path, filename="not-included-markdown.md")
     create_file(tmp_path, filename="not-included-text.txt")

-    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
+    expected_files = set(
+        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
+    )

     # Setup input-files, input-filters
     input_files = [tmp_path / "notes.md"]
     input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]

+    markdown_config = TextContentConfig(
+        input_files=input_files,
+        input_filter=[str(filter) for filter in input_filter],
+        compressed_jsonl=tmp_path / "test.jsonl",
+        embeddings_file=tmp_path / "test_embeddings.jsonl",
+    )
+
     # Act
-    extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)
+    extracted_org_files = get_markdown_files(markdown_config)

     # Assert
     assert len(extracted_org_files) == 5
-    assert extracted_org_files == expected_files
+    assert set(extracted_org_files.keys()) == expected_files


 def test_extract_entries_with_different_level_headings(tmp_path):
@@ -120,11 +138,13 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     # Heading 1
     ## Heading 2
     """
-    markdownfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Markdown files
-    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
+    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)

     # Assert
     assert len(entries) == 2
tests/test_org_to_jsonl.py
@@ -1,11 +1,14 @@
 # Standard Packages
 import json
+import os

 # Internal Packages
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import is_none_or_empty
 from khoj.utils.rawconfig import Entry
+from khoj.utils.fs_syncer import get_org_files
+from khoj.utils.rawconfig import TextContentConfig


 def test_configure_heading_entry_to_jsonl(tmp_path):
@@ -18,14 +21,17 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
     :END:
     \t \r
     """
-    orgfile = create_file(tmp_path, entry)

+    data = {
+        f"{tmp_path}": entry,
+    }
+
     for index_heading_entries in [True, False]:
         # Act
         # Extract entries into jsonl from specified Org files
         jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
             OrgToJsonl.convert_org_nodes_to_entries(
-                *OrgToJsonl.extract_org_entries(org_files=[orgfile]), index_heading_entries=index_heading_entries
+                *OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
             )
         )
         jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -46,12 +52,14 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     \t\r
     Body Line
     """
-    orgfile = create_file(tmp_path, entry)
-    expected_heading = f"* {orgfile.stem}\n** Heading"
+    data = {
+        f"{tmp_path}": entry,
+    }
+    expected_heading = f"* {tmp_path.stem}\n** Heading"

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)

     # Split each entry from specified Org files by max words
     jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
@@ -95,11 +103,13 @@ def test_entry_with_body_to_jsonl(tmp_path):
     \t\r
     Body Line 1
     """
-    orgfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
     jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
@@ -120,11 +130,13 @@ Intro text
 * Entry Heading
 entry body
 """
-    orgfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
     entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
@@ -142,11 +154,13 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
     - Bullet point 1
     - Bullet point 2
     """
-    orgfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
     entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
@@ -171,18 +185,30 @@ def test_get_org_files(tmp_path):
     create_file(tmp_path, filename="orgfile2.org")
     create_file(tmp_path, filename="text1.txt")

-    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
+    expected_files = set(
+        [
+            os.path.join(tmp_path, file.name)
+            for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
+        ]
+    )

     # Setup input-files, input-filters
     input_files = [tmp_path / "orgfile1.org"]
     input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]

+    org_config = TextContentConfig(
+        input_files=input_files,
+        input_filter=[str(filter) for filter in input_filter],
+        compressed_jsonl=tmp_path / "test.jsonl",
+        embeddings_file=tmp_path / "test_embeddings.jsonl",
+    )
+
     # Act
-    extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)
+    extracted_org_files = get_org_files(org_config)

     # Assert
     assert len(extracted_org_files) == 5
-    assert extracted_org_files == expected_files
+    assert set(extracted_org_files.keys()) == expected_files


 def test_extract_entries_with_different_level_headings(tmp_path):
@@ -192,11 +218,13 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     * Heading 1
     ** Heading 2
     """
-    orgfile = create_file(tmp_path, entry)
+    data = {
+        f"{tmp_path}": entry,
+    }

     # Act
     # Extract Entries from specified Org files
-    entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+    entries, _ = OrgToJsonl.extract_org_entries(org_files=data)

     # Assert
     assert len(entries) == 2
@@ -44,7 +44,7 @@ Body Line 1"""
|
||||
assert len(entries) == 1
|
||||
assert entries[0].heading == "Heading"
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1"
|
||||
assert entries[0].body == "Body Line 1\n\n"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].closed == ""
|
||||
@@ -78,7 +78,7 @@ Body Line 2"""
|
||||
assert entries[0].heading == "Heading"
|
||||
assert entries[0].todo == "DONE"
|
||||
assert entries[0].tags == ["Tag1", "TAG2", "tag3"]
|
||||
assert entries[0].body == "- Clocked Log 1\nBody Line 1\nBody Line 2"
|
||||
assert entries[0].body == "- Clocked Log 1\n\nBody Line 1\n\nBody Line 2\n\n"
|
||||
assert entries[0].priority == "A"
|
||||
assert entries[0].Property("ID") == "id:123-456-789-4234-1231"
|
||||
assert entries[0].closed == datetime.date(1984, 4, 1)
|
||||
@@ -205,7 +205,7 @@ Body 2
|
||||
assert entry.heading == f"Heading{index+1}"
|
||||
assert entry.todo == "FAILED" if index == 0 else "CANCELLED"
|
||||
assert entry.tags == [f"tag{index+1}"]
|
||||
assert entry.body == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
|
||||
assert entry.body == f"- Clocked Log {index+1}\n\nBody {index+1}\n\n"
|
||||
assert entry.priority == "A"
|
||||
assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"
|
||||
assert entry.closed == datetime.date(1984, 4, index + 1)
|
||||
@@ -305,7 +305,7 @@ entry body
|
||||
assert entries[0].heading == "Title"
|
||||
assert entries[0].body == "intro body\n"
|
||||
assert entries[1].heading == "Entry Heading"
|
||||
assert entries[1].body == "entry body\n"
|
||||
assert entries[1].body == "entry body\n\n"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@@ -327,7 +327,7 @@ entry body
|
||||
assert entries[0].heading == "Title1 Title2"
|
||||
assert entries[0].body == "intro body\n"
|
||||
assert entries[1].heading == "Entry Heading"
|
||||
assert entries[1].body == "entry body\n"
|
||||
assert entries[1].body == "entry body\n\n"
|
||||
|
||||
|
||||
# Helper Functions
|
||||
|
||||
tests/test_pdf_to_jsonl.py
@@ -1,15 +1,24 @@
 # Standard Packages
 import json
+import os

 # Internal Packages
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+
+from khoj.utils.fs_syncer import get_pdf_files
+from khoj.utils.rawconfig import TextContentConfig


 def test_single_page_pdf_to_jsonl():
     "Convert single page PDF file to jsonl."
     # Act
     # Extract Entries from specified Pdf files
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])
+    # Read singlepage.pdf into memory as bytes
+    with open("tests/data/pdf/singlepage.pdf", "rb") as f:
+        pdf_bytes = f.read()
+
+    data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
+    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
     jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
@@ -25,7 +34,11 @@ def test_multi_page_pdf_to_jsonl():
     "Convert multiple pages from single PDF file to jsonl."
     # Act
     # Extract Entries from specified Pdf files
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])
+    with open("tests/data/pdf/multipage.pdf", "rb") as f:
+        pdf_bytes = f.read()
+
+    data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
+    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
     jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
@@ -51,18 +64,27 @@ def test_get_pdf_files(tmp_path):
     create_file(tmp_path, filename="not-included-document.pdf")
     create_file(tmp_path, filename="not-included-text.txt")

-    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
+    expected_files = set(
+        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
+    )

     # Setup input-files, input-filters
     input_files = [tmp_path / "document.pdf"]
     input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]

+    pdf_config = TextContentConfig(
+        input_files=input_files,
+        input_filter=[str(path) for path in input_filter],
+        compressed_jsonl=tmp_path / "test.jsonl",
+        embeddings_file=tmp_path / "test_embeddings.jsonl",
+    )
+
     # Act
-    extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)
+    extracted_pdf_files = get_pdf_files(pdf_config)

     # Assert
     assert len(extracted_pdf_files) == 5
-    assert extracted_pdf_files == expected_files
+    assert set(extracted_pdf_files.keys()) == expected_files


 # Helper Functions
tests/test_plaintext_to_jsonl.py
@@ -1,8 +1,11 @@
 # Standard Packages
 import json
+import os
 from pathlib import Path

 # Internal Packages
+from khoj.utils.fs_syncer import get_plaintext_files
+from khoj.utils.rawconfig import TextContentConfig
 from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl


@@ -18,9 +21,12 @@ def test_plaintext_file(tmp_path):

     # Act
     # Extract Entries from specified plaintext files
-    file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])

-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
+    data = {
+        f"{plaintextfile}": entry,
+    }
+
+    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)

     # Convert each entry.file to absolute path to make them JSON serializable
     for map in maps:
@@ -59,33 +65,40 @@ def test_get_plaintext_files(tmp_path):
     create_file(tmp_path, filename="not-included-markdown.md")
     create_file(tmp_path, filename="not-included-text.txt")

-    expected_files = sorted(
-        map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4])
+    expected_files = set(
+        [
+            os.path.join(tmp_path, file.name)
+            for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file3, group2_file4, file1]
+        ]
     )

     # Setup input-files, input-filters
     input_files = [tmp_path / "notes.txt"]
     input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]

+    plaintext_config = TextContentConfig(
+        input_files=input_files,
+        input_filter=[str(filter) for filter in input_filter],
+        compressed_jsonl=tmp_path / "test.jsonl",
+        embeddings_file=tmp_path / "test_embeddings.jsonl",
+    )
+
     # Act
-    extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+    extracted_plaintext_files = get_plaintext_files(plaintext_config)

     # Assert
     assert len(extracted_plaintext_files) == 7
-    assert set(extracted_plaintext_files) == set(expected_files)
+    assert set(extracted_plaintext_files.keys()) == set(expected_files)


 def test_parse_html_plaintext_file(content_config):
     "Ensure HTML files are parsed correctly"
     # Arrange
     # Setup input-files, input-filters
-    input_files = content_config.plaintext.input_files
-    input_filter = content_config.plaintext.input_filter
+    extracted_plaintext_files = get_plaintext_files(content_config.plaintext)

     # Act
-    extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
-    file_to_entries = PlaintextToJsonl.extract_plaintext_entries(extracted_plaintext_files)
-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
+    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)

     # Assert
     assert len(maps) == 1
tests/test_text_search.py
@@ -13,6 +13,7 @@ from khoj.search_type import text_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.utils.fs_syncer import get_org_files


 # Test
@@ -27,26 +28,30 @@ def test_text_search_setup_with_missing_file_raises_error(

     # Act
     # Generate notes embeddings during asymmetric setup
-    with pytest.raises(ValueError, match=r"^No valid entries found in specified files:*"):
-        text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
+    with pytest.raises(FileNotFoundError):
+        data = get_org_files(org_config_with_only_new_file)


 # ----------------------------------------------------------------------------------------------------
 def test_text_search_setup_with_empty_file_raises_error(
     org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
 ):
+    # Arrange
+    data = get_org_files(org_config_with_only_new_file)
     # Act
     # Generate notes embeddings during asymmetric setup
     with pytest.raises(ValueError, match=r"^No valid entries found*"):
-        text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
+        text_search.setup(OrgToJsonl, data, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)


 # ----------------------------------------------------------------------------------------------------
 def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
     # Arrange
+    data = get_org_files(content_config.org)
     # Act
     # Regenerate notes embeddings during asymmetric setup
     notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )

     # Assert
@@ -59,14 +64,16 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea
     # Arrange
     caplog.set_level(logging.INFO, logger="khoj")

+    data = get_org_files(content_config.org)
+
     # Act
     # Generate initial notes embeddings during asymmetric setup
-    text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True)
+    text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

     # Run asymmetric setup again with no changes to data source. Ensure index is not updated
-    text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False)
+    text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False)
     final_logs = caplog.text

     # Assert
@@ -78,9 +85,11 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea
 @pytest.mark.anyio
 async def test_text_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
+    data = get_org_files(content_config.org)
+
     search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     content_index.org = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )
     query = "How to git install application?"

@@ -108,10 +117,12 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
     for index in range(max_tokens + 1):
         f.write(f"{index} ")

+    data = get_org_files(org_config_with_only_new_file)
+
     # Act
     # reload embeddings, entries, notes model after adding new org-mode file
     initial_notes_model = text_search.setup(
-        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
     )

     # Assert
@@ -125,8 +136,9 @@ def test_regenerate_index_with_new_entry(
     content_config: ContentConfig, search_models: SearchModels, new_org_file: Path
 ):
     # Arrange
+    data = get_org_files(content_config.org)
     initial_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )

     assert len(initial_notes_model.entries) == 10
@@ -137,10 +149,12 @@ def test_regenerate_index_with_new_entry(
     with open(new_org_file, "w") as f:
         f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")

+    data = get_org_files(content_config.org)
+
     # Act
     # regenerate notes jsonl, model embeddings and model to include entry from new file
     regenerated_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )

     # Assert
@@ -169,15 +183,19 @@ def test_update_index_with_duplicate_entries_in_stable_order(
     with open(new_file_to_index, "w") as f:
         f.write(f"{new_entry}{new_entry}")

+    data = get_org_files(org_config_with_only_new_file)
+
     # Act
     # load embeddings, entries, notes model after adding new org-mode file
     initial_index = text_search.setup(
-        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
     )

+    data = get_org_files(org_config_with_only_new_file)
+
     # update embeddings, entries, notes model after adding new org-mode file
     updated_index = text_search.setup(
-        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
     )

     # Assert
@@ -200,19 +218,22 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextCont
     new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
     with open(new_file_to_index, "w") as f:
         f.write(f"{new_entry}{new_entry} -- Tatooine")
+    data = get_org_files(org_config_with_only_new_file)

     # load embeddings, entries, notes model after adding new org file with 2 entries
     initial_index = text_search.setup(
-        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
     )

     # update embeddings, entries, notes model after removing an entry from the org file
     with open(new_file_to_index, "w") as f:
         f.write(f"{new_entry}")

+    data = get_org_files(org_config_with_only_new_file)
+
     # Act
     updated_index = text_search.setup(
-        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
     )

     # Assert
@@ -229,8 +250,9 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextCont
 # ----------------------------------------------------------------------------------------------------
 def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange
+    data = get_org_files(content_config.org)
     initial_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
     )

     # append org-mode entry to first org input file in config
@@ -238,11 +260,13 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model
         new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
         f.write(new_entry)

+    data = get_org_files(content_config.org)
+
     # Act
     # update embeddings, entries with the newly added note
     content_config.org.input_files = [f"{new_org_file}"]
     final_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
+        OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
     )

     # Assert