Move to a push-first model for retrieving embeddings from local files (#457)

* Initial version: set up a file-push architecture for generating embeddings with Khoj (a minimal client sketch follows below)
* Update unit tests to work with the new application design
* Allow the server to be configured without regenerating the index; regenerating at configure time no longer works because the file-indexing API is not yet up when the server would send its request
* Use state.host and state.port to configure the indexer URL
* On application startup, load embeddings from configuration files rather than regenerating the corpus from the file system
Authored by sabaimran on 2023-08-31 12:55:17 -07:00; committed by GitHub
parent 92cbfef7ab, commit 4854258047
23 changed files with 990 additions and 508 deletions
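
To make the new flow concrete, here is a minimal sketch of a push-first client against a locally running Khoj server. The `/indexer/batch` route, payload shape, and `x-api-key` header mirror the updated tests below; the host, port, API key, and file paths are illustrative assumptions.

```python
# Minimal sketch of a push-first indexing client.
# Assumptions: server address, API key, and note paths are illustrative;
# the /indexer/batch route and payload shape follow the updated tests.
from pathlib import Path

import requests

KHOJ_URL = "http://localhost:42110"  # assumed server address (state.host, state.port)
API_KEY = "secret"  # matches the x-api-key header used in the tests


def push_org_files(notes_dir: Path) -> None:
    # Payload maps content type -> {file path: raw file content}
    payload = {"org": {str(f): f.read_text() for f in notes_dir.glob("*.org")}}
    response = requests.post(
        f"{KHOJ_URL}/indexer/batch",
        json=payload,
        headers={"x-api-key": API_KEY},
    )
    response.raise_for_status()


push_org_files(Path("~/notes").expanduser())
```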

View File: tests/conftest.py

@@ -9,6 +9,7 @@ import pytest
from khoj.main import app
from khoj.configure import configure_processor, configure_routes, configure_search_types
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils.config import SearchModels
from khoj.utils.helpers import resolve_absolute_path
@@ -97,7 +98,12 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
OrgToJsonl,
get_sample_data("org"),
content_config.org,
search_models.text_search.bi_encoder,
regenerate=False,
filters=filters,
)
content_config.plugins = {
@@ -109,6 +115,20 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
)
}
if os.getenv("GITHUB_PAT_TOKEN"):
content_config.github = GithubContentConfig(
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
repos=[
GithubRepoConfig(
owner="khoj-ai",
name="lantern",
branch="master",
)
],
compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
embeddings_file=content_dir.joinpath("github_embeddings.pt"),
)
content_config.plaintext = TextContentConfig(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
@@ -132,6 +152,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup(
JsonlToJsonl,
None,
content_config.plugins["plugin1"],
search_models.text_search.bi_encoder,
regenerate=False,
@@ -203,6 +224,7 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.content_index.markdown = text_search.setup(
MarkdownToJsonl,
get_sample_data("markdown"),
md_content_config.markdown,
state.search_models.text_search.bi_encoder,
regenerate=False,
@@ -226,11 +248,22 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.search_models.image_search = image_search.initialize_model(search_config.image)
state.content_index.org = text_search.setup(
OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl,
get_sample_data("org"),
content_config.org,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.content_index.image = image_search.setup(
content_config.image, state.search_models.image_search, regenerate=False
)
state.content_index.plaintext = text_search.setup(
PlaintextToJsonl,
get_sample_data("plaintext"),
content_config.plaintext,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.processor_config = configure_processor(processor_config)
@@ -250,8 +283,21 @@ def client_offline_chat(
# Index Markdown Content for Search
filters = [DateFilter(), WordFilter(), FileFilter()]
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.search_models.image_search = image_search.initialize_model(search_config.image)
state.content_index.org = text_search.setup(
OrgToJsonl,
get_sample_data("org"),
content_config.org,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.content_index.image = image_search.setup(
content_config.image, state.search_models.image_search, regenerate=False
)
state.content_index.markdown = text_search.setup(
MarkdownToJsonl,
get_sample_data("markdown"),
md_content_config.markdown,
state.search_models.text_search.bi_encoder,
regenerate=False,
@@ -284,3 +330,69 @@ def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: P
new_org_config.input_files = [f"{new_org_file}"]
new_org_config.input_filter = None
return new_org_config
@pytest.fixture(scope="function")
def sample_org_data():
return get_sample_data("org")
def get_sample_data(type):
sample_data = {
"org": {
"readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src"""
},
"markdown": {
"readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
},
"plaintext": {
"readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
Dependencies
- Python3
- Miniconda
Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
},
}
return sample_data[type]
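
As the fixtures above show, processors now consume an in-memory mapping of file path to file content instead of resolving paths themselves, which is what lets pushed data bypass the server's file system. A minimal sketch of the new calling convention, with a made-up path and content:

```python
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl

# Processors accept {file_path: file_content} mappings rather than path lists
markdown_files = {"docs/readme.md": "# Khoj\nSearch your notes with natural language."}
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=markdown_files)
```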

View File: tests/test_client.py

@@ -11,7 +11,6 @@ from fastapi.testclient import TestClient
from khoj.main import app
from khoj.configure import configure_routes, configure_search_types
from khoj.utils import state
from khoj.utils.config import SearchModels
from khoj.utils.state import search_models, content_index, config
from khoj.search_type import text_search, image_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig
@@ -51,28 +50,6 @@ def test_update_with_invalid_content_type(client):
assert response.status_code == 422
# ----------------------------------------------------------------------------------------------------
def test_update_with_valid_content_type(client):
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
# Act
response = client.get(f"/api/update?t={content_type}")
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
# ----------------------------------------------------------------------------------------------------
def test_update_with_github_fails_without_pat(client):
# Act
response = client.get(f"/api/update?t=github")
# Assert
assert response.status_code == 500, f"Returned status: {response.status_code} for content type: github"
assert (
response.json()["detail"]
== "🚨 Failed to update server via API: Github PAT token is not set. Skipping github content"
)
# ----------------------------------------------------------------------------------------------------
def test_regenerate_with_invalid_content_type(client):
# Act
@@ -82,11 +59,29 @@ def test_regenerate_with_invalid_content_type(client):
assert response.status_code == 422
# ----------------------------------------------------------------------------------------------------
def test_index_batch(client):
# Arrange
request_body = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post("/indexer/batch", json=request_body, headers=headers)
# Assert
assert response.status_code == 200
# ----------------------------------------------------------------------------------------------------
def test_regenerate_with_valid_content_type(client):
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
# Arrange
request_body = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.get(f"/api/update?force=true&t={content_type}")
response = client.post(f"/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
@@ -96,12 +91,15 @@ def test_regenerate_with_github_fails_without_pat(client):
# Act
response = client.get(f"/api/update?force=true&t=github")
# Arrange
request_body = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post(f"/indexer/batch?search_type=github", json=request_body, headers=headers)
# Assert
assert response.status_code == 500, f"Returned status: {response.status_code} for content type: github"
assert (
response.json()["detail"]
== "🚨 Failed to update server via API: Github PAT token is not set. Skipping github content"
)
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
# ----------------------------------------------------------------------------------------------------
@@ -111,7 +109,7 @@ def test_get_configured_types_via_api(client):
# Assert
assert response.status_code == 200
assert response.json() == ["all", "org", "image", "plugin1"]
assert response.json() == ["all", "org", "image", "plaintext", "plugin1"]
# ----------------------------------------------------------------------------------------------------
@@ -194,11 +192,11 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
# ----------------------------------------------------------------------------------------------------
def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig):
def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data):
# Arrange
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
content_index.org = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl, sample_org_data, content_config.org, search_models.text_search.bi_encoder, regenerate=False
)
user_query = quote("How to git install application?")
@@ -213,12 +211,19 @@ def test_notes_search(client, content_config: ContentConfig, search_config: Sear
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_only_filters(client, content_config: ContentConfig, search_config: SearchConfig):
def test_notes_search_with_only_filters(
client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
):
# Arrange
filters = [WordFilter(), FileFilter()]
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
content_index.org = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
OrgToJsonl,
sample_org_data,
content_config.org,
search_models.text_search.bi_encoder,
regenerate=False,
filters=filters,
)
user_query = quote('+"Emacs" file:"*.org"')
@@ -233,12 +238,14 @@ def test_notes_search_with_only_filters(client, content_config: ContentConfig, s
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_include_filter(client, content_config: ContentConfig, search_config: SearchConfig):
def test_notes_search_with_include_filter(
client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
):
# Arrange
filters = [WordFilter()]
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
content_index.org = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search, regenerate=False, filters=filters
OrgToJsonl, sample_org_data, content_config.org, search_models.text_search, regenerate=False, filters=filters
)
user_query = quote('How to git install application? +"Emacs"')
@@ -253,12 +260,19 @@ def test_notes_search_with_include_filter(client, content_config: ContentConfig,
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_exclude_filter(client, content_config: ContentConfig, search_config: SearchConfig):
def test_notes_search_with_exclude_filter(
client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data
):
# Arrange
filters = [WordFilter()]
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
content_index.org = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
OrgToJsonl,
sample_org_data,
content_config.org,
search_models.text_search.bi_encoder,
regenerate=False,
filters=filters,
)
user_query = quote('How to git install application? -"clone"')
@@ -270,3 +284,28 @@ def test_notes_search_with_exclude_filter(client, content_config: ContentConfig,
# assert actual_data does not contains word "clone"
search_result = response.json()[0]["entry"]
assert "clone" not in search_result
def get_sample_files_data():
return {
"org": {
"path/to/filename.org": "* practicing piano",
"path/to/filename1.org": "** top 3 reasons why I moved to SF",
"path/to/filename2.org": "* how to build a search engine",
},
"pdf": {
"path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
"path/to/filename1.pdf": "The sun is a ball of helium",
"path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
},
"plaintext": {
"path/to/filename.txt": "data,column,value",
"path/to/filename1.txt": "<html>my first web page</html>",
"path/to/filename2.txt": "2021-02-02 Journal Entry",
},
"markdown": {
"path/to/filename.md": "# Notes from client call",
"path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
"path/to/filename2.md": "**Understanding science through the lens of art**",
},
}

View File: tests/test_markdown_to_jsonl.py

@@ -1,9 +1,12 @@
# Standard Packages
import json
from pathlib import Path
import os
# Internal Packages
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.utils.fs_syncer import get_markdown_files
from khoj.utils.rawconfig import TextContentConfig
def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
@@ -13,12 +16,14 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
- Bullet point 1
- Bullet point 2
"""
markdownfile = create_file(tmp_path, entry)
expected_heading = "# " + markdownfile.stem
data = {
f"{tmp_path}": entry,
}
expected_heading = f"# {tmp_path.stem}"
# Act
# Extract Entries from specified Markdown files
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
@@ -41,11 +46,13 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
\t\r
Body Line 1
"""
markdownfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Markdown files
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
@@ -68,11 +75,13 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
\t\r
Heading 2 Body Line 2
"""
markdownfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Markdown files
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
# Process Each Entry from All Notes Files
@@ -82,7 +91,7 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
# Assert
assert len(jsonl_data) == 2
# Ensure entry compiled strings include the markdown files they originate from
assert all([markdownfile.stem in entry.compiled for entry in entries])
assert all([tmp_path.stem in entry.compiled for entry in entries])
def test_get_markdown_files(tmp_path):
@@ -99,18 +108,27 @@ def test_get_markdown_files(tmp_path):
create_file(tmp_path, filename="not-included-markdown.md")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
expected_files = set(
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
)
# Setup input-files, input-filters
input_files = [tmp_path / "notes.md"]
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
markdown_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)
extracted_org_files = get_markdown_files(markdown_config)
# Assert
assert len(extracted_org_files) == 5
assert extracted_org_files == expected_files
assert set(extracted_org_files.keys()) == expected_files
def test_extract_entries_with_different_level_headings(tmp_path):
@@ -120,11 +138,13 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Heading 1
## Heading 2
"""
markdownfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Markdown files
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
# Assert
assert len(entries) == 2

View File: tests/test_org_to_jsonl.py

@@ -1,11 +1,14 @@
# Standard Packages
import json
import os
# Internal Packages
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import is_none_or_empty
from khoj.utils.rawconfig import Entry
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import TextContentConfig
def test_configure_heading_entry_to_jsonl(tmp_path):
@@ -18,14 +21,17 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
:END:
\t \r
"""
orgfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
for index_heading_entries in [True, False]:
# Act
# Extract entries into jsonl from specified Org files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
OrgToJsonl.convert_org_nodes_to_entries(
*OrgToJsonl.extract_org_entries(org_files=[orgfile]), index_heading_entries=index_heading_entries
*OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -46,12 +52,14 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
\t\r
Body Line
"""
orgfile = create_file(tmp_path, entry)
expected_heading = f"* {orgfile.stem}\n** Heading"
data = {
f"{tmp_path}": entry,
}
expected_heading = f"* {tmp_path.stem}\n** Heading"
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
# Split each entry from specified Org files by max words
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
@@ -95,11 +103,13 @@ def test_entry_with_body_to_jsonl(tmp_path):
\t\r
Body Line 1
"""
orgfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
@@ -120,11 +130,13 @@ Intro text
* Entry Heading
entry body
"""
orgfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Org files
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
@@ -142,11 +154,13 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
- Bullet point 1
- Bullet point 2
"""
orgfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Org files
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
@@ -171,18 +185,30 @@ def test_get_org_files(tmp_path):
create_file(tmp_path, filename="orgfile2.org")
create_file(tmp_path, filename="text1.txt")
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
expected_files = set(
[
os.path.join(tmp_path, file.name)
for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
]
)
# Setup input-files, input-filters
input_files = [tmp_path / "orgfile1.org"]
input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
org_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)
extracted_org_files = get_org_files(org_config)
# Assert
assert len(extracted_org_files) == 5
assert extracted_org_files == expected_files
assert set(extracted_org_files.keys()) == expected_files
def test_extract_entries_with_different_level_headings(tmp_path):
@@ -192,11 +218,13 @@ def test_extract_entries_with_different_level_headings(tmp_path):
* Heading 1
** Heading 2
"""
orgfile = create_file(tmp_path, entry)
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Org files
entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile])
entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
# Assert
assert len(entries) == 2
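
For server-side indexing of locally configured content, the new `khoj.utils.fs_syncer` helpers resolve a content config's `input_files` and `input_filter` globs into the same path-to-content mapping the processors expect. A sketch with hypothetical paths:

```python
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import TextContentConfig

org_config = TextContentConfig(
    input_files=None,
    input_filter=["/home/user/notes/*.org"],  # hypothetical glob
    compressed_jsonl="/tmp/khoj/org.jsonl.gz",
    embeddings_file="/tmp/khoj/org_embeddings.pt",
)
org_files = get_org_files(org_config)  # returns {file_path: file_content}
```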

View File: tests/test_orgnode.py

@@ -44,7 +44,7 @@ Body Line 1"""
assert len(entries) == 1
assert entries[0].heading == "Heading"
assert entries[0].tags == list()
assert entries[0].body == "Body Line 1"
assert entries[0].body == "Body Line 1\n\n"
assert entries[0].priority == ""
assert entries[0].Property("ID") == ""
assert entries[0].closed == ""
@@ -78,7 +78,7 @@ Body Line 2"""
assert entries[0].heading == "Heading"
assert entries[0].todo == "DONE"
assert entries[0].tags == ["Tag1", "TAG2", "tag3"]
assert entries[0].body == "- Clocked Log 1\nBody Line 1\nBody Line 2"
assert entries[0].body == "- Clocked Log 1\n\nBody Line 1\n\nBody Line 2\n\n"
assert entries[0].priority == "A"
assert entries[0].Property("ID") == "id:123-456-789-4234-1231"
assert entries[0].closed == datetime.date(1984, 4, 1)
@@ -205,7 +205,7 @@ Body 2
assert entry.heading == f"Heading{index+1}"
assert entry.todo == "FAILED" if index == 0 else "CANCELLED"
assert entry.tags == [f"tag{index+1}"]
assert entry.body == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
assert entry.body == f"- Clocked Log {index+1}\n\nBody {index+1}\n\n"
assert entry.priority == "A"
assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"
assert entry.closed == datetime.date(1984, 4, index + 1)
@@ -305,7 +305,7 @@ entry body
assert entries[0].heading == "Title"
assert entries[0].body == "intro body\n"
assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n"
assert entries[1].body == "entry body\n\n"
# ----------------------------------------------------------------------------------------------------
@@ -327,7 +327,7 @@ entry body
assert entries[0].heading == "Title1 Title2"
assert entries[0].body == "intro body\n"
assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n"
assert entries[1].body == "entry body\n\n"
# Helper Functions

View File: tests/test_pdf_to_jsonl.py

@@ -1,15 +1,24 @@
# Standard Packages
import json
import os
# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.utils.fs_syncer import get_pdf_files
from khoj.utils.rawconfig import TextContentConfig
def test_single_page_pdf_to_jsonl():
"Convert single page PDF file to jsonl."
# Act
# Extract Entries from specified Pdf files
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])
# Read singlepage.pdf into memory as bytes
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
@@ -25,7 +34,11 @@ def test_multi_page_pdf_to_jsonl():
"Convert multiple pages from single PDF file to jsonl."
# Act
# Extract Entries from specified Pdf files
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])
with open("tests/data/pdf/multipage.pdf", "rb") as f:
pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
@@ -51,18 +64,27 @@ def test_get_pdf_files(tmp_path):
create_file(tmp_path, filename="not-included-document.pdf")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
expected_files = set(
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
)
# Setup input-files, input-filters
input_files = [tmp_path / "document.pdf"]
input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
pdf_config = TextContentConfig(
input_files=input_files,
input_filter=[str(path) for path in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)
extracted_pdf_files = get_pdf_files(pdf_config)
# Assert
assert len(extracted_pdf_files) == 5
assert extracted_pdf_files == expected_files
assert set(extracted_pdf_files.keys()) == expected_files
# Helper Functions

View File: tests/test_plaintext_to_jsonl.py

@@ -1,8 +1,11 @@
# Standard Packages
import json
import os
from pathlib import Path
# Internal Packages
from khoj.utils.fs_syncer import get_plaintext_files
from khoj.utils.rawconfig import TextContentConfig
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
@@ -18,9 +21,12 @@ def test_plaintext_file(tmp_path):
# Act
# Extract Entries from specified plaintext files
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
data = {
f"{plaintextfile}": entry,
}
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
# Convert each entry.file to absolute path to make them JSON serializable
for map in maps:
@@ -59,33 +65,40 @@ def test_get_plaintext_files(tmp_path):
create_file(tmp_path, filename="not-included-markdown.md")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = sorted(
map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4])
expected_files = set(
[
os.path.join(tmp_path, file.name)
for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file3, group2_file4, file1]
]
)
# Setup input-files, input-filters
input_files = [tmp_path / "notes.txt"]
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
plaintext_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
extracted_plaintext_files = get_plaintext_files(plaintext_config)
# Assert
assert len(extracted_plaintext_files) == 7
assert set(extracted_plaintext_files) == set(expected_files)
assert set(extracted_plaintext_files.keys()) == set(expected_files)
def test_parse_html_plaintext_file(content_config):
"Ensure HTML files are parsed correctly"
# Arrange
# Setup input-files, input-filters
input_files = content_config.plaintext.input_files
input_filter = content_config.plaintext.input_filter
extracted_plaintext_files = get_plaintext_files(content_config.plaintext)
# Act
extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(extracted_plaintext_files)
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
# Assert
assert len(maps) == 1

View File: tests/test_text_search.py

@@ -13,6 +13,7 @@ from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.fs_syncer import get_org_files
# Test
@@ -27,26 +28,30 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(ValueError, match=r"^No valid entries found in specified files:*"):
text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
with pytest.raises(FileNotFoundError):
data = get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
def test_text_search_setup_with_empty_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
# Arrange
data = get_org_files(org_config_with_only_new_file)
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(ValueError, match=r"^No valid entries found*"):
text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
text_search.setup(OrgToJsonl, data, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
# ----------------------------------------------------------------------------------------------------
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
# Arrange
data = get_org_files(content_config.org)
# Act
# Regenerate notes embeddings during asymmetric setup
notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
)
# Assert
@@ -59,14 +64,16 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea
# Arrange
caplog.set_level(logging.INFO, logger="khoj")
data = get_org_files(content_config.org)
# Act
# Generate initial notes embeddings during asymmetric setup
text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True)
text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True)
initial_logs = caplog.text
caplog.clear() # Clear logs
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False)
text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False)
final_logs = caplog.text
# Assert
@@ -78,9 +85,11 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea
@pytest.mark.anyio
async def test_text_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
data = get_org_files(content_config.org)
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
content_index.org = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
)
query = "How to git install application?"
@@ -108,10 +117,12 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
for index in range(max_tokens + 1):
f.write(f"{index} ")
data = get_org_files(org_config_with_only_new_file)
# Act
# reload embeddings, entries, notes model after adding new org-mode file
initial_notes_model = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
)
# Assert
@@ -125,8 +136,9 @@ def test_regenerate_index_with_new_entry(
content_config: ContentConfig, search_models: SearchModels, new_org_file: Path
):
# Arrange
data = get_org_files(content_config.org)
initial_notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
)
assert len(initial_notes_model.entries) == 10
@@ -137,10 +149,12 @@ def test_regenerate_index_with_new_entry(
with open(new_org_file, "w") as f:
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
data = get_org_files(content_config.org)
# Act
# regenerate notes jsonl, model embeddings and model to include entry from new file
regenerated_notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True
)
# Assert
@@ -169,15 +183,19 @@ def test_update_index_with_duplicate_entries_in_stable_order(
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}{new_entry}")
data = get_org_files(org_config_with_only_new_file)
# Act
# load embeddings, entries, notes model after adding new org-mode file
initial_index = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
)
data = get_org_files(org_config_with_only_new_file)
# update embeddings, entries, notes model after adding new org-mode file
updated_index = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
)
# Assert
@@ -200,19 +218,22 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextCont
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}{new_entry} -- Tatooine")
data = get_org_files(org_config_with_only_new_file)
# load embeddings, entries, notes model after adding new org file with 2 entries
initial_index = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
)
# update embeddings, entries, notes model after removing an entry from the org file
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}")
data = get_org_files(org_config_with_only_new_file)
# Act
updated_index = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
)
# Assert
@@ -229,8 +250,9 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextCont
# ----------------------------------------------------------------------------------------------------
def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
# Arrange
data = get_org_files(content_config.org)
initial_notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
)
# append org-mode entry to first org input file in config
@@ -238,11 +260,13 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
f.write(new_entry)
data = get_org_files(content_config.org)
# Act
# update embeddings, entries with the newly added note
content_config.org.input_files = [f"{new_org_file}"]
final_notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
)
# Assert