Use Black to format Khoj server code and tests

Commit authored by Debanjum Singh Solanky on 2023-02-17 10:04:26 -06:00
parent 6130fddf45
commit 5e83baab21
44 changed files with 1167 additions and 915 deletions

View File

@@ -6,59 +6,67 @@ import pytest
# Internal Packages
from khoj.search_type import image_search, text_search
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
from khoj.utils.rawconfig import (
ContentConfig,
TextContentConfig,
ImageContentConfig,
SearchConfig,
TextSearchConfig,
ImageSearchConfig,
)
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.search_filter.date_filter import DateFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.search_filter.file_filter import FileFilter
@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
    """Build a session-wide SearchConfig with symmetric, asymmetric and image search models."""
    # Models are cached under ~/.khoj/search so repeated test runs avoid re-downloading them
    model_dir = resolve_absolute_path("~/.khoj/search")
    model_dir.mkdir(parents=True, exist_ok=True)

    search_config = SearchConfig()

    search_config.symmetric = TextSearchConfig(
        encoder="sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=model_dir / "symmetric/",
    )

    search_config.asymmetric = TextSearchConfig(
        encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=model_dir / "asymmetric/",
    )

    search_config.image = ImageSearchConfig(
        encoder="sentence-transformers/clip-ViT-B-32", model_directory=model_dir / "image/"
    )

    return search_config
@pytest.fixture(scope="session")
def content_config(tmp_path_factory, search_config: SearchConfig):
    """Index the test images and org notes once per session into a ContentConfig."""
    content_dir = tmp_path_factory.mktemp("content")

    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    content_config.image = ImageContentConfig(
        input_directories=["tests/data/images"],
        embeddings_file=content_dir.joinpath("image_embeddings.pt"),
        batch_size=1,
        use_xmp_metadata=False,
    )
    image_search.setup(content_config.image, search_config.image, regenerate=False)

    # Generate Notes Embeddings from Test Notes
    content_config.org = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
        compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
        embeddings_file=content_dir.joinpath("note_embeddings.pt"),
    )
    filters = [DateFilter(), WordFilter(), FileFilter()]
    text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)

    return content_config
@pytest.fixture(scope='function')
@pytest.fixture(scope="function")
def new_org_file(content_config: ContentConfig):
# Setup
new_org_file = Path(content_config.org.input_filter[0]).parent / "new_file.org"
@@ -79,9 +87,9 @@ def new_org_file(content_config: ContentConfig):
new_org_file.unlink()
@pytest.fixture(scope="function")
def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: Path):
    """Return a copy of the org content config that indexes only the newly created org file."""
    new_org_config = deepcopy(content_config.org)
    new_org_config.input_files = [f"{new_org_file}"]
    new_org_config.input_filter = None  # disable glob filter so only input_files are indexed
    return new_org_config

View File

@@ -8,10 +8,10 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
def test_no_transactions_in_file(tmp_path):
"Handle file with no transactions."
# Arrange
entry = f'''
entry = f"""
- Bullet point 1
- Bullet point 2
'''
"""
beancount_file = create_file(tmp_path, entry)
# Act
@@ -20,7 +20,8 @@ def test_no_transactions_in_file(tmp_path):
# Process Each Entry from All Beancount Files
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries))
BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -30,11 +31,11 @@ def test_no_transactions_in_file(tmp_path):
def test_single_beancount_transaction_to_jsonl(tmp_path):
"Convert transaction from single file to jsonl."
# Arrange
entry = f'''
entry = f"""
1984-04-01 * "Payee" "Narration"
Expenses:Test:Test 1.00 KES
Assets:Test:Test -1.00 KES
'''
"""
beancount_file = create_file(tmp_path, entry)
# Act
@@ -43,7 +44,8 @@ Assets:Test:Test -1.00 KES
# Process Each Entry from All Beancount Files
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -53,7 +55,7 @@ Assets:Test:Test -1.00 KES
def test_multiple_transactions_to_jsonl(tmp_path):
"Convert multiple transactions from single file to jsonl."
# Arrange
entry = f'''
entry = f"""
1984-04-01 * "Payee" "Narration"
Expenses:Test:Test 1.00 KES
Assets:Test:Test -1.00 KES
@@ -61,7 +63,7 @@ Assets:Test:Test -1.00 KES
1984-04-01 * "Payee" "Narration"
Expenses:Test:Test 1.00 KES
Assets:Test:Test -1.00 KES
'''
"""
beancount_file = create_file(tmp_path, entry)
@@ -71,7 +73,8 @@ Assets:Test:Test -1.00 KES
# Process Each Entry from All Beancount Files
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -95,8 +98,8 @@ def test_get_beancount_files(tmp_path):
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
# Setup input-files, input-filters
input_files = [tmp_path / 'ledger.bean']
input_filter = [tmp_path / 'group1*.bean', tmp_path / 'group2*.beancount']
input_files = [tmp_path / "ledger.bean"]
input_filter = [tmp_path / "group1*.bean", tmp_path / "group2*.beancount"]
# Act
extracted_org_files = BeancountToJsonl.get_beancount_files(input_files, input_filter)

View File

@@ -6,7 +6,7 @@ from khoj.processor.conversation.gpt import converse, understand, message_to_pro
# Initialize variables for tests
model = "text-davinci-003"  # OpenAI completion model used by the live-API tests below
api_key = None  # Input your OpenAI API key to run the tests below
@@ -14,19 +14,22 @@ api_key = None # Input your OpenAI API key to run the tests below
# ----------------------------------------------------------------------------------------------------
def test_message_to_understand_prompt():
    """message_to_prompt should append the new question to the primer wrapped in restart/start sequences."""
    # Arrange
    understand_primer = 'Extract information from each chat message\n\nremember(memory-type, data);\nmemory-type=["companion", "notes", "ledger", "image", "music"]\nsearch(search-type, data);\nsearch-type=["google", "youtube"]\ngenerate(activity);\nactivity=["paint","write", "chat"]\ntrigger-emotion(emotion);\nemotion=["happy","confidence","fear","surprise","sadness","disgust","anger", "curiosity", "calm"]\n\nQ: How are you doing?\nA: activity("chat"); trigger-emotion("surprise")\nQ: Do you remember what I told you about my brother Antoine when we were at the beach?\nA: remember("notes", "Brother Antoine when we were at the beach"); trigger-emotion("curiosity");\nQ: what did we talk about last time?\nA: remember("notes", "talk last time"); trigger-emotion("curiosity");\nQ: Let\'s make some drawings!\nA: generate("paint"); trigger-emotion("happy");\nQ: Do you know anything about Lebanon?\nA: search("google", "lebanon"); trigger-emotion("confidence");\nQ: Find a video about a panda rolling in the grass\nA: search("youtube","panda rolling in the grass"); trigger-emotion("happy"); \nQ: Tell me a scary story\nA: generate("write" "A story about some adventure"); trigger-emotion("fear");\nQ: What fiction book was I reading last week about AI starship?\nA: remember("notes", "read fiction book about AI starship last week"); trigger-emotion("curiosity");\nQ: How much did I spend at Subway for dinner last time?\nA: remember("ledger", "last Subway dinner"); trigger-emotion("curiosity");\nQ: I\'m feeling sleepy\nA: activity("chat"); trigger-emotion("calm")\nQ: What was that popular Sri lankan song that Alex showed me recently?\nA: remember("music", "popular Sri lankan song that Alex showed recently"); trigger-emotion("curiosity"); \nQ: You\'re pretty funny!\nA: activity("chat"); trigger-emotion("pride")'
    # Expected output is the primer plus the new question framed by the restart/start sequences
    expected_response = 'Extract information from each chat message\n\nremember(memory-type, data);\nmemory-type=["companion", "notes", "ledger", "image", "music"]\nsearch(search-type, data);\nsearch-type=["google", "youtube"]\ngenerate(activity);\nactivity=["paint","write", "chat"]\ntrigger-emotion(emotion);\nemotion=["happy","confidence","fear","surprise","sadness","disgust","anger", "curiosity", "calm"]\n\nQ: How are you doing?\nA: activity("chat"); trigger-emotion("surprise")\nQ: Do you remember what I told you about my brother Antoine when we were at the beach?\nA: remember("notes", "Brother Antoine when we were at the beach"); trigger-emotion("curiosity");\nQ: what did we talk about last time?\nA: remember("notes", "talk last time"); trigger-emotion("curiosity");\nQ: Let\'s make some drawings!\nA: generate("paint"); trigger-emotion("happy");\nQ: Do you know anything about Lebanon?\nA: search("google", "lebanon"); trigger-emotion("confidence");\nQ: Find a video about a panda rolling in the grass\nA: search("youtube","panda rolling in the grass"); trigger-emotion("happy"); \nQ: Tell me a scary story\nA: generate("write" "A story about some adventure"); trigger-emotion("fear");\nQ: What fiction book was I reading last week about AI starship?\nA: remember("notes", "read fiction book about AI starship last week"); trigger-emotion("curiosity");\nQ: How much did I spend at Subway for dinner last time?\nA: remember("ledger", "last Subway dinner"); trigger-emotion("curiosity");\nQ: I\'m feeling sleepy\nA: activity("chat"); trigger-emotion("calm")\nQ: What was that popular Sri lankan song that Alex showed me recently?\nA: remember("music", "popular Sri lankan song that Alex showed recently"); trigger-emotion("curiosity"); \nQ: You\'re pretty funny!\nA: activity("chat"); trigger-emotion("pride")\nQ: When did I last dine at Burger King?\nA:'

    # Act
    actual_response = message_to_prompt(
        "When did I last dine at Burger King?", understand_primer, start_sequence="\nA:", restart_sequence="\nQ:"
    )

    # Assert
    assert actual_response == expected_response
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(api_key is None,
reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys")
@pytest.mark.skipif(
api_key is None, reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys"
)
def test_minimal_chat_with_gpt():
# Act
response = converse("What will happen when the stars go out?", model=model, api_key=api_key)
@@ -36,21 +39,29 @@ def test_minimal_chat_with_gpt():
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(
    api_key is None, reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys"
)
def test_chat_with_history():
    """converse should answer using facts stated earlier in the passed conversation history."""
    # Arrange
    ai_prompt = "AI:"
    human_prompt = "Human:"

    conversation_primer = f"""
The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly companion.
{human_prompt} Hello, I am Testatron. Who are you?
{ai_prompt} Hi, I am Khoj, an AI conversational companion created by OpenAI. How can I help you today?"""

    # Act
    # temperature=0 keeps the completion deterministic; max_tokens bounds test cost
    response = converse(
        "Hi Khoj, What is my name?",
        model=model,
        conversation_history=conversation_primer,
        api_key=api_key,
        temperature=0,
        max_tokens=50,
    )

    # Assert
    assert len(response) > 0
@@ -58,12 +69,13 @@ The following is a conversation with an AI assistant. The assistant is helpful,
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(
    api_key is None, reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys"
)
def test_understand_message_using_gpt():
    """understand should classify a dining question as a ledger memory lookup."""
    # Act
    response = understand("When did I last dine at Subway?", model=model, api_key=api_key)

    # Assert
    assert len(response) > 0
    assert response["intent"]["memory-type"] == "ledger"

View File

@@ -14,35 +14,37 @@ def test_cli_minimal_default():
actual_args = cli([])
# Assert
assert actual_args.config_file == resolve_absolute_path(Path('~/.khoj/khoj.yml'))
assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml"))
assert actual_args.regenerate == False
assert actual_args.no_gui == False
assert actual_args.verbose == 0
# ----------------------------------------------------------------------------------------------------
def test_cli_invalid_config_file_path():
    """cli should accept a path to a non-existent config file but load no config from it."""
    # Arrange
    # Randomized name ensures the file cannot accidentally exist between test runs
    non_existent_config_file = f"non-existent-khoj-{random()}.yml"

    # Act
    actual_args = cli([f"-c={non_existent_config_file}"])

    # Assert
    assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
    assert actual_args.config is None
# ----------------------------------------------------------------------------------------------------
def test_cli_config_from_file():
    """cli should load config contents, regenerate, no-gui and verbosity flags from arguments."""
    # Act
    actual_args = cli(["-c=tests/data/config.yml", "--regenerate", "--no-gui", "-vvv"])

    # Assert
    assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml"))
    assert actual_args.no_gui == True
    assert actual_args.regenerate == True
    assert actual_args.config is not None
    assert actual_args.config.content_type.org.input_files == [
        Path("~/first_from_config.org"),
        Path("~/second_from_config.org"),
    ]
    assert actual_args.verbose == 3  # -vvv maps to verbosity level 3

View File

@@ -21,6 +21,7 @@ from khoj.search_filter.file_filter import FileFilter
# ----------------------------------------------------------------------------------------------------
client = TestClient(app)
# Test
# ----------------------------------------------------------------------------------------------------
def test_search_with_invalid_content_type():
@@ -98,9 +99,11 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
config.content_type = content_config
config.search_type = search_config
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
query_expected_image_pairs = [("kitten", "kitten_park.jpg"),
("a horse and dog on a leash", "horse_dog.jpg"),
("A guinea pig eating grass", "guineapig_grass.jpg")]
query_expected_image_pairs = [
("kitten", "kitten_park.jpg"),
("a horse and dog on a leash", "horse_dog.jpg"),
("A guinea pig eating grass", "guineapig_grass.jpg"),
]
for query, expected_image_name in query_expected_image_pairs:
# Act
@@ -135,7 +138,9 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
filters = [WordFilter(), FileFilter()]
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
model.orgmode_search = text_search.setup(
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
)
user_query = quote('+"Emacs" file:"*.org"')
# Act
@@ -152,7 +157,9 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
filters = [WordFilter()]
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
model.orgmode_search = text_search.setup(
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
)
user_query = quote('How to git install application? +"Emacs"')
# Act
@@ -169,7 +176,9 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
filters = [WordFilter()]
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
model.orgmode_search = text_search.setup(
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
)
user_query = quote('How to git install application? -"clone"')
# Act

View File

@@ -10,53 +10,59 @@ from khoj.utils.rawconfig import Entry
def test_date_filter():
entries = [
Entry(compiled='', raw='Entry with no date'),
Entry(compiled='', raw='April Fools entry: 1984-04-01'),
Entry(compiled='', raw='Entry with date:1984-04-02')
Entry(compiled="", raw="Entry with no date"),
Entry(compiled="", raw="April Fools entry: 1984-04-01"),
Entry(compiled="", raw="Entry with date:1984-04-02"),
]
q_with_no_date_filter = 'head tail'
q_with_no_date_filter = "head tail"
ret_query, entry_indices = DateFilter().apply(q_with_no_date_filter, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 1, 2}
q_with_dtrange_non_overlapping_at_boundary = 'head dt>"1984-04-01" dt<"1984-04-02" tail'
ret_query, entry_indices = DateFilter().apply(q_with_dtrange_non_overlapping_at_boundary, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == set()
query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<"1984-04-03" tail'
ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {2}
query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<"1984-04-02" tail'
ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {1}
query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<="1984-04-02" tail'
ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {2}
query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<="1984-04-02" tail'
ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries)
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {1, 2}
def test_extract_date_range():
    """extract_date_range should convert date filter terms into [start, end) timestamp pairs."""
    assert DateFilter().extract_date_range('head dt>"1984-01-04" dt<"1984-01-07" tail') == [
        datetime(1984, 1, 5, 0, 0, 0).timestamp(),
        datetime(1984, 1, 7, 0, 0, 0).timestamp(),
    ]
    assert DateFilter().extract_date_range('head dt<="1984-01-01"') == [0, datetime(1984, 1, 2, 0, 0, 0).timestamp()]
    assert DateFilter().extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf]
    assert DateFilter().extract_date_range('head dt:"1984-01-01"') == [
        datetime(1984, 1, 1, 0, 0, 0).timestamp(),
        datetime(1984, 1, 2, 0, 0, 0).timestamp(),
    ]

    # Unparseable date filter specified in query
    assert DateFilter().extract_date_range('head dt:"Summer of 69" tail') is None

    # No date filter specified in query
    assert DateFilter().extract_date_range("head tail") is None

    # Non intersecting date ranges
    assert DateFilter().extract_date_range('head dt>"1984-01-01" dt<"1984-01-01" tail') is None
@@ -66,43 +72,79 @@ def test_parse():
test_now = datetime(1984, 4, 1, 21, 21, 21)
# day variations
assert DateFilter().parse('today', relative_base=test_now) == (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0))
assert DateFilter().parse('tomorrow', relative_base=test_now) == (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0))
assert DateFilter().parse('yesterday', relative_base=test_now) == (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0))
assert DateFilter().parse('5 days ago', relative_base=test_now) == (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0))
assert DateFilter().parse("today", relative_base=test_now) == (
datetime(1984, 4, 1, 0, 0, 0),
datetime(1984, 4, 2, 0, 0, 0),
)
assert DateFilter().parse("tomorrow", relative_base=test_now) == (
datetime(1984, 4, 2, 0, 0, 0),
datetime(1984, 4, 3, 0, 0, 0),
)
assert DateFilter().parse("yesterday", relative_base=test_now) == (
datetime(1984, 3, 31, 0, 0, 0),
datetime(1984, 4, 1, 0, 0, 0),
)
assert DateFilter().parse("5 days ago", relative_base=test_now) == (
datetime(1984, 3, 27, 0, 0, 0),
datetime(1984, 3, 28, 0, 0, 0),
)
# week variations
assert DateFilter().parse('last week', relative_base=test_now) == (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0))
assert DateFilter().parse('2 weeks ago', relative_base=test_now) == (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0))
assert DateFilter().parse("last week", relative_base=test_now) == (
datetime(1984, 3, 18, 0, 0, 0),
datetime(1984, 3, 25, 0, 0, 0),
)
assert DateFilter().parse("2 weeks ago", relative_base=test_now) == (
datetime(1984, 3, 11, 0, 0, 0),
datetime(1984, 3, 18, 0, 0, 0),
)
# month variations
assert DateFilter().parse('next month', relative_base=test_now) == (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0))
assert DateFilter().parse('2 months ago', relative_base=test_now) == (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0))
assert DateFilter().parse("next month", relative_base=test_now) == (
datetime(1984, 5, 1, 0, 0, 0),
datetime(1984, 6, 1, 0, 0, 0),
)
assert DateFilter().parse("2 months ago", relative_base=test_now) == (
datetime(1984, 2, 1, 0, 0, 0),
datetime(1984, 3, 1, 0, 0, 0),
)
# year variations
assert DateFilter().parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0))
assert DateFilter().parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0))
assert DateFilter().parse("this year", relative_base=test_now) == (
datetime(1984, 1, 1, 0, 0, 0),
datetime(1985, 1, 1, 0, 0, 0),
)
assert DateFilter().parse("20 years later", relative_base=test_now) == (
datetime(2004, 1, 1, 0, 0, 0),
datetime(2005, 1, 1, 0, 0, 0),
)
# specific month/date variation
assert DateFilter().parse('in august', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0))
assert DateFilter().parse('on 1983-08-01', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0))
assert DateFilter().parse("in august", relative_base=test_now) == (
datetime(1983, 8, 1, 0, 0, 0),
datetime(1983, 8, 2, 0, 0, 0),
)
assert DateFilter().parse("on 1983-08-01", relative_base=test_now) == (
datetime(1983, 8, 1, 0, 0, 0),
datetime(1983, 8, 2, 0, 0, 0),
)
def test_date_filter_regex():
    """The date regex should capture (operator, date-string) pairs anywhere in the query."""
    dtrange_match = re.findall(DateFilter().date_regex, 'multi word head dt>"today" dt:"1984-01-01"')
    assert dtrange_match == [(">", "today"), (":", "1984-01-01")]

    dtrange_match = re.findall(DateFilter().date_regex, 'head dt>"today" dt:"1984-01-01" multi word tail')
    assert dtrange_match == [(">", "today"), (":", "1984-01-01")]

    dtrange_match = re.findall(DateFilter().date_regex, 'multi word head dt>="today" dt="1984-01-01"')
    assert dtrange_match == [(">=", "today"), ("=", "1984-01-01")]

    # Quoted date strings may contain spaces
    dtrange_match = re.findall(DateFilter().date_regex, 'dt<"multi word date" multi word tail')
    assert dtrange_match == [("<", "multi word date")]

    dtrange_match = re.findall(DateFilter().date_regex, 'head dt<="multi word date"')
    assert dtrange_match == [("<=", "multi word date")]

    # No date terms present
    dtrange_match = re.findall(DateFilter().date_regex, "head tail")
    assert dtrange_match == []

View File

@@ -7,7 +7,7 @@ def test_no_file_filter():
# Arrange
file_filter = FileFilter()
entries = arrange_content()
q_with_no_filter = 'head tail'
q_with_no_filter = "head tail"
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
@@ -15,7 +15,7 @@ def test_no_file_filter():
# Assert
assert can_filter == False
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 1, 2, 3}
@@ -31,7 +31,7 @@ def test_file_filter_with_non_existent_file():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {}
@@ -47,7 +47,7 @@ def test_single_file_filter():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 2}
@@ -63,7 +63,7 @@ def test_file_filter_with_partial_match():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 2}
@@ -79,7 +79,7 @@ def test_file_filter_with_regex_match():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 1, 2, 3}
@@ -95,16 +95,16 @@ def test_multiple_file_filter():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 1, 2, 3}
def arrange_content():
entries = [
Entry(compiled='', raw='First Entry', file= 'file 1.org'),
Entry(compiled='', raw='Second Entry', file= 'file2.org'),
Entry(compiled='', raw='Third Entry', file= 'file 1.org'),
Entry(compiled='', raw='Fourth Entry', file= 'file2.org')
Entry(compiled="", raw="First Entry", file="file 1.org"),
Entry(compiled="", raw="Second Entry", file="file2.org"),
Entry(compiled="", raw="Third Entry", file="file 1.org"),
Entry(compiled="", raw="Fourth Entry", file="file2.org"),
]
return entries

View File

@@ -1,5 +1,6 @@
from khoj.utils import helpers
def test_get_from_null_dict():
# null handling
assert helpers.get_from_dict(dict()) == dict()
@@ -7,39 +8,39 @@ def test_get_from_null_dict():
# key present in nested dictionary
# 1-level dictionary
assert helpers.get_from_dict({'a': 1, 'b': 2}, 'a') == 1
assert helpers.get_from_dict({'a': 1, 'b': 2}, 'c') == None
assert helpers.get_from_dict({"a": 1, "b": 2}, "a") == 1
assert helpers.get_from_dict({"a": 1, "b": 2}, "c") == None
# 2-level dictionary
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'a') == {'a_a': 1}
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'a', 'a_a') == 1
assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a") == {"a_a": 1}
assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a", "a_a") == 1
# key not present in nested dictionary
# 2-level_dictionary
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'b', 'b_a') == None
assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "b", "b_a") == None
def test_merge_dicts():
# basic merge of dicts with non-overlapping keys
assert helpers.merge_dicts(priority_dict={'a': 1}, default_dict={'b': 2}) == {'a': 1, 'b': 2}
assert helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"b": 2}) == {"a": 1, "b": 2}
# use default dict items when not present in priority dict
assert helpers.merge_dicts(priority_dict={}, default_dict={'b': 2}) == {'b': 2}
assert helpers.merge_dicts(priority_dict={}, default_dict={"b": 2}) == {"b": 2}
# do not override existing key in priority_dict with default dict
assert helpers.merge_dicts(priority_dict={'a': 1}, default_dict={'a': 2}) == {'a': 1}
assert helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"a": 2}) == {"a": 1}
def test_lru_cache():
# Test initializing cache
cache = helpers.LRU({'a': 1, 'b': 2}, capacity=2)
assert cache == {'a': 1, 'b': 2}
cache = helpers.LRU({"a": 1, "b": 2}, capacity=2)
assert cache == {"a": 1, "b": 2}
# Test capacity overflow
cache['c'] = 3
assert cache == {'b': 2, 'c': 3}
cache["c"] = 3
assert cache == {"b": 2, "c": 3}
# Test delete least recently used item from LRU cache on capacity overflow
cache['b'] # accessing 'b' makes it the most recently used item
cache['d'] = 4 # so 'c' is deleted from the cache instead of 'b'
assert cache == {'b': 2, 'd': 4}
cache["b"] # accessing 'b' makes it the most recently used item
cache["d"] = 4 # so 'c' is deleted from the cache instead of 'b'
assert cache == {"b": 2, "d": 4}

View File

@@ -30,7 +30,8 @@ def test_image_metadata(content_config: ContentConfig):
expected_metadata_image_name_pairs = [
(["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"),
(["Pasture.", "Horse", "Dog"], "horse_dog.jpg"),
(["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg")]
(["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg"),
]
test_image_paths = [
Path(content_config.image.input_directories[0] / image_name[1])
@@ -51,23 +52,23 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
# Arrange
output_directory = resolve_absolute_path(web_directory)
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
query_expected_image_pairs = [("kitten", "kitten_park.jpg"),
("horse and dog in a farm", "horse_dog.jpg"),
("A guinea pig eating grass", "guineapig_grass.jpg")]
query_expected_image_pairs = [
("kitten", "kitten_park.jpg"),
("horse and dog in a farm", "horse_dog.jpg"),
("A guinea pig eating grass", "guineapig_grass.jpg"),
]
# Act
for query, expected_image_name in query_expected_image_pairs:
hits = image_search.query(
query,
count = 1,
model = model.image_search)
hits = image_search.query(query, count=1, model=model.image_search)
results = image_search.collate_results(
hits,
model.image_search.image_names,
output_directory=output_directory,
image_files_url='/static/images',
count=1)
image_files_url="/static/images",
count=1,
)
actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
actual_image = Image.open(actual_image_path)
@@ -86,16 +87,13 @@ def test_image_search_query_truncated(content_config: ContentConfig, search_conf
# Arrange
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
max_words_supported = 10
query = " ".join(["hello"]*100)
truncated_query = " ".join(["hello"]*max_words_supported)
query = " ".join(["hello"] * 100)
truncated_query = " ".join(["hello"] * max_words_supported)
# Act
try:
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
image_search.query(
query,
count = 1,
model = model.image_search)
image_search.query(query, count=1, model=model.image_search)
# Assert
except RuntimeError as e:
if "The size of tensor a (102) must match the size of tensor b (77)" in str(e):
@@ -115,17 +113,15 @@ def test_image_search_by_filepath(content_config: ContentConfig, search_config:
# Act
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
hits = image_search.query(
query,
count = 1,
model = model.image_search)
hits = image_search.query(query, count=1, model=model.image_search)
results = image_search.collate_results(
hits,
model.image_search.image_names,
output_directory=output_directory,
image_files_url='/static/images',
count=1)
image_files_url="/static/images",
count=1,
)
actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
actual_image = Image.open(actual_image_path)
@@ -133,7 +129,9 @@ def test_image_search_by_filepath(content_config: ContentConfig, search_config:
# Assert
# Ensure file search triggered instead of query with file path as string
assert f"Find Images by Image: {resolve_absolute_path(expected_image_path)}" in caplog.text, "File search not triggered"
assert (
f"Find Images by Image: {resolve_absolute_path(expected_image_path)}" in caplog.text
), "File search not triggered"
# Ensure the correct image is returned
assert expected_image == actual_image, "Incorrect image returned by file search"

View File

@@ -8,10 +8,10 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
"Convert files with no heading to jsonl."
# Arrange
entry = f'''
entry = f"""
- Bullet point 1
- Bullet point 2
'''
"""
markdownfile = create_file(tmp_path, entry)
# Act
@@ -20,7 +20,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -30,10 +31,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
def test_single_markdown_entry_to_jsonl(tmp_path):
"Convert markdown entry from single file to jsonl."
# Arrange
entry = f'''### Heading
entry = f"""### Heading
\t\r
Body Line 1
'''
"""
markdownfile = create_file(tmp_path, entry)
# Act
@@ -42,7 +43,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -52,14 +54,14 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
def test_multiple_markdown_entries_to_jsonl(tmp_path):
"Convert multiple markdown entries from single file to jsonl."
# Arrange
entry = f'''
entry = f"""
### Heading 1
\t\r
Heading 1 Body Line 1
### Heading 2
\t\r
Heading 2 Body Line 2
'''
"""
markdownfile = create_file(tmp_path, entry)
# Act
@@ -68,7 +70,8 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -92,8 +95,8 @@ def test_get_markdown_files(tmp_path):
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
# Setup input-files, input-filters
input_files = [tmp_path / 'notes.md']
input_filter = [tmp_path / 'group1*.md', tmp_path / 'group2*.markdown']
input_files = [tmp_path / "notes.md"]
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
# Act
extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)
@@ -106,10 +109,10 @@ def test_get_markdown_files(tmp_path):
def test_extract_entries_with_different_level_headings(tmp_path):
"Extract markdown entries with different level headings."
# Arrange
entry = f'''
entry = f"""
# Heading 1
## Heading 2
'''
"""
markdownfile = create_file(tmp_path, entry)
# Act

View File

@@ -9,23 +9,25 @@ from khoj.utils.rawconfig import Entry
def test_configure_heading_entry_to_jsonl(tmp_path):
'''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries.
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
"""Ensure entries with empty body are ignored, unless explicitly configured to index heading entries.
Property drawers not considered Body. Ignore control characters for evaluating if Body empty."""
# Arrange
entry = f'''*** Heading
entry = f"""*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t \r
'''
"""
orgfile = create_file(tmp_path, entry)
for index_heading_entries in [True, False]:
# Act
# Extract entries into jsonl from specified Org files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(
*OrgToJsonl.extract_org_entries(org_files=[orgfile]),
index_heading_entries=index_heading_entries))
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
OrgToJsonl.convert_org_nodes_to_entries(
*OrgToJsonl.extract_org_entries(org_files=[orgfile]), index_heading_entries=index_heading_entries
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -40,10 +42,10 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
def test_entry_split_when_exceeds_max_words(tmp_path):
"Ensure entries with compiled words exceeding max_words are split."
# Arrange
entry = f'''*** Heading
entry = f"""*** Heading
\t\r
Body Line 1
'''
"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -53,9 +55,9 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Split each entry from specified Org files by max words
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
TextToJsonl.split_entries_by_max_tokens(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
max_tokens = 2)
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -65,15 +67,15 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
def test_entry_split_drops_large_words(tmp_path):
"Ensure entries drops words larger than specified max word length from compiled version."
# Arrange
entry_text = f'''*** Heading
entry_text = f"""*** Heading
\t\r
Body Line 1
'''
"""
entry = Entry(raw=entry_text, compiled=entry_text)
# Act
# Split entry by max words and drop words larger than max word length
processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length = 5)[0]
processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length=5)[0]
# Assert
# "Heading" dropped from compiled version because its over the set max word limit
@@ -83,13 +85,13 @@ def test_entry_split_drops_large_words(tmp_path):
def test_entry_with_body_to_jsonl(tmp_path):
"Ensure entries with valid body text are loaded."
# Arrange
entry = f'''*** Heading
entry = f"""*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t\r
Body Line 1
'''
"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -97,7 +99,9 @@ def test_entry_with_body_to_jsonl(tmp_path):
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
# Process Each Entry from All Notes Files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map))
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -107,10 +111,10 @@ def test_entry_with_body_to_jsonl(tmp_path):
def test_file_with_no_headings_to_jsonl(tmp_path):
"Ensure files with no heading, only body text are loaded."
# Arrange
entry = f'''
entry = f"""
- Bullet point 1
- Bullet point 2
'''
"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -120,7 +124,7 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
# Process Each Entry from All Notes Files
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
assert len(jsonl_data) == 1
@@ -143,8 +147,8 @@ def test_get_org_files(tmp_path):
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
# Setup input-files, input-filters
input_files = [tmp_path / 'orgfile1.org']
input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org']
input_files = [tmp_path / "orgfile1.org"]
input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
# Act
extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)
@@ -157,10 +161,10 @@ def test_get_org_files(tmp_path):
def test_extract_entries_with_different_level_headings(tmp_path):
"Extract org entries with different level headings."
# Arrange
entry = f'''
entry = f"""
* Heading 1
** Heading 2
'''
"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -169,8 +173,8 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Assert
assert len(entries) == 2
assert f'{entries[0]}'.startswith("* Heading 1")
assert f'{entries[1]}'.startswith("** Heading 2")
assert f"{entries[0]}".startswith("* Heading 1")
assert f"{entries[1]}".startswith("** Heading 2")
# Helper Functions

View File

@@ -10,7 +10,7 @@ from khoj.processor.org_mode import orgnode
def test_parse_entry_with_no_headings(tmp_path):
"Test parsing of entry with minimal fields"
# Arrange
entry = f'''Body Line 1'''
entry = f"""Body Line 1"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -18,7 +18,7 @@ def test_parse_entry_with_no_headings(tmp_path):
# Assert
assert len(entries) == 1
assert entries[0].heading == f'{orgfile}'
assert entries[0].heading == f"{orgfile}"
assert entries[0].tags == list()
assert entries[0].body == "Body Line 1"
assert entries[0].priority == ""
@@ -32,9 +32,9 @@ def test_parse_entry_with_no_headings(tmp_path):
def test_parse_minimal_entry(tmp_path):
"Test parsing of entry with minimal fields"
# Arrange
entry = f'''
entry = f"""
* Heading
Body Line 1'''
Body Line 1"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -56,7 +56,7 @@ Body Line 1'''
def test_parse_complete_entry(tmp_path):
"Test parsing of entry with all important fields"
# Arrange
entry = f'''
entry = f"""
*** DONE [#A] Heading :Tag1:TAG2:tag3:
CLOSED: [1984-04-01 Sun 12:00] SCHEDULED: <1984-04-01 Sun 09:00> DEADLINE: <1984-04-01 Sun>
:PROPERTIES:
@@ -67,7 +67,7 @@ CLOCK: [1984-04-01 Sun 09:00]--[1984-04-01 Sun 12:00] => 3:00
- Clocked Log 1
:END:
Body Line 1
Body Line 2'''
Body Line 2"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -81,45 +81,45 @@ Body Line 2'''
assert entries[0].body == "- Clocked Log 1\nBody Line 1\nBody Line 2"
assert entries[0].priority == "A"
assert entries[0].Property("ID") == "id:123-456-789-4234-1231"
assert entries[0].closed == datetime.date(1984,4,1)
assert entries[0].scheduled == datetime.date(1984,4,1)
assert entries[0].deadline == datetime.date(1984,4,1)
assert entries[0].logbook == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))]
assert entries[0].closed == datetime.date(1984, 4, 1)
assert entries[0].scheduled == datetime.date(1984, 4, 1)
assert entries[0].deadline == datetime.date(1984, 4, 1)
assert entries[0].logbook == [(datetime.datetime(1984, 4, 1, 9, 0, 0), datetime.datetime(1984, 4, 1, 12, 0, 0))]
# ----------------------------------------------------------------------------------------------------
def test_render_entry_with_property_drawer_and_empty_body(tmp_path):
"Render heading entry with property drawer"
# Arrange
entry_to_render = f'''
entry_to_render = f"""
*** [#A] Heading1 :tag1:
:PROPERTIES:
:ID: 111-111-111-1111-1111
:END:
\t\r \n
'''
"""
orgfile = create_file(tmp_path, entry_to_render)
expected_entry = f'''*** [#A] Heading1 :tag1:
expected_entry = f"""*** [#A] Heading1 :tag1:
:PROPERTIES:
:LINE: file:{orgfile}::2
:ID: id:111-111-111-1111-1111
:SOURCE: [[file:{orgfile}::*Heading1]]
:END:
'''
"""
# Act
parsed_entries = orgnode.makelist(orgfile)
# Assert
assert f'{parsed_entries[0]}' == expected_entry
assert f"{parsed_entries[0]}" == expected_entry
# ----------------------------------------------------------------------------------------------------
def test_all_links_to_entry_rendered(tmp_path):
"Ensure all links to entry rendered in property drawer from entry"
# Arrange
entry = f'''
entry = f"""
*** [#A] Heading :tag1:
:PROPERTIES:
:ID: 123-456-789-4234-1231
@@ -127,7 +127,7 @@ def test_all_links_to_entry_rendered(tmp_path):
Body Line 1
*** Heading2
Body Line 2
'''
"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -135,23 +135,23 @@ Body Line 2
# Assert
# SOURCE link rendered with Heading
assert f':SOURCE: [[file:{orgfile}::*{entries[0].heading}]]' in f'{entries[0]}'
assert f":SOURCE: [[file:{orgfile}::*{entries[0].heading}]]" in f"{entries[0]}"
# ID link rendered with ID
assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}'
assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}"
# LINE link rendered with line number
assert f':LINE: file:{orgfile}::2' in f'{entries[0]}'
assert f":LINE: file:{orgfile}::2" in f"{entries[0]}"
# ----------------------------------------------------------------------------------------------------
def test_source_link_to_entry_escaped_for_rendering(tmp_path):
"Test SOURCE link renders with square brackets in filename, heading escaped for org-mode rendering"
# Arrange
entry = f'''
entry = f"""
*** [#A] Heading[1] :tag1:
:PROPERTIES:
:ID: 123-456-789-4234-1231
:END:
Body Line 1'''
Body Line 1"""
orgfile = create_file(tmp_path, entry, filename="test[1].org")
# Act
@@ -162,15 +162,15 @@ Body Line 1'''
# parsed heading from entry
assert entries[0].heading == "Heading[1]"
# ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]")
assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}'
escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]" in f"{entries[0]}"
# ----------------------------------------------------------------------------------------------------
def test_parse_multiple_entries(tmp_path):
"Test parsing of multiple entries"
# Arrange
content = f'''
content = f"""
*** FAILED [#A] Heading1 :tag1:
CLOSED: [1984-04-01 Sun 12:00] SCHEDULED: <1984-04-01 Sun 09:00> DEADLINE: <1984-04-01 Sun>
:PROPERTIES:
@@ -193,7 +193,7 @@ CLOCK: [1984-04-02 Mon 09:00]--[1984-04-02 Mon 12:00] => 3:00
:END:
Body 2
'''
"""
orgfile = create_file(tmp_path, content)
# Act
@@ -208,18 +208,20 @@ Body 2
assert entry.body == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
assert entry.priority == "A"
assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"
assert entry.closed == datetime.date(1984,4,index+1)
assert entry.scheduled == datetime.date(1984,4,index+1)
assert entry.deadline == datetime.date(1984,4,index+1)
assert entry.logbook == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]
assert entry.closed == datetime.date(1984, 4, index + 1)
assert entry.scheduled == datetime.date(1984, 4, index + 1)
assert entry.deadline == datetime.date(1984, 4, index + 1)
assert entry.logbook == [
(datetime.datetime(1984, 4, index + 1, 9, 0, 0), datetime.datetime(1984, 4, index + 1, 12, 0, 0))
]
# ----------------------------------------------------------------------------------------------------
def test_parse_entry_with_empty_title(tmp_path):
"Test parsing of entry with minimal fields"
# Arrange
entry = f'''#+TITLE:
Body Line 1'''
entry = f"""#+TITLE:
Body Line 1"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -227,7 +229,7 @@ Body Line 1'''
# Assert
assert len(entries) == 1
assert entries[0].heading == f'{orgfile}'
assert entries[0].heading == f"{orgfile}"
assert entries[0].tags == list()
assert entries[0].body == "Body Line 1"
assert entries[0].priority == ""
@@ -241,8 +243,8 @@ Body Line 1'''
def test_parse_entry_with_title_and_no_headings(tmp_path):
"Test parsing of entry with minimal fields"
# Arrange
entry = f'''#+TITLE: test
Body Line 1'''
entry = f"""#+TITLE: test
Body Line 1"""
orgfile = create_file(tmp_path, entry)
# Act
@@ -250,7 +252,7 @@ Body Line 1'''
# Assert
assert len(entries) == 1
assert entries[0].heading == 'test'
assert entries[0].heading == "test"
assert entries[0].tags == list()
assert entries[0].body == "Body Line 1"
assert entries[0].priority == ""
@@ -264,9 +266,9 @@ Body Line 1'''
def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
"Test parsing of entry with minimal fields"
# Arrange
entry = f'''#+TITLE: title1
entry = f"""#+TITLE: title1
Body Line 1
#+TITLE: title2 '''
#+TITLE: title2 """
orgfile = create_file(tmp_path, entry)
# Act
@@ -274,7 +276,7 @@ Body Line 1
# Assert
assert len(entries) == 1
assert entries[0].heading == 'title1 title2'
assert entries[0].heading == "title1 title2"
assert entries[0].tags == list()
assert entries[0].body == "Body Line 1\n"
assert entries[0].priority == ""

View File

@@ -14,7 +14,9 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
# Test
# ----------------------------------------------------------------------------------------------------
def test_asymmetric_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig):
def test_asymmetric_setup_with_missing_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
# Arrange
# Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0])
@@ -27,10 +29,12 @@ def test_asymmetric_setup_with_missing_file_raises_error(org_config_with_only_ne
# ----------------------------------------------------------------------------------------------------
def test_asymmetric_setup_with_empty_file_raises_error(org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig):
def test_asymmetric_setup_with_empty_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(ValueError, match=r'^No valid entries found*'):
with pytest.raises(ValueError, match=r"^No valid entries found*"):
text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
@@ -52,15 +56,9 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
query = "How to git install application?"
# Act
hits, entries = text_search.query(
query,
model = model.notes_search,
rank_results=True)
hits, entries = text_search.query(query, model=model.notes_search, rank_results=True)
results = text_search.collate_results(
hits,
entries,
count=1)
results = text_search.collate_results(hits, entries, count=1)
# Assert
# Actual_data should contain "Khoj via Emacs" entry
@@ -76,12 +74,14 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
with open(new_file_to_index, "w") as f:
f.write(f"* Entry more than {max_tokens} words\n")
for index in range(max_tokens+1):
for index in range(max_tokens + 1):
f.write(f"{index} ")
# Act
# reload embeddings, entries, notes model after adding new org-mode file
initial_notes_model = text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=False)
initial_notes_model = text_search.setup(
OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=False
)
# Assert
# verify newly added org-mode entry is split by max tokens
@@ -92,18 +92,20 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
# ----------------------------------------------------------------------------------------------------
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig, new_org_file: Path):
# Arrange
initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
assert len(initial_notes_model.entries) == 10
assert len(initial_notes_model.corpus_embeddings) == 10
# append org-mode entry to first org input file in config
content_config.org.input_files = [f'{new_org_file}']
content_config.org.input_files = [f"{new_org_file}"]
with open(new_org_file, "w") as f:
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
# regenerate notes jsonl, model embeddings and model to include entry from new file
regenerated_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
regenerated_notes_model = text_search.setup(
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True
)
# Act
# reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
@@ -137,7 +139,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
# Act
# update embeddings, entries with the newly added note
content_config.org.input_files = [f'{new_org_file}']
content_config.org.input_files = [f"{new_org_file}"]
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
# Assert

View File

@@ -7,7 +7,7 @@ def test_no_word_filter():
# Arrange
word_filter = WordFilter()
entries = arrange_content()
q_with_no_filter = 'head tail'
q_with_no_filter = "head tail"
# Act
can_filter = word_filter.can_filter(q_with_no_filter)
@@ -15,7 +15,7 @@ def test_no_word_filter():
# Assert
assert can_filter == False
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 1, 2, 3}
@@ -31,7 +31,7 @@ def test_word_exclude_filter():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {0, 2}
@@ -47,7 +47,7 @@ def test_word_include_filter():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {2, 3}
@@ -63,16 +63,16 @@ def test_word_include_and_exclude_filter():
# Assert
assert can_filter == True
assert ret_query == 'head tail'
assert ret_query == "head tail"
assert entry_indices == {2}
def arrange_content():
entries = [
Entry(compiled='', raw='Minimal Entry'),
Entry(compiled='', raw='Entry with exclude_word'),
Entry(compiled='', raw='Entry with include_word'),
Entry(compiled='', raw='Entry with include_word and exclude_word')
Entry(compiled="", raw="Minimal Entry"),
Entry(compiled="", raw="Entry with exclude_word"),
Entry(compiled="", raw="Entry with include_word"),
Entry(compiled="", raw="Entry with include_word and exclude_word"),
]
return entries