mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Use Base TextToJsonl class to standardize <text>_to_jsonl processors
- Start standardizing implementation of the `text_to_jsonl` processors
- `text_to_jsonl` scripts already had a shared structure
- This change starts to codify that implicit structure
- Benefits
- Ease adding more `text_to_jsonl` processors
- Allow merging shared functionality
- Help with type hinting
- Drawbacks
- Lower agility to change. But this was already an implicit issue as
the text_to_jsonl processors got more deeply wired into the app
This commit is contained in:
@@ -6,7 +6,7 @@ from src.search_type import image_search, text_search
|
||||
from src.utils.config import SearchType
|
||||
from src.utils.helpers import resolve_absolute_path
|
||||
from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from src.search_filter.date_filter import DateFilter
|
||||
from src.search_filter.word_filter import WordFilter
|
||||
from src.search_filter.file_filter import FileFilter
|
||||
@@ -60,6 +60,6 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
|
||||
embeddings_file = content_dir.joinpath('note_embeddings.pt'))
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
|
||||
return content_config
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import json
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl, get_beancount_files
|
||||
from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl
|
||||
|
||||
|
||||
def test_no_transactions_in_file(tmp_path):
|
||||
@@ -16,10 +16,11 @@ def test_no_transactions_in_file(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Beancount files
|
||||
entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
entry_nodes, file_to_entries = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
|
||||
# Process Each Entry from All Beancount Files
|
||||
jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries))
|
||||
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
|
||||
BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -38,10 +39,11 @@ Assets:Test:Test -1.00 KES
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Beancount files
|
||||
entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
|
||||
# Process Each Entry from All Beancount Files
|
||||
jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
|
||||
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
|
||||
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -65,10 +67,11 @@ Assets:Test:Test -1.00 KES
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Beancount files
|
||||
entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])
|
||||
|
||||
# Process Each Entry from All Beancount Files
|
||||
jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
|
||||
jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
|
||||
BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -96,7 +99,7 @@ def test_get_beancount_files(tmp_path):
|
||||
input_filter = [tmp_path / 'group1*.bean', tmp_path / 'group2*.beancount']
|
||||
|
||||
# Act
|
||||
extracted_org_files = get_beancount_files(input_files, input_filter)
|
||||
extracted_org_files = BeancountToJsonl.get_beancount_files(input_files, input_filter)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_org_files) == 5
|
||||
|
||||
@@ -12,7 +12,7 @@ from src.main import app
|
||||
from src.utils.state import model, config
|
||||
from src.search_type import text_search, image_search
|
||||
from src.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from src.search_filter.word_filter import WordFilter
|
||||
from src.search_filter.file_filter import FileFilter
|
||||
|
||||
@@ -118,7 +118,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
user_query = quote("How to git install application?")
|
||||
|
||||
# Act
|
||||
@@ -135,7 +135,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter(), FileFilter()]
|
||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
user_query = quote('+"Emacs" file:"*.org"')
|
||||
|
||||
# Act
|
||||
@@ -152,7 +152,7 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co
|
||||
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
user_query = quote('How to git install application? +"Emacs"')
|
||||
|
||||
# Act
|
||||
@@ -169,7 +169,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
|
||||
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
user_query = quote('How to git install application? -"clone"')
|
||||
|
||||
# Act
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import json
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps, get_markdown_files
|
||||
from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
|
||||
|
||||
def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
||||
@@ -16,10 +16,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile])
|
||||
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -37,10 +38,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
|
||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -62,10 +64,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
|
||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -93,7 +96,7 @@ def test_get_markdown_files(tmp_path):
|
||||
input_filter = [tmp_path / 'group1*.md', tmp_path / 'group2*.markdown']
|
||||
|
||||
# Act
|
||||
extracted_org_files = get_markdown_files(input_files, input_filter)
|
||||
extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_org_files) == 5
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import json
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries, get_org_files
|
||||
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from src.utils.helpers import is_none_or_empty
|
||||
|
||||
|
||||
@@ -21,8 +21,8 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
|
||||
for index_heading_entries in [True, False]:
|
||||
# Act
|
||||
# Extract entries into jsonl from specified Org files
|
||||
jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
|
||||
*extract_org_entries(org_files=[orgfile]),
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(
|
||||
*OrgToJsonl.extract_org_entries(org_files=[orgfile]),
|
||||
index_heading_entries=index_heading_entries))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
@@ -49,10 +49,10 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
|
||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -70,11 +70,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
|
||||
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = convert_org_entries_to_jsonl(entries)
|
||||
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
@@ -102,7 +102,7 @@ def test_get_org_files(tmp_path):
|
||||
input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org']
|
||||
|
||||
# Act
|
||||
extracted_org_files = get_org_files(input_files, input_filter)
|
||||
extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)
|
||||
|
||||
# Assert
|
||||
assert len(extracted_org_files) == 5
|
||||
|
||||
@@ -9,7 +9,7 @@ import pytest
|
||||
from src.utils.state import model
|
||||
from src.search_type import text_search
|
||||
from src.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
|
||||
|
||||
# Test
|
||||
@@ -24,7 +24,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(content_config: Content
|
||||
# Act
|
||||
# Generate notes embeddings during asymmetric setup
|
||||
with pytest.raises(FileNotFoundError):
|
||||
text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
|
||||
text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@@ -39,7 +39,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
|
||||
# Act
|
||||
# Generate notes embeddings during asymmetric setup
|
||||
with pytest.raises(ValueError, match=r'^No valid entries found*'):
|
||||
text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
|
||||
text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
|
||||
|
||||
# Cleanup
|
||||
# delete created test file
|
||||
@@ -50,7 +50,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
|
||||
def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Act
|
||||
# Regenerate notes embeddings during asymmetric setup
|
||||
notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
|
||||
# Assert
|
||||
assert len(notes_model.entries) == 10
|
||||
@@ -60,7 +60,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
model.notes_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
query = "How to git install application?"
|
||||
|
||||
# Act
|
||||
@@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
initial_notes_model= text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
assert len(initial_notes_model.entries) == 10
|
||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||
@@ -96,11 +96,11 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
|
||||
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
|
||||
|
||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||
regenerated_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
regenerated_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
|
||||
# Act
|
||||
# reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
|
||||
initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
# Assert
|
||||
assert len(regenerated_notes_model.entries) == 11
|
||||
@@ -119,7 +119,7 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
|
||||
assert len(initial_notes_model.entries) == 10
|
||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||
@@ -133,7 +133,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
|
||||
|
||||
# Act
|
||||
# update embeddings, entries with the newly added note
|
||||
initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
# verify new entry added in updated embeddings, entries
|
||||
assert len(initial_notes_model.entries) == 11
|
||||
|
||||
Reference in New Issue
Block a user