Index Parent Headings of Org-Mode Entries to Improve Search Context (#548)

### Overview
The parent hierarchy of org-mode entries can store important context. 
This change updates OrgNode to track the parent headings of each org entry and adds each entry's parent outline to the index.

### Details
- Test that search uses ancestor headings as context for improved results
- Add ancestor headings of each org-mode entry to their compiled form
- Track ancestor headings for each org-mode entry in org-node parser

Resolves #85
This commit is contained in:
Debanjum
2023-11-19 13:18:19 -08:00
committed by GitHub
10 changed files with 215 additions and 40 deletions

View File

@@ -384,6 +384,45 @@ def sample_org_data():
def get_sample_data(type):
sample_data = {
"org": {
"elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none"
""",
"readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
@@ -399,7 +438,7 @@ def get_sample_data(type):
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src"""
#+end_src""",
},
"markdown": {
"readme.markdown": """

View File

@@ -4,10 +4,9 @@
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
- Direct Install
** Install
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
@@ -16,7 +15,7 @@
:bind ("C-c s" . 'khoj))
#+end_src
- Use [[https://github.com/quelpa/quelpa#installation][Quelpa]]
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
@@ -28,17 +27,10 @@
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none"

View File

@@ -22,16 +22,16 @@
#+end_src
** Use
- *Khoj via Emacs*
*** *Khoj via Emacs*
- [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
- Run ~M-x khoj <user-query>~ or Call ~C-c C-s~
- *Khoj via API*
*** *Khoj via API*
- Query: ~GET~ [[http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:42110/api/search?q="What is the meaning of life"]]
- Update Index: ~GET~ [[http://localhost:42110/api/update][http://localhost:42110/api/update]]
- [[http://localhost:42110/docs][Khoj API Docs]]
- *Call Khoj via Python Script Directly*
*** *Call Khoj via Python Script Directly*
#+begin_src shell
python3 search_types/asymmetric.py \
--compressed-jsonl .notes.jsonl.gz \

View File

@@ -321,7 +321,7 @@ def test_notes_search_with_include_filter(client, sample_org_data, default_user:
assert response.status_code == 200
# assert actual_data contains word "Emacs"
search_result = response.json()[0]["entry"]
assert "Emacs" in search_result
assert "emacs" in search_result
# ----------------------------------------------------------------------------------------------------
@@ -347,6 +347,27 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
assert "clone" not in search_result
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search_requires_parent_context(
    client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
):
    """Search for "Install Khoj on Emacs" and expect the single hit to mention "Emacs load path".

    The test name indicates the match is meant to rely on the entry's parent
    headings being part of the indexed text.
    """
    # Arrange: index the sample org data for the user and build the request
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    encoded_query = quote("Install Khoj on Emacs")
    search_url = f"/api/search?q={encoded_query}&n=1&t=org&r=true&max_distance=0.18"
    auth_headers = {"Authorization": "Bearer kk-secret"}

    # Act
    response = client.get(search_url, headers=auth_headers)

    # Assert
    assert response.status_code == 200
    results = response.json()
    assert len(results) == 1, "Expected only 1 result"
    assert "Emacs load path" in results[0]["entry"], "Expected 'Emacs load path' in search result"
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):

View File

@@ -69,7 +69,7 @@ def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUs
# Assert
assert update_response.status_code == 200
assert len(results) == 4
assert len(results) == 5
for result in results:
assert result["additional"]["file"] not in source_file_symbol

View File

@@ -45,9 +45,10 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
assert is_none_or_empty(jsonl_data)
def test_entry_split_when_exceeds_max_words(tmp_path):
def test_entry_split_when_exceeds_max_words():
"Ensure entries with compiled words exceeding max_words are split."
# Arrange
tmp_path = "/tmp/test.org"
entry = f"""*** Heading
\t\r
Body Line
@@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
data = {
f"{tmp_path}": entry,
}
expected_heading = f"* {tmp_path.stem}\n** Heading"
expected_heading = f"* Path: {tmp_path}\n** Heading"
# Act
# Extract Entries from specified Org files

View File

@@ -161,6 +161,8 @@ Body Line 1"""
assert len(entries) == 1
# parsed heading from entry
assert entries[0].heading == "Heading[1]"
# track ancestors of entry
assert entries[0].ancestors == [f"{orgfile}"]
# ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}"
@@ -260,6 +262,7 @@ Body Line 1"""
assert entries[0].closed == ""
assert entries[0].scheduled == ""
assert entries[0].deadline == ""
assert entries[0].ancestors == ["test"]
# ----------------------------------------------------------------------------------------------------
@@ -284,6 +287,7 @@ Body Line 1
assert entries[0].closed == ""
assert entries[0].scheduled == ""
assert entries[0].deadline == ""
assert entries[0].ancestors == ["title1 title2"]
# ----------------------------------------------------------------------------------------------------
@@ -304,8 +308,10 @@ entry body
assert len(entries) == 2
assert entries[0].heading == "Title"
assert entries[0].body == "intro body\n"
assert entries[0].ancestors == ["Title"]
assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n\n"
assert entries[1].ancestors == ["Title"]
# ----------------------------------------------------------------------------------------------------
@@ -326,8 +332,93 @@ entry body
# The title entry and the heading entry that follows it should each report
# the joined document title as their only ancestor.
assert len(entries) == 2
assert entries[0].heading == "Title1 Title2"
assert entries[0].body == "intro body\n"
assert entries[0].ancestors == ["Title1 Title2"]
assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n\n"
# Check the second entry here; the original re-checked entries[0] by mistake.
assert entries[1].ancestors == ["Title1 Title2"]
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_single_ancestor_heading(tmp_path):
    """A sub-heading should list the file and its parent heading as ancestors."""
    # Arrange
    org_content = """
* Heading 1
body 1
** Sub Heading 1
"""
    orgfile = create_file(tmp_path, org_content)
    file_ancestor = f"{orgfile}"
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 2
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_multiple_ancestor_headings(tmp_path):
    """Each nested heading should list the file plus every enclosing heading, outermost first."""
    # Arrange
    org_content = """
* Heading 1
body 1
** Sub Heading 1
*** Sub Sub Heading 1
sub sub body 1
"""
    orgfile = create_file(tmp_path, org_content)
    file_ancestor = f"{orgfile}"
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 1", [file_ancestor, "Heading 1", "Sub Heading 1"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 3
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_multiple_ancestor_headings_of_siblings(tmp_path):
    """Sibling headings share ancestors; entries under a later sibling list that sibling instead."""
    # Arrange
    org_content = """
* Heading 1
body 1
** Sub Heading 1
*** Sub Sub Heading 1
sub sub body 1
*** Sub Sub Heading 2
** Sub Heading 2
*** Sub Sub Heading 3
"""
    orgfile = create_file(tmp_path, org_content)
    file_ancestor = f"{orgfile}"
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 1", [file_ancestor, "Heading 1", "Sub Heading 1"]),
        ("Sub Sub Heading 2", [file_ancestor, "Heading 1", "Sub Heading 1"]),
        ("Sub Heading 2", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 3", [file_ancestor, "Heading 1", "Sub Heading 2"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 6
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# Helper Functions

View File

@@ -70,7 +70,7 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert
assert "Deleted 3 entries. Created 0 new entries for user " in caplog.records[-1].message
assert "Deleted 8 entries. Created 0 new entries for user " in caplog.records[-1].message
verify_embeddings(0, default_user)
@@ -90,7 +90,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
# Assert
assert "Deleting all entries for file type org" in caplog.text
assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message
assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
# ----------------------------------------------------------------------------------------------------
@@ -106,7 +106,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert
assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message
assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
# ----------------------------------------------------------------------------------------------------
@@ -161,7 +161,7 @@ async def test_text_search(search_config: SearchConfig):
default_user,
)
query = "How to git install application?"
query = "Load Khoj on Emacs?"
# Act
hits = await text_search.query(default_user, query)
@@ -170,7 +170,7 @@ async def test_text_search(search_config: SearchConfig):
# Assert
search_result = results[0].entry
assert "git clone" in search_result, 'search result did not contain "git clone" entry'
assert "Emacs load path" in search_result, 'Expected "Emacs load path" in entry'
# ----------------------------------------------------------------------------------------------------
@@ -284,9 +284,9 @@ def test_regenerate_index_with_new_entry(
final_logs = caplog.text
# Assert
assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs
assert "Deleted 10 entries. Created 11 new entries for user " in final_logs
verify_embeddings(11, default_user)
assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
assert "Deleted 13 entries. Created 14 new entries for user " in final_logs
verify_embeddings(14, default_user)
# ----------------------------------------------------------------------------------------------------
@@ -320,7 +320,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
# Assert
# verify only 1 entry added even if there are multiple duplicate entries
assert "Deleted 3 entries. Created 1 new entries for user " in initial_logs
assert "Deleted 8 entries. Created 1 new entries for user " in initial_logs
assert "Deleted 0 entries. Created 0 new entries for user " in final_logs
verify_embeddings(1, default_user)
@@ -357,7 +357,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
# Assert
# verify only 1 entry added even if there are multiple duplicate entries
assert "Deleted 3 entries. Created 2 new entries for user " in initial_logs
assert "Deleted 8 entries. Created 2 new entries for user " in initial_logs
assert "Deleted 1 entries. Created 0 new entries for user " in final_logs
verify_embeddings(1, default_user)
@@ -388,9 +388,9 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
final_logs = caplog.text
# Assert
assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs
assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
assert "Deleted 0 entries. Created 1 new entries for user " in final_logs
verify_embeddings(11, default_user)
verify_embeddings(14, default_user)
# ----------------------------------------------------------------------------------------------------