Support incremental update of org-mode entries and embeddings

- What
  - Hash the entries and compare to find new/updated entries
  - Reuse embeddings encoded for existing entries
  - Only encode embeddings for updated or new entries
  - Merge the existing and new entries and embeddings to get the updated
    entries and embeddings

- Why
  - Most note text entries are expected to be unchanged across time,
    so reusing their previously encoded embeddings should
    significantly speed up embeddings updates
  - Previously we were regenerating embeddings for all entries,
    even if they had existed in previous runs
This commit is contained in:
Debanjum Singh Solanky
2022-09-07 00:16:48 +03:00
parent 762607fc9f
commit 2f7a6af56a
5 changed files with 80 additions and 30 deletions

View File

@@ -3,7 +3,7 @@ import json
from posixpath import split
# Internal Packages
from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries
from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries
from src.utils.helpers import is_none_or_empty
@@ -21,10 +21,11 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
# Process Each Entry from All Notes Files
jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map)
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_data = convert_org_entries_to_jsonl(entries)
# Assert
assert is_none_or_empty(jsonl_data)
@@ -43,10 +44,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
# Process Each Entry from All Notes Files
jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = convert_org_entries_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert