Support incremental update of org-mode entries and embeddings

- What - Hash the entries and compare to find new/updated entries - Reuse embeddings encoded for existing entries - Only encode embeddings for updated or new entries - Merge the existing and new entries and embeddings to get the updated entries and embeddings - Why - Given that most note text entries are expected to be unchanged over time, reusing their earlier encoded embeddings should significantly speed up embeddings updates - Previously we were regenerating embeddings for all entries, even if they had existed in previous runs
2026-03-05 05:39:11 +00:00 · 2022-09-07 00:16:48 +03:00
parent 762607fc9f
commit 2f7a6af56a
5 changed files with 80 additions and 30 deletions
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)


 # Define Functions
-def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
+def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None):
    # Input Validation
    if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
        print("At least one of beancount-files or beancount-file-filter is required to be specified")
@@ -39,7 +39,7 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file)

-    return entries
+    return list(enumerate(entries))


 def get_beancount_files(beancount_files=None, beancount_file_filter=None):