Support incremental update of org-mode entries and embeddings

- What
  - Hash the entries and compare to find new/updated entries
  - Reuse embeddings encoded for existing entries
  - Only encode embeddings for updated or new entries
  - Merge the existing and new entries and embeddings to get the updated
    entries and embeddings

- Why
  - Most note text entries are expected to remain unchanged
    over time, so reusing their previously encoded embeddings
    should significantly speed up embeddings updates
  - Previously we were regenerating embeddings for all entries,
    even if they had existed in previous runs
This commit is contained in:
Debanjum Singh Solanky
2022-09-07 00:16:48 +03:00
parent 762607fc9f
commit 2f7a6af56a
5 changed files with 80 additions and 30 deletions

View File

@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
# Define Functions
def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None):
# Input Validation
if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
print("At least one of beancount-files or beancount-file-filter is required to be specified")
@@ -39,7 +39,7 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
return entries
return list(enumerate(entries))
def get_beancount_files(beancount_files=None, beancount_file_filter=None):