diff --git a/Readme.md b/Readme.md index 301e2369..c47e07a5 100644 --- a/Readme.md +++ b/Readme.md @@ -131,7 +131,7 @@ pip install --upgrade khoj-assistant - Indexing is more strongly impacted by the size of the source data - Indexing 100K+ line corpus of notes takes about 10 minutes - Indexing 4000+ images takes about 15 minutes and more than 8Gb of RAM -- Once is implemented, it should only take this long on first run +- Note: *It should only take this long on the first run* as the index is incrementally updated ### Miscellaneous diff --git a/src/configure.py b/src/configure.py index f6476951..ed46af37 100644 --- a/src/configure.py +++ b/src/configure.py @@ -48,11 +48,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Org Music Search if (t == SearchType.Music or t == None) and config.content_type.music: @@ -71,11 +67,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Ledger Search if (t == SearchType.Ledger or t == None) and config.content_type.ledger: @@ -85,11 +77,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Image Search if (t == SearchType.Image or t == None) and config.content_type.image: diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index c0136bc6..3af45c7a 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -7,9 +7,10 @@ import pathlib import glob import re import logging +import time # Internal Packages -from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data @@ -18,7 +19,7 @@ logger = logging.getLogger(__name__) # Define Functions -def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file): +def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None): # Input Validation if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter): print("At least one of beancount-files or beancount-file-filter is required to be specified") @@ -28,18 +29,34 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file): beancount_files = get_beancount_files(beancount_files, beancount_file_filter) # Extract Entries from specified Beancount files - entries, transaction_to_file_map = extract_beancount_entries(beancount_files) + start = time.time() + current_entries = 
convert_transactions_to_maps(*extract_beancount_transactions(beancount_files)) + end = time.time() + logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") + + # Identify, mark and merge any new entries with previous entries + start = time.time() + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + end = time.time() + logger.debug(f"Identify new or updated transaction: {end - start} seconds") # Process Each Entry from All Notes Files - jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map) + start = time.time() + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = convert_transaction_maps_to_jsonl(entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": compress_jsonl_data(jsonl_data, output_file) elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write transactions to JSONL file: {end - start} seconds") - return entries + return entries_with_ids def get_beancount_files(beancount_files=None, beancount_file_filter=None): @@ -66,12 +83,12 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None): return all_beancount_files -def extract_beancount_entries(beancount_files): +def extract_beancount_transactions(beancount_files): "Extract entries from specified Beancount files" # Initialize Regex for extracting Beancount Entries transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] ' - empty_newline = f'^[{empty_escape_sequences}]*$' + empty_newline = f'^[\n\r\t\ ]*$' entries = [] transaction_to_file_map = [] @@ -82,22 +99,25 @@ def extract_beancount_entries(beancount_files): for entry in re.split(empty_newline, ledger_content, flags=re.MULTILINE) if re.match(transaction_regex, entry)] - transaction_to_file_map += [beancount_file]*len(transactions_per_file) + transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file)) entries.extend(transactions_per_file) - return entries, transaction_to_file_map + return entries, dict(transaction_to_file_map) -def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map): - "Convert each Beancount transaction to JSON and collate as JSONL" - jsonl = '' - for entry_id, entry in enumerate(entries): - entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'} - # Convert Dictionary to JSON and Append to JSONL string - jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' +def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: + "Convert each Beancount transaction into a dictionary" + entry_maps = [] + for entry in entries: + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'}) - logger.info(f"Converted {len(entries)} to jsonl format") + logger.info(f"Converted {len(entries)} transactions to dictionaries") - return jsonl + return entry_maps + + +def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: + "Convert each Beancount transaction dictionary to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) if __name__ == '__main__': diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index a0903fcb..e7fc2779 100644 --- 
a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -7,9 +7,10 @@ import pathlib
 import glob
 import re
 import logging
+import time
 # Internal Packages
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
 # Define Functions
-def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
+def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
         print("At least one of markdown-files or markdown-file-filter is required to be specified")
@@ -28,18 +29,34 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+    start = time.time()
+    current_entries = convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files))
+    end = time.time()
+    logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds")
+
+    # Identify, mark and merge any new entries with previous entries
+    start = time.time()
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
+    end = time.time()
+    logger.debug(f"Identify new or updated entries: {end - start} seconds")
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
+    start = time.time()
+    entries = list(map(lambda entry: entry[1], entries_with_ids))
+    jsonl_data = convert_markdown_maps_to_jsonl(entries)
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
         compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
+    end = time.time()
+    logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds")
-    return entries
+    return entries_with_ids
 def get_markdown_files(markdown_files=None, markdown_file_filter=None):
@@ -80,24 +97,28 @@ def extract_markdown_entries(markdown_files):
             markdown_content = f.read()
             markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
                                          for entry
-                                         in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]
-        entry_to_file_map += [markdown_file]*len(markdown_entries_per_file)
+                                         in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
+                                         if entry.strip(empty_escape_sequences) != '']
+        entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
         entries.extend(markdown_entries_per_file)
-    return entries, entry_to_file_map
+    return entries, dict(entry_to_file_map)
-def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
+def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    "Convert each Markdown entry into a dictionary"
+    entry_maps = []
+    for entry in entries:
+        entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'})
+
+    logger.info(f"Converted {len(entries)} markdown entries to 
dictionaries") + + return entry_maps + + +def convert_markdown_maps_to_jsonl(entries): "Convert each Markdown entries to JSON and collate as JSONL" - jsonl = '' - for entry_id, entry in enumerate(entries): - entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'} - # Convert Dictionary to JSON and Append to JSONL string - jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' - - logger.info(f"Converted {len(entries)} to jsonl format") - - return jsonl + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) if __name__ == '__main__': diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index f1531797..f166810f 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -7,10 +7,11 @@ import argparse import pathlib import glob import logging +import time # Internal Packages from src.processor.org_mode import orgnode -from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils import state @@ -19,28 +20,47 @@ logger = logging.getLogger(__name__) # Define Functions -def org_to_jsonl(org_files, org_file_filter, output_file): +def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None): # Input Validation if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): print("At least one of org-files or org-file-filter is required to be specified") exit(1) # Get Org Files to Process + start = time.time() org_files = get_org_files(org_files, org_file_filter) # Extract Entries from specified Org files - entries, file_to_entries = extract_org_entries(org_files) + start = time.time() + entry_nodes, file_to_entries = extract_org_entries(org_files) + end = time.time() + logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") + + start = time.time() + current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + end = time.time() + logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + + # Identify, mark and merge any new entries with previous entries + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries) + start = time.time() + entries = map(lambda entry: entry[1], entries_with_ids) + jsonl_data = convert_org_entries_to_jsonl(entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": compress_jsonl_data(jsonl_data, output_file) elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write org entries to JSONL file: {end - start} seconds") - return entries + return entries_with_ids def get_org_files(org_files=None, org_file_filter=None): @@ -70,16 +90,16 @@ def extract_org_entries(org_files): entry_to_file_map = [] for org_file in org_files: org_file_entries = orgnode.makelist(str(org_file)) - entry_to_file_map += [org_file]*len(org_file_entries) + entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries)) entries.extend(org_file_entries) - return entries, entry_to_file_map + return entries, dict(entry_to_file_map) -def 
convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:
-    "Convert each Org-Mode entries to JSON and collate as JSONL"
-    jsonl = ''
-    for entry_id, entry in enumerate(entries):
+def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]:
+    "Convert Org-Mode entries into a list of dictionaries"
+    entry_maps = []
+    for entry in entries:
         entry_dict = dict()
         # Ignore title notes i.e notes with just headings and empty body
@@ -113,14 +133,17 @@ def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:
         if entry_dict:
             entry_dict["raw"] = f'{entry}'
-            entry_dict["file"] = f'{entry_to_file_map[entry_id]}'
+            entry_dict["file"] = f'{entry_to_file_map[entry]}'
             # Convert Dictionary to JSON and Append to JSONL string
-            jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+            entry_maps.append(entry_dict)
-    logger.info(f"Converted {len(entries)} to jsonl format")
+    return entry_maps
-    return jsonl
+
+def convert_org_entries_to_jsonl(entries: list[dict]) -> str:
+    "Convert each Org-Mode entry to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
 if __name__ == '__main__':
diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py
index f5c55e7f..31cedbb9 100644
--- a/src/processor/org_mode/orgnode.py
+++ b/src/processor/org_mode/orgnode.py
@@ -69,7 +69,7 @@ def makelist(filename):
     level = ""
     heading = ""
     bodytext = ""
-    tags = set() # set of all tags in headline
+    tags = list() # list of all tags in headline
     closed_date = ''
     sched_date = ''
     deadline_date = ''
@@ -104,14 +104,14 @@ def makelist(filename):
             level = hdng.group(1)
             heading = hdng.group(2)
             bodytext = ""
-            tags = set() # set of all tags in headline
+            tags = list() # list of all tags in headline
             tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading)
             if tagsrch:
                 heading = tagsrch.group(1)
                 parsedtags = tagsrch.group(2)
                 if parsedtags:
                     for parsedtag in parsedtags.split(':'):
-                        if parsedtag != '': tags.add(parsedtag)
+                        if parsedtag != '': tags.append(parsedtag)
         else: # we are processing a non-heading line
             if line[:10] == '#+SEQ_TODO':
                 kwlist = re.findall(r'([A-Z]+)\(', line)
@@ -237,7 +237,7 @@ class Orgnode(object):
         self.level = len(level)
         self.headline = headline
         self.body = body
-        self.tags = set(tags) # All tags in the headline
+        self.tags = tags # All tags in the headline
         self.todo = ""
         self.prty = "" # empty of A, B or C
         self.scheduled = "" # Scheduled date
@@ -290,8 +290,8 @@ class Orgnode(object):
     def Tags(self):
         """
-        Returns the set of all tags
-        For example, :HOME:COMPUTER: would return {'HOME', 'COMPUTER'}
+        Returns the list of all tags
+        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
         """
         return self.tags
@@ -307,7 +307,7 @@ class Orgnode(object):
         """
         Store all the tags found in the headline.
         """
-        self.tags = set(newtags)
+        self.tags = newtags
     def Todo(self):
         """
diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py
index a177ba38..dd376cbb 100644
--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -19,37 +19,24 @@ class WordFilter(BaseFilter):
     required_regex = r'\+"(\w+)" ?'
     blocked_regex = r'\-"(\w+)" ?'
- def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'): - self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl") + def __init__(self, entry_key='raw'): self.entry_key = entry_key - self.search_type = search_type - self.word_to_entry_index = dict() + self.word_to_entry_index = defaultdict(set) self.cache = LRU() def load(self, entries, regenerate=False): - if self.filter_file.exists() and not regenerate: - start = time.time() - with self.filter_file.open('rb') as f: - self.word_to_entry_index = pickle.load(f) - end = time.time() - logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds") - else: - start = time.time() - self.cache = {} # Clear cache on (re-)generating entries_by_word_set - entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' - self.word_to_entry_index = defaultdict(set) - # Create map of words to entries they exist in - for entry_index, entry in enumerate(entries): - for word in re.split(entry_splitter, entry[self.entry_key].lower()): - if word == '': - continue - self.word_to_entry_index[word].add(entry_index) - - with self.filter_file.open('wb') as f: - pickle.dump(self.word_to_entry_index, f) - end = time.time() - logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds") + start = time.time() + self.cache = {} # Clear cache on filter (re-)load + entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\'' + # Create map of words to entries they exist in + for entry_index, entry in enumerate(entries): + for word in re.split(entry_splitter, entry[self.entry_key].lower()): + if word == '': + continue + self.word_to_entry_index[word].add(entry_index) + end = time.time() + logger.debug(f"Created word filter index: {end - start} seconds") return self.word_to_entry_index diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 7a80c296..238c4736 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -55,15 +55,28 @@ def extract_entries(jsonl_file): return load_jsonl(jsonl_file) -def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False): +def compute_embeddings(entries_with_ids, bi_encoder, embeddings_file, regenerate=False): "Compute (and Save) Embeddings or Load Pre-Computed Embeddings" - # Load pre-computed embeddings from file if exists + new_entries = [] + # Load pre-computed embeddings from file if exists and update them if required if embeddings_file.exists() and not regenerate: corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device) logger.info(f"Loaded embeddings from {embeddings_file}") - else: # Else compute the corpus_embeddings from scratch, which can take a while - corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, device=state.device, show_progress_bar=True) + # Encode any new entries in the corpus and update corpus embeddings + new_entries = [entry['compiled'] for id, entry in entries_with_ids if id is None] + if new_entries: + new_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True) + existing_entry_ids = [id for id, _ in entries_with_ids if id is not None] + existing_embeddings = torch.index_select(corpus_embeddings, 0, torch.tensor(existing_entry_ids)) if 
existing_entry_ids else torch.Tensor() + corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) + # Else compute the corpus embeddings from scratch + else: + new_entries = [entry['compiled'] for _, entry in entries_with_ids] + corpus_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True) + + # Save regenerated or updated embeddings to file + if new_entries: corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, embeddings_file) logger.info(f"Computed embeddings and saved them to {embeddings_file}") @@ -169,10 +182,10 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - if not config.compressed_jsonl.exists() or regenerate: - text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl) + previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None + entries_with_indices = text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, previous_entries) - # Extract Entries + # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) if is_none_or_empty(entries): raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}") @@ -180,7 +193,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Compute or Load Embeddings config.embeddings_file = resolve_absolute_path(config.embeddings_file) - corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate) + corpus_embeddings = compute_embeddings(entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate) for filter in filters: filter.load(entries, regenerate=regenerate) diff --git a/src/utils/constants.py b/src/utils/constants.py index 84c3dfbb..8b443944 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -2,7 +2,7 @@ from pathlib import Path app_root_directory = Path(__file__).parent.parent.parent web_directory = app_root_directory / 'src/interface/web/' -empty_escape_sequences = r'\n|\r\t ' +empty_escape_sequences = '\n|\r|\t| ' # default app config to use default_config = { diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 7ea6580c..e8b2b8ca 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -1,6 +1,8 @@ # Standard Packages import pathlib import sys +import time +import hashlib from os.path import join from collections import OrderedDict @@ -79,3 +81,39 @@ class LRU(OrderedDict): if len(self) > self.capacity: oldest = next(iter(self)) del self[oldest] + + +def mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=None): + # Hash all current and previous entries to identify new entries + start = time.time() + current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), current_entries)) + previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), previous_entries)) + end = time.time() + logger.debug(f"Hash previous, current entries: {end - start} seconds") + + start = time.time() + hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) + hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) + + # All entries that did not exist in 
the previous set are to be added + new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) + # All entries that exist in both current and previous sets are kept + existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) + + # Mark new entries with no ids for later embeddings generation + new_entries = [ + (None, hash_to_current_entries[entry_hash]) + for entry_hash in new_entry_hashes + ] + # Set id of existing entries to their previous ids to reuse their existing encoded embeddings + existing_entries = [ + (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) + for entry_hash in existing_entry_hashes + ] + + existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) + entries_with_ids = existing_entries_sorted + new_entries + end = time.time() + logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds") + + return entries_with_ids \ No newline at end of file diff --git a/src/utils/jsonl.py b/src/utils/jsonl.py index 77b5af11..8a034acd 100644 --- a/src/utils/jsonl.py +++ b/src/utils/jsonl.py @@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path): with gzip.open(output_path, 'wt') as gzip_file: gzip_file.write(jsonl_data) - logger.info(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}') \ No newline at end of file + logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}') \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index fdb26557..2a725919 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,7 +59,7 @@ def content_config(tmp_path_factory, search_config: SearchConfig): compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'), embeddings_file = content_dir.joinpath('note_embeddings.pt')) - filters = [DateFilter(), WordFilter(content_dir, search_type=SearchType.Org), FileFilter()] + filters = [DateFilter(), WordFilter(), FileFilter()] text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) - return content_config \ No newline at end of file + return content_config diff --git a/tests/test_beancount_to_jsonl.py b/tests/test_beancount_to_jsonl.py new file mode 100644 index 00000000..e368e194 --- /dev/null +++ b/tests/test_beancount_to_jsonl.py @@ -0,0 +1,84 @@ +# Standard Packages +import json + +# Internal Packages +from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl + + +def test_no_transactions_in_file(tmp_path): + "Handle file with no transactions." + # Arrange + entry = f''' + - Bullet point 1 + - Bullet point 2 + ''' + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 0 + + +def test_single_beancount_transaction_to_jsonl(tmp_path): + "Convert transaction from single file to jsonl." 
+ # Arrange + entry = f''' +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES + ''' + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multiple_transactions_to_jsonl(tmp_path): + "Convert multiple transactions from single file to jsonl." + # Arrange + entry = f''' +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES +\t\r +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES +''' + + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + +# Helper Functions +def create_file(tmp_path, entry, filename="ledger.beancount"): + beancount_file = tmp_path / f"notes/{filename}" + beancount_file.parent.mkdir() + beancount_file.touch() + beancount_file.write_text(entry) + return beancount_file diff --git a/tests/test_client.py b/tests/test_client.py index 578c789c..b167bce0 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? +"Emacs"' @@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_ # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? 
-"clone"' diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py new file mode 100644 index 00000000..712053e8 --- /dev/null +++ b/tests/test_markdown_to_jsonl.py @@ -0,0 +1,81 @@ +# Standard Packages +import json + +# Internal Packages +from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps + + +def test_markdown_file_with_no_headings_to_jsonl(tmp_path): + "Convert files with no heading to jsonl." + # Arrange + entry = f''' + - Bullet point 1 + - Bullet point 2 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_single_markdown_entry_to_jsonl(tmp_path): + "Convert markdown entry from single file to jsonl." + # Arrange + entry = f'''### Heading + \t\r + Body Line 1 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multiple_markdown_entries_to_jsonl(tmp_path): + "Convert multiple markdown entries from single file to jsonl." 
+ # Arrange + entry = f''' +### Heading 1 + \t\r + Heading 1 Body Line 1 +### Heading 2 + \t\r + Heading 2 Body Line 2 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + +# Helper Functions +def create_file(tmp_path, entry, filename="test.md"): + markdown_file = tmp_path / f"notes/{filename}" + markdown_file.parent.mkdir() + markdown_file.touch() + markdown_file.write_text(entry) + return markdown_file diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 39b3e91f..eaac5ef8 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -2,7 +2,7 @@ import json # Internal Packages -from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries +from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries from src.utils.helpers import is_none_or_empty @@ -20,10 +20,11 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map) + entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_data = convert_org_entries_to_jsonl(entries) # Assert assert is_none_or_empty(jsonl_data) @@ -46,7 +47,7 @@ def test_entry_with_body_to_jsonl(tmp_path): entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map) + jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -64,10 +65,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map) + entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index eebf7f8a..c1e0aaa9 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -21,7 +21,7 @@ def test_parse_entry_with_no_headings(tmp_path): # Assert assert len(entries) == 1 assert entries[0].Heading() == f'{orgfile}' - assert entries[0].Tags() == set() + assert entries[0].Tags() == list() assert entries[0].Body() == "Body Line 1" assert entries[0].Priority() == "" assert entries[0].Property("ID") == "" @@ -45,7 +45,7 @@ Body Line 1''' # Assert assert len(entries) == 1 assert entries[0].Heading() == "Heading" - assert entries[0].Tags() == 
set() + assert entries[0].Tags() == list() assert entries[0].Body() == "Body Line 1" assert entries[0].Priority() == "" assert entries[0].Property("ID") == "" @@ -79,7 +79,7 @@ Body Line 2''' assert len(entries) == 1 assert entries[0].Heading() == "Heading" assert entries[0].Todo() == "DONE" - assert entries[0].Tags() == {"Tag1", "TAG2", "tag3"} + assert entries[0].Tags() == ["Tag1", "TAG2", "tag3"] assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2" assert entries[0].Priority() == "A" assert entries[0].Property("ID") == "id:123-456-789-4234-1231" @@ -178,7 +178,7 @@ Body 2 for index, entry in enumerate(entries): assert entry.Heading() == f"Heading{index+1}" assert entry.Todo() == "FAILED" if index == 0 else "CANCELLED" - assert entry.Tags() == {f"tag{index+1}"} + assert entry.Tags() == [f"tag{index+1}"] assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n" assert entry.Priority() == "A" assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}" @@ -202,7 +202,7 @@ Body Line 1''' # Assert assert len(entries) == 1 assert entries[0].Heading() == f'{orgfile}' - assert entries[0].Tags() == set() + assert entries[0].Tags() == list() assert entries[0].Body() == "Body Line 1" assert entries[0].Priority() == "" assert entries[0].Property("ID") == "" @@ -225,7 +225,7 @@ Body Line 1''' # Assert assert len(entries) == 1 assert entries[0].Heading() == 'test' - assert entries[0].Tags() == set() + assert entries[0].Tags() == list() assert entries[0].Body() == "Body Line 1" assert entries[0].Priority() == "" assert entries[0].Property("ID") == "" @@ -249,7 +249,7 @@ Body Line 1 # Assert assert len(entries) == 1 assert entries[0].Heading() == 'title1 title2' - assert entries[0].Tags() == set() + assert entries[0].Tags() == list() assert entries[0].Body() == "Body Line 1\n" assert entries[0].Priority() == "" assert entries[0].Property("ID") == "" diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 6a1471c9..dce1070a 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -100,3 +100,32 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC # delete reload test file added content_config.org.input_files = [] file_to_add_on_reload.unlink() + + +# ---------------------------------------------------------------------------------------------------- +def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + + assert len(initial_notes_model.entries) == 10 + assert len(initial_notes_model.corpus_embeddings) == 10 + + file_to_add_on_update = Path(content_config.org.input_filter).parent / "update.org" + content_config.org.input_files = [f'{file_to_add_on_update}'] + + # append Org-Mode Entry to first Org Input File in Config + with open(file_to_add_on_update, "w") as f: + f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") + + # Act + # update embeddings, entries with the newly added note + initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + + # verify new entry added in updated embeddings, entries + assert len(initial_notes_model.entries) == 11 + assert len(initial_notes_model.corpus_embeddings) == 11 + + # Cleanup + # delete file added for update testing + content_config.org.input_files = [] + 
file_to_add_on_update.unlink() diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py index 3efe8ed9..db23c2c6 100644 --- a/tests/test_word_filter.py +++ b/tests/test_word_filter.py @@ -1,15 +1,12 @@ -# External Packages -import torch - # Application Packages from src.search_filter.word_filter import WordFilter from src.utils.config import SearchType -def test_no_word_filter(tmp_path): +def test_no_word_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_no_filter = 'head tail' # Act @@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path): assert entry_indices == {0, 1, 2, 3} -def test_word_exclude_filter(tmp_path): +def test_word_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_exclude_filter = 'head -"exclude_word" tail' # Act @@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path): assert entry_indices == {0, 2} -def test_word_include_filter(tmp_path): +def test_word_include_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_filter = 'head +"include_word" tail' # Act @@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path): assert entry_indices == {2, 3} -def test_word_include_and_exclude_filter(tmp_path): +def test_word_include_and_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail' # Act @@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path): def arrange_content(): - embeddings = torch.randn(4, 10) entries = [ {'compiled': '', 'raw': 'Minimal Entry'}, {'compiled': '', 'raw': 'Entry with exclude_word'}, {'compiled': '', 'raw': 'Entry with include_word'}, {'compiled': '', 'raw': 'Entry with include_word and exclude_word'}] - return embeddings, entries + return entries
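
Illustrative usage note (not part of the patch): a minimal sketch of how the new mark_entries_for_update helper introduced in src/utils/helpers.py is expected to behave, assuming that module is importable. Entries are dictionaries keyed by 'compiled'; entries already present in the previous index keep their old id so their embeddings can be reused, while new entries are marked with an id of None so compute_embeddings only encodes those.

# Sketch only; assumes src.utils.helpers from this patch is on the Python path.
import logging

from src.utils.helpers import mark_entries_for_update

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Entries indexed on a previous run (their positions map to rows in the saved embeddings)
previous_entries = [
    {'compiled': 'old note A', 'raw': 'old note A'},
    {'compiled': 'old note B', 'raw': 'old note B'},
]
# Entries parsed on the current run: one unchanged, one new
current_entries = [
    {'compiled': 'old note A', 'raw': 'old note A'},
    {'compiled': 'new note C', 'raw': 'new note C'},
]

entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)

# Existing entries keep their previous id; new entries come back with id None,
# which is what compute_embeddings checks to decide what needs encoding.
for entry_id, entry in entries_with_ids:
    print(entry_id, entry['compiled'])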