Reuse logic to extract entries across symmetric and asymmetric search

Now that the logic to compile entries is in the processor layer, the extract_entries method is standard across (text) search_types. Extract the load_jsonl method as a utility helper method and use it in the (a)symmetric search types.
2026-03-08 05:39:13 +00:00 · 2022-07-21 02:53:18 +04:00
parent e220ecc00b
commit 5aad297286
4 changed files with 39 additions and 56 deletions
--- a/src/search_type/asymmetric.py
+++ b/src/search_type/asymmetric.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python

 # Standard Packages
-import json
-import gzip
 import argparse
 import pathlib
 from copy import deepcopy
@@ -12,11 +10,10 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
-from src.utils.constants import empty_escape_sequences


 def initialize_model(search_config: AsymmetricSearchConfig):
@@ -43,27 +40,9 @@ def initialize_model(search_config: AsymmetricSearchConfig):

 def extract_entries(notesfile, verbose=0):
    "Load entries from compressed jsonl"
-    entries = []
-    jsonl_file = None
-
-    # Open File
-    if notesfile.suffix == ".gz":
-        jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
-    elif notesfile.suffix == ".jsonl":
-        jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
-
-    # Read File
-    for line in jsonl_file:
-        note = json.loads(line.strip(empty_escape_sequences))
-        entries.append({'compiled': note['compiled'], 'raw': note["raw"]})
-
-    # Close File
-    jsonl_file.close()
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]


 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0):
@@ -194,4 +173,4 @@ if __name__ == '__main__':
        hits = query(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)

        # render results
-        render_results(hits, entries, count=args.results_count)
+        render_results(hits, entries, count=args.results_count)
--- a/src/search_type/symmetric_ledger.py
+++ b/src/search_type/symmetric_ledger.py
@@ -8,8 +8,8 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig