From 502c68d4f8cc67fb52d11853125d69d56772e99b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 17:23:02 -0500
Subject: [PATCH] Remove trailing escape sequence in ledger search response
 entries

- Fix loading entries from jsonl in extract_entries method
  - Only extract Title from jsonl of each entry
    This is the only thing written to the jsonl for symmetric ledger
  - This fixes the trailing escape sequence in loaded entries
  - Removes the need for the semantic-search.el response reader to do pointless, complicated cleanup

- Make symmetric_ledger:extract_entries use beancount_to_jsonl:load_jsonl
  Both methods were doing similar work

- Make load_jsonl handle loading entries from both gzip and uncompressed jsonl
---
 src/processor/ledger/beancount_to_jsonl.py | 20 +++++++++++++++++---
 src/search_type/symmetric_ledger.py        | 17 ++++-------------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 21dbcfb9..99c9d5d5 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -58,11 +58,25 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
 
 def load_jsonl(input_path, verbose=0):
     "Read List of JSON objects from JSON line file"
+    # Initialize Variables
     data = []
-    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
-        for line in f:
-            data.append(json.loads(line.rstrip('\n|\r')))
+    jsonl_file = None
+    escape_sequences = '\n|\r\t '
 
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
     if verbose > 0:
         print(f'Loaded {len(data)} records from {input_path}')
 
diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py
index f63a1c98..5243c1aa 100644
--- a/src/search_type/symmetric_ledger.py
+++ b/src/search_type/symmetric_ledger.py
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
 
 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
 
@@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig):
 
 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
-        for line in jsonl:
-            note = json.loads(line.strip())
-
-            note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [f'{entry["Title"]}'
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]
 
 
 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):