From 502c68d4f8cc67fb52d11853125d69d56772e99b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 17:23:02 -0500 Subject: [PATCH] Remove trailing escape sequence in ledger search response entries - Fix loading entries from jsonl in extract_entries method - Only extract Title from jsonl of each entry This is the only thing written to the jsonl for symmetric ledger - This fixes the trailing escape seq in loaded entries - Remove the need for semantic-search.el response reader to do pointless complicated cleanup - Make symmetric_ledger:extract_entries use beancount_to_jsonl:load_jsonl Both methods were doing similar work - Make load_jsonl handle loading entries from both gzip and uncompressed jsonl --- src/processor/ledger/beancount_to_jsonl.py | 20 +++++++++++++++++--- src/search_type/symmetric_ledger.py | 17 ++++------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 21dbcfb9..99c9d5d5 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -58,11 +58,25 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0): def load_jsonl(input_path, verbose=0): "Read List of JSON objects from JSON line file" + # Initialize Variables data = [] - with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f: - for line in f: - data.append(json.loads(line.rstrip('\n|\r'))) + jsonl_file = None + escape_sequences = '\n|\r\t ' + # Open JSONL file + if input_path.suffix == ".gz": + jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8') + elif input_path.suffix == ".jsonl": + jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8') + + # Read JSONL file + for line in jsonl_file: + data.append(json.loads(line.strip(escape_sequences))) + + # Close JSONL file + jsonl_file.close() + + # Log JSONL entries loaded if verbose > 0: print(f'Loaded 
{len(data)} records from {input_path}') diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py index f63a1c98..5243c1aa 100644 --- a/src/search_type/symmetric_ledger.py +++ b/src/search_type/symmetric_ledger.py @@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model -from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl +from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig @@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig): def extract_entries(notesfile, verbose=0): "Load entries from compressed jsonl" - entries = [] - with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl: - for line in jsonl: - note = json.loads(line.strip()) - - note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}' - entries.extend([note_string]) - - if verbose > 0: - print(f"Loaded {len(entries)} entries from {notesfile}") - - return entries + return [f'{entry["Title"]}' + for entry + in load_jsonl(notesfile, verbose=verbose)] def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):