diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index ede42686..861f8620 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -5,12 +5,12 @@ import json import argparse import pathlib import glob -import gzip import re # Internal Packages from src.utils.helpers import get_absolute_path, is_none_or_empty from src.utils.constants import empty_escape_sequences +from src.utils.jsonl import dump_jsonl, compress_jsonl_data # Define Functions @@ -38,25 +38,6 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, verb return entries -def dump_jsonl(jsonl_data, output_path, verbose=0): - "Write List of JSON objects to JSON line file" - with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f: - f.write(jsonl_data) - - if verbose > 0: - jsonl_entries = len(jsonl_data.split('\n')) - print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}') - - -def compress_jsonl_data(jsonl_data, output_path, verbose=0): - with gzip.open(get_absolute_path(output_path), 'wt') as gzip_file: - gzip_file.write(jsonl_data) - - if verbose > 0: - jsonl_entries = len(jsonl_data.split('\n')) - print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}') - - def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0): "Get Beancount files to process" absolute_beancount_files, filtered_beancount_files = set(), set() diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 871125b1..bf147faa 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -5,12 +5,12 @@ import json import argparse import pathlib import glob -import gzip # Internal Packages from src.processor.org_mode import orgnode from src.utils.helpers import get_absolute_path, is_none_or_empty from src.utils.constants import empty_escape_sequences +from 
src.utils.jsonl import dump_jsonl, compress_jsonl_data # Define Functions @@ -37,35 +37,6 @@ def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0): return entries -def dump_jsonl(jsonl_data, output_path, verbose=0): - "Write List of JSON objects to JSON line file" - with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f: - f.write(jsonl_data) - - if verbose > 0: - print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}') - - -def compress_jsonl_data(jsonl_data, output_path, verbose=0): - with gzip.open(get_absolute_path(output_path), 'wt') as gzip_file: - gzip_file.write(jsonl_data) - - if verbose > 0: - print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}') - - -def load_jsonl(input_path, verbose=0): - "Read List of JSON objects from JSON line file" - data = [] - with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f: - for line in f: - data.append(json.loads(line.rstrip('\n|\r'))) - - if verbose > 0: - print(f'Loaded {len(data)} records from {input_path}') - - return data - def get_org_files(org_files=None, org_file_filter=None, verbose=0): "Get Org files to process" diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py index da2f34dc..0fb879d7 100644 --- a/src/search_type/asymmetric.py +++ b/src/search_type/asymmetric.py @@ -10,10 +10,11 @@ import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl +from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig +from src.utils.jsonl import load_jsonl def initialize_model(search_config: AsymmetricSearchConfig): diff --git a/src/search_type/symmetric_ledger.py 
b/src/search_type/symmetric_ledger.py index 616a86e7..0dcb94bf 100644 --- a/src/search_type/symmetric_ledger.py +++ b/src/search_type/symmetric_ledger.py @@ -8,10 +8,11 @@ import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl +from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig +from src.utils.jsonl import load_jsonl def initialize_model(search_config: SymmetricSearchConfig): diff --git a/src/utils/helpers.py b/src/utils/helpers.py index b19deb6f..3c19b935 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -1,12 +1,7 @@ # Standard Packages -import json -import gzip import pathlib from os.path import join -# Internal Packages -from src.utils.constants import empty_escape_sequences - def is_none_or_empty(item): return item == None or (hasattr(item, '__iter__') and len(item) == 0) @@ -57,30 +52,4 @@ def load_model(model_name, model_dir, model_type): if model_path is not None: model.save(model_path) - return model - - -def load_jsonl(input_path, verbose=0): - "Read List of JSON objects from JSON line file" - # Initialize Variables - data = [] - jsonl_file = None - - # Open JSONL file - if input_path.suffix == ".gz": - jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8') - elif input_path.suffix == ".jsonl": - jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8') - - # Read JSONL file - for line in jsonl_file: - data.append(json.loads(line.strip(empty_escape_sequences))) - - # Close JSONL file - jsonl_file.close() - - # Log JSONL entries loaded - if verbose > 0: - print(f'Loaded {len(data)} records from {input_path}') - - return data \ No newline at end 
of file
+    return model
\ No newline at end of file
diff --git a/src/utils/jsonl.py b/src/utils/jsonl.py
new file mode 100644
index 00000000..67fc7c9a
--- /dev/null
+++ b/src/utils/jsonl.py
@@ -0,0 +1,65 @@
+# Standard Packages
+import json
+import gzip
+import pathlib
+
+# Internal Packages
+from src.utils.constants import empty_escape_sequences
+from src.utils.helpers import get_absolute_path
+
+
+def load_jsonl(input_path, verbose=0):
+    """Read a list of JSON objects from a JSON Lines file.
+
+    Paths ending in ".gz" are read as gzip-compressed text and paths ending
+    in ".jsonl" as plain text; both are decoded as UTF-8. Blank lines are
+    skipped. Any other suffix raises ValueError.
+    """
+    # Normalize to a Path so suffix detection also works for string paths
+    suffix = pathlib.Path(input_path).suffix
+
+    # Pick the opener matching the file type
+    if suffix == ".gz":
+        open_jsonl = gzip.open
+    elif suffix == ".jsonl":
+        open_jsonl = open
+    else:
+        raise ValueError(f'Cannot load {input_path}: expected a .jsonl or .gz file')
+
+    # Read JSONL file; the context manager closes it even if parsing fails
+    data = []
+    with open_jsonl(get_absolute_path(input_path), 'rt', encoding='utf-8') as jsonl_file:
+        for line in jsonl_file:
+            entry = line.strip(empty_escape_sequences)
+            if entry:
+                data.append(json.loads(entry))
+
+    # Log JSONL entries loaded
+    if verbose > 0:
+        print(f'Loaded {len(data)} records from {input_path}')
+
+    return data
+
+
+def dump_jsonl(jsonl_data, output_path, verbose=0):
+    "Write JSON Lines text (one JSON object per line) to a UTF-8 file"
+    with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f:
+        f.write(jsonl_data)
+
+    if verbose > 0:
+        print(f'Wrote {_count_jsonl_entries(jsonl_data)} lines to jsonl at {output_path}')
+
+
+def compress_jsonl_data(jsonl_data, output_path, verbose=0):
+    "Write JSON Lines text (one JSON object per line) to a gzip compressed UTF-8 file"
+    # Encode explicitly as UTF-8 to match what load_jsonl decodes
+    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
+        gzip_file.write(jsonl_data)
+
+    if verbose > 0:
+        print(f'Wrote {_count_jsonl_entries(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
+
+
+def _count_jsonl_entries(jsonl_data):
+    "Count the non-empty lines in JSON Lines text"
+    return sum(1 for line in jsonl_data.split('\n') if line.strip())
\ No newline at end of file