mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 05:29:12 +00:00
57 lines
1.5 KiB
Python
57 lines
1.5 KiB
Python
# Standard Packages
|
|
import json
|
|
import gzip
|
|
import logging
|
|
|
|
# Internal Packages
|
|
from src.utils.constants import empty_escape_sequences
|
|
from src.utils.helpers import get_absolute_path
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_jsonl(input_path):
|
|
"Read List of JSON objects from JSON line file"
|
|
# Initialize Variables
|
|
data = []
|
|
jsonl_file = None
|
|
|
|
# Open JSONL file
|
|
if input_path.suffix == ".gz":
|
|
jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
|
|
elif input_path.suffix == ".jsonl":
|
|
jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
|
|
|
|
# Read JSONL file
|
|
for line in jsonl_file:
|
|
data.append(json.loads(line.strip(empty_escape_sequences)))
|
|
|
|
# Close JSONL file
|
|
jsonl_file.close()
|
|
|
|
# Log JSONL entries loaded
|
|
logger.info(f'Loaded {len(data)} records from {input_path}')
|
|
|
|
return data
|
|
|
|
|
|
def dump_jsonl(jsonl_data, output_path):
|
|
"Write List of JSON objects to JSON line file"
|
|
# Create output directory, if it doesn't exist
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
f.write(jsonl_data)
|
|
|
|
logger.info(f'Wrote jsonl data to {output_path}')
|
|
|
|
|
|
def compress_jsonl_data(jsonl_data, output_path):
|
|
# Create output directory, if it doesn't exist
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with gzip.open(output_path, 'wt') as gzip_file:
|
|
gzip_file.write(jsonl_data)
|
|
|
|
logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}') |