mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Remove trailing escape sequence in ledger search response entries
- Fix loading entries from jsonl in extract_entries method
- Only extract Title from jsonl of each entry
This is the only thing written to the jsonl for symmetric ledger
- This fixes the trailing escape seq in loaded entries
- Remove the need for semantic-search.el response reader to do pointless complicated cleanup
- Make symmetric_ledger:extract_entries use beancount_to_jsonl:load_jsonl
Both methods were doing similar work
- Make load_jsonl handle loading entries from both gzip and uncompressed jsonl
This commit is contained in:
@@ -58,11 +58,25 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
|
|||||||
|
|
||||||
def load_jsonl(input_path, verbose=0):
    """Read a list of JSON objects from a JSON-lines file, gzip-compressed or plain.

    Parameters
    ----------
    input_path : Path
        Path to the input file; dispatch is on the suffix
        (".gz" -> gzip-compressed text, ".jsonl" -> plain text).
    verbose : int
        If > 0, print the number of records loaded.

    Returns
    -------
    list
        One parsed JSON object per non-empty line.

    Raises
    ------
    ValueError
        If the file suffix is neither ".gz" nor ".jsonl" (the original code
        left the handle as None and crashed with an opaque TypeError).
    """
    # Initialize Variables
    data = []

    # Open JSONL file: choose the opener by suffix.
    if input_path.suffix == ".gz":
        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
    elif input_path.suffix == ".jsonl":
        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
    else:
        raise ValueError(f"Unsupported file type: {input_path}")

    # Read JSONL file.
    # NOTE: the previous code stripped the character set '\n|\r\t ', which also
    # removed literal '|' characters from the ends of the JSON text — str.strip()
    # takes a set of characters, not a pattern. json.loads already tolerates
    # surrounding whitespace, so no stripping is needed; we only skip blank lines.
    # The `with` block guarantees the handle is closed even if a line fails to parse.
    with jsonl_file:
        for line in jsonl_file:
            if line.strip():
                data.append(json.loads(line))

    # Log JSONL entries loaded
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
|||||||
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
|||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
|
||||||
from src.utils.config import TextSearchModel
|
from src.utils.config import TextSearchModel
|
||||||
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
||||||
|
|
||||||
@@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig):
|
|||||||
|
|
||||||
def extract_entries(notesfile, verbose=0):
    "Load entries from compressed jsonl"
    # Each persisted ledger entry carries only its Title; surface that
    # as the entry text for the search index.
    records = load_jsonl(notesfile, verbose=verbose)
    return [f'{record["Title"]}' for record in records]
|
|
||||||
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
|
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
|
||||||
|
|||||||
Reference in New Issue
Block a user