mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Reuse logic to extract entries across symmetric, asymmetric search
Now that the logic to compile entries is in the processor layer, the extract_entries method is standard across (text) search_types. Extract the load_jsonl method as a utility helper method. Use it in (a)symmetric search types.
This commit is contained in:
@@ -1,8 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Standard Packages
|
||||
import json
|
||||
import gzip
|
||||
import argparse
|
||||
import pathlib
|
||||
from copy import deepcopy
|
||||
@@ -12,11 +10,10 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
|
||||
|
||||
def initialize_model(search_config: AsymmetricSearchConfig):
|
||||
@@ -43,27 +40,9 @@ def initialize_model(search_config: AsymmetricSearchConfig):
|
||||
|
||||
def extract_entries(notesfile, verbose=0):
|
||||
"Load entries from compressed jsonl"
|
||||
entries = []
|
||||
jsonl_file = None
|
||||
|
||||
# Open File
|
||||
if notesfile.suffix == ".gz":
|
||||
jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
|
||||
elif notesfile.suffix == ".jsonl":
|
||||
jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
|
||||
|
||||
# Read File
|
||||
for line in jsonl_file:
|
||||
note = json.loads(line.strip(empty_escape_sequences))
|
||||
entries.append({'compiled': note['compiled'], 'raw': note["raw"]})
|
||||
|
||||
# Close File
|
||||
jsonl_file.close()
|
||||
|
||||
if verbose > 0:
|
||||
print(f"Loaded {len(entries)} entries from {notesfile}")
|
||||
|
||||
return entries
|
||||
return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
|
||||
for entry
|
||||
in load_jsonl(notesfile, verbose=verbose)]
|
||||
|
||||
|
||||
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0):
|
||||
@@ -194,4 +173,4 @@ if __name__ == '__main__':
|
||||
hits = query(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)
|
||||
|
||||
# render results
|
||||
render_results(hits, entries, count=args.results_count)
|
||||
render_results(hits, entries, count=args.results_count)
|
||||
@@ -8,8 +8,8 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
||||
|
||||
|
||||
Reference in New Issue
Block a user