mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Use verbosity level instead of bool across application
For consistent, more granular verbosity controls across the app. Allows the user to increase verbosity via -vvv flags passed to main.py
This commit is contained in:
6
main.py
6
main.py
@@ -37,9 +37,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Setup Argument Parser
|
# Setup Argument Parser
|
||||||
parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
|
parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
|
||||||
parser.add_argument('--compressed-jsonl', '-j', required=True, type=pathlib.Path, help="Compressed JSONL formatted notes file to compute embeddings from")
|
parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
|
||||||
parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
|
parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
|
||||||
parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
|
parser.add_argument('--verbose', action='count', help="Show verbose conversion logs. Default: 0")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Initialize Model
|
# Initialize Model
|
||||||
|
|||||||
@@ -11,9 +11,9 @@ import gzip
|
|||||||
|
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def org_to_jsonl(org_files, org_file_filter, output_path, verbose=False):
|
def org_to_jsonl(org_files, org_file_filter, output_path, verbose=0):
|
||||||
# Get Org Files to Process
|
# Get Org Files to Process
|
||||||
org_files = get_org_files(args.input_files, args.input_filter)
|
org_files = get_org_files(args.input_files, args.input_filter, verbose)
|
||||||
|
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries = extract_org_entries(org_files)
|
entries = extract_org_entries(org_files)
|
||||||
@@ -59,7 +59,7 @@ def load_jsonl(input_path, verbose=0):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def get_org_files(org_files=None, org_file_filter=None):
|
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
|
||||||
"Get Org files to process"
|
"Get Org files to process"
|
||||||
absolute_org_files, filtered_org_files = set(), set()
|
absolute_org_files, filtered_org_files = set(), set()
|
||||||
if org_files:
|
if org_files:
|
||||||
@@ -75,7 +75,7 @@ def get_org_files(org_files=None, org_file_filter=None):
|
|||||||
if any(files_with_non_org_extensions):
|
if any(files_with_non_org_extensions):
|
||||||
print(f"[Warning] There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
|
print(f"[Warning] There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
|
||||||
|
|
||||||
if args.verbose:
|
if args.verbose > 0:
|
||||||
print(f'Processing files: {all_org_files}')
|
print(f'Processing files: {all_org_files}')
|
||||||
|
|
||||||
return all_org_files
|
return all_org_files
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def initialize_model():
|
|||||||
return bi_encoder, cross_encoder, top_k
|
return bi_encoder, cross_encoder, top_k
|
||||||
|
|
||||||
|
|
||||||
def extract_entries(notesfile, verbose=False):
|
def extract_entries(notesfile, verbose=0):
|
||||||
"Load entries from compressed jsonl"
|
"Load entries from compressed jsonl"
|
||||||
entries = []
|
entries = []
|
||||||
with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
|
with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
|
||||||
@@ -34,24 +34,24 @@ def extract_entries(notesfile, verbose=False):
|
|||||||
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
|
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
|
||||||
entries.extend([note_string])
|
entries.extend([note_string])
|
||||||
|
|
||||||
if verbose:
|
if verbose > 0:
|
||||||
print(f"Loaded {len(entries)} entries from {notesfile}")
|
print(f"Loaded {len(entries)} entries from {notesfile}")
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=False):
|
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
|
||||||
"Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
|
"Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
|
||||||
# Load pre-computed embeddings from file if exists
|
# Load pre-computed embeddings from file if exists
|
||||||
if embeddings_file.exists() and not regenerate:
|
if embeddings_file.exists() and not regenerate:
|
||||||
corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
|
corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
|
||||||
if verbose:
|
if verbose > 0:
|
||||||
print(f"Loaded embeddings from {embeddings_file}")
|
print(f"Loaded embeddings from {embeddings_file}")
|
||||||
|
|
||||||
else: # Else compute the corpus_embeddings from scratch, which can take a while
|
else: # Else compute the corpus_embeddings from scratch, which can take a while
|
||||||
corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
|
corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
|
||||||
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
|
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
|
||||||
if verbose:
|
if verbose > 0:
|
||||||
print(f"Computed embeddings and save them to {embeddings_file}")
|
print(f"Computed embeddings and save them to {embeddings_file}")
|
||||||
|
|
||||||
return corpus_embeddings
|
return corpus_embeddings
|
||||||
@@ -147,7 +147,7 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
|
parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
|
||||||
parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
|
parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
|
||||||
parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
|
parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
|
||||||
parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
|
parser.add_argument('--verbose', action='count', help="Show verbose conversion logs. Default: 0")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Initialize Model
|
# Initialize Model
|
||||||
|
|||||||
Reference in New Issue
Block a user