From 70e70d4b155ddf57faa30479df8eae8c42fc4372 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 20 Jul 2022 20:35:50 +0400 Subject: [PATCH] Rename 'embed' key to more generic 'compiled' for jsonl extracted results - While it's true those strings are going to be used to generated embeddings, the more generic term allows them to be used elsewhere as well - Their main property is that they are processed, compiled for usage by semantic search - Unlike the 'raw' string which contains the external representation of the data, as is --- src/search_type/asymmetric.py | 10 +++++----- src/search_type/symmetric_ledger.py | 8 ++++---- tests/test_date_filter.py | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py index f3596e53..d2b1886d 100644 --- a/src/search_type/asymmetric.py +++ b/src/search_type/asymmetric.py @@ -63,7 +63,7 @@ def extract_entries(notesfile, verbose=0): note_string = f'{note["Title"]}' \ f'\t{note["Tags"] if "Tags" in note else ""}' \ f'\n{note["Body"] if "Body" in note else ""}' - entries.append({'embed': note_string, 'raw': note["Raw"]}) + entries.append({'compiled': note_string, 'raw': note["Raw"]}) # Close File jsonl_file.close() @@ -83,7 +83,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, d print(f"Loaded embeddings from {embeddings_file}") else: # Else compute the corpus_embeddings from scratch, which can take a while - corpus_embeddings = bi_encoder.encode([entry['embed'] for entry in entries], convert_to_tensor=True, show_progress_bar=True) + corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, show_progress_bar=True) corpus_embeddings.to(device) corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, get_absolute_path(embeddings_file)) @@ -116,7 +116,7 @@ def query(raw_query: str, model: TextSearchModel, 
device=torch.device('cpu'), filters=[]): hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=model.top_k, score_function=util.dot_score)[0] # Score all retrieved entries using the cross-encoder - cross_inp = [[query, entries[hit['corpus_id']]['embed']] for hit in hits] + cross_inp = [[query, entries[hit['corpus_id']]['compiled']] for hit in hits] cross_scores = model.cross_encoder.predict(cross_inp) # Store cross-encoder scores in results dictionary for ranking @@ -138,14 +138,14 @@ def render_results(hits, entries, count=5, display_biencoder_results=False): print(f"Top-{count} Bi-Encoder Retrieval hits") hits = sorted(hits, key=lambda x: x['score'], reverse=True) for hit in hits[0:count]: - print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]['embed']}") + print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]['compiled']}") # Output of top hits from re-ranker print("\n-------------------------\n") print(f"Top-{count} Cross-Encoder Re-ranker hits") hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True) for hit in hits[0:count]: - print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]['embed']}") + print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]['compiled']}") def collate_results(hits, entries, count=5): diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py index f0efddae..3a8922a3 100644 --- a/src/search_type/symmetric_ledger.py +++ b/src/search_type/symmetric_ledger.py @@ -38,7 +38,7 @@ def initialize_model(search_config: SymmetricSearchConfig): def extract_entries(notesfile, verbose=0): "Load entries from compressed jsonl" - return [{'raw': f'{entry["Title"]}', 'embed': f'{entry["Title"]}'} + return [{'raw': f'{entry["Title"]}', 'compiled': f'{entry["Title"]}'} for entry in load_jsonl(notesfile, verbose=verbose)] @@ -80,7 +80,7 @@ def query(raw_query, model: TextSearchModel, 
filters=[]): hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=model.top_k)[0] # Score all retrieved entries using the cross-encoder - cross_inp = [[query, entries[hit['corpus_id']]['embed']] for hit in hits] + cross_inp = [[query, entries[hit['corpus_id']]['compiled']] for hit in hits] cross_scores = model.cross_encoder.predict(cross_inp) # Store cross-encoder scores in results dictionary for ranking @@ -102,14 +102,14 @@ def render_results(hits, entries, count=5, display_biencoder_results=False): print(f"Top-{count} Bi-Encoder Retrieval hits") hits = sorted(hits, key=lambda x: x['score'], reverse=True) for hit in hits[0:count]: - print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]['embed']}") + print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]['compiled']}") # Output of top hits from re-ranker print("\n-------------------------\n") print(f"Top-{count} Cross-Encoder Re-ranker hits") hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True) for hit in hits[0:count]: - print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]['embed']}") + print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]['compiled']}") def collate_results(hits, entries, count=5): diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 44d052f0..a1f63a05 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -13,9 +13,9 @@ from src.search_filter import date_filter def test_date_filter(): embeddings = torch.randn(3, 10) entries = [ - {'embed': '', 'raw': 'Entry with no date'}, - {'embed': '', 'raw': 'April Fools entry: 1984-04-01'}, - {'embed': '', 'raw': 'Entry with date:1984-04-02'}] + {'compiled': '', 'raw': 'Entry with no date'}, + {'compiled': '', 'raw': 'April Fools entry: 1984-04-01'}, + {'compiled': '', 'raw': 'Entry with date:1984-04-02'}] q_with_no_date_filter = 'head tail' ret_query, ret_entries, 
ret_emb = date_filter.date_filter(q_with_no_date_filter, entries.copy(), embeddings)