From f4bde75249607fc5b68bf3609339f14656e01eb2 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 29 Aug 2021 05:47:43 -0700
Subject: [PATCH] Decouple results shown to user and text the model is trained
 on

- Previously: The text the model was trained on was being used to
  re-create a semblance of the original org-mode entry.
- Now:
  - Store raw entry as another key:value in each entry json too
    Only return actual raw org entries in results
    But create embeddings like before
  - Also add link to entry in file::: form in property drawer of
    returned results
    This can be used to jump to actual entry in its original file
---
 src/interface/emacs/semantic-search.el |  2 +-
 src/processor/org_mode/org_to_jsonl.py | 10 ++--------
 src/processor/org_mode/orgnode.py      | 14 ++++++++++----
 src/search_type/asymmetric.py          | 14 +++++++-------
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/interface/emacs/semantic-search.el b/src/interface/emacs/semantic-search.el
index ca2c4aae..df8ed8ac 100644
--- a/src/interface/emacs/semantic-search.el
+++ b/src/interface/emacs/semantic-search.el
@@ -48,7 +48,7 @@
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
-           (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
+           (lambda (args) (format "%s" (cdr (assoc 'Entry args))))
           json-response))))

 (defun semantic-search--extract-entries-as-ledger (json-response)

diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py
index 0cb587a3..9177946e 100644
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@@ -120,15 +120,9 @@ def convert_org_entries_to_jsonl(entries, verbose=0):
         if verbose > 2:
             print(f"Body: {entry.Body()}")

-        for property_key in ('ID', 'QUERY', 'TYPE', 'CATEGORY'):
-            if entry.Property(property_key):
-                if 'Property' not in entry_dict:
-                    entry_dict['Property'] = dict()
-                entry_dict['Property'][property_key] = entry.Property(property_key)
-                if verbose > 2:
-                    print(f'Property: {entry_dict["PROPERTY"][property_key]}')
-
-        if entry_dict:
+        entry_dict["Raw"] = f'{entry}'
+
+        # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'

diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py
index 838b19c5..a575b3ee 100644
--- a/src/processor/org_mode/orgnode.py
+++ b/src/processor/org_mode/orgnode.py
@@ -77,7 +77,7 @@ def makelist(filename):
                 deadline_date = ''
                 thisNode.setProperties(propdict)
                 nodelist.append( thisNode )
-            propdict = dict()
+            propdict = {'SOURCE': f'file:{filename}::{ctr}'}
             level = hdng.group(1)
             heading = hdng.group(2)
             bodytext = ""
@@ -325,8 +325,14 @@ class Orgnode(object):
             n = n + ':' + t
          closecolon = ':'
       n = n + closecolon
-# Need to output Scheduled Date, Deadline Date, property tags The
-# following will output the text used to construct the object
-      n = n + "\n" + self.body
+      # Need to output Scheduled Date, Deadline Date, property tags The
+      # following will output the text used to construct the object
+      n = n + "\n"
+      n = n + ":PROPERTIES:\n"
+      for key, value in self.properties.items():
+         n = n + f":{key}: {value}\n"
+      n = n + ":END:\n"
+
+      n = n + self.body

       return n

diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py
index 656d1fb3..17c5367e 100644
--- a/src/search_type/asymmetric.py
+++ b/src/search_type/asymmetric.py
@@ -39,7 +39,7 @@ def extract_entries(notesfile, verbose=0):
                 continue

             note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
+            entries.append([note_string, note["Raw"]])

     if verbose > 0:
         print(f"Loaded {len(entries)} entries from {notesfile}")
@@ -56,7 +56,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
             print(f"Loaded embeddings from {embeddings_file}")
     else:  # Else compute the corpus_embeddings from scratch, which can take a while
-        corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
+        corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
         if verbose > 0:
             print(f"Computed embeddings and save them to {embeddings_file}")
@@ -79,12 +79,12 @@ def query_notes(raw_query, corpus_embeddings, entries, bi_encoder, cross_encoder
     hits = hits[0]  # Get the hits for the first query

     # Filter results using explicit filters
-    hits = explicit_filter(hits, entries, required_words, blocked_words)
+    hits = explicit_filter(hits, [entry[0] for entry in entries], required_words, blocked_words)
     if hits is None or len(hits) == 0:
         return hits

     # Score all retrieved entries using the cross-encoder
-    cross_inp = [[query, entries[hit['corpus_id']]] for hit in hits]
+    cross_inp = [[query, entries[hit['corpus_id']][0]] for hit in hits]
     cross_scores = cross_encoder.predict(cross_inp)

     # Store cross-encoder scores in results dictionary for ranking
@@ -127,20 +127,20 @@ def render_results(hits, entries, count=5, display_biencoder_results=False):
         print(f"Top-{count} Bi-Encoder Retrieval hits")
         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
         for hit in hits[0:count]:
-            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]}")
+            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']][0]}")

     # Output of top hits from re-ranker
     print("\n-------------------------\n")
     print(f"Top-{count} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
     for hit in hits[0:count]:
-        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]}")
+        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']][0]}")


 def collate_results(hits, entries, count=5):
     return [
         {
-            "Entry": entries[hit['corpus_id']],
+            "Entry": entries[hit['corpus_id']][1],
             "Score": f"{hit['cross-score']:.3f}"
         }
         for hit