From f4bde75249607fc5b68bf3609339f14656e01eb2 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 29 Aug 2021 05:47:43 -0700
Subject: [PATCH] Decouple results shown to user and text the model is trained
 on

- Previously: The text the model was trained on was being used to
  re-create a semblance of the original org-mode entry.
- Now:
  - Store raw entry as another key:value in each entry json too
    Only return actual raw org entries in results
    But create embeddings like before
  - Also add link to entry in file::: form in property drawer of
    returned results
    This can be used to jump to actual entry in its original file
---
 src/interface/emacs/semantic-search.el |  2 +-
 src/processor/org_mode/org_to_jsonl.py | 10 ++--------
 src/processor/org_mode/orgnode.py      | 14 ++++++++++----
 src/search_type/asymmetric.py          | 14 +++++++-------
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/interface/emacs/semantic-search.el b/src/interface/emacs/semantic-search.el
index ca2c4aae..df8ed8ac 100644
--- a/src/interface/emacs/semantic-search.el
+++ b/src/interface/emacs/semantic-search.el
@@ -48,7 +48,7 @@
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
-           (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
+           (lambda (args) (format "%s" (cdr (assoc 'Entry args))))
           json-response))))

 (defun semantic-search--extract-entries-as-ledger (json-response)

diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py
index 0cb587a3..9177946e 100644
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@@ -120,15 +120,9 @@ def convert_org_entries_to_jsonl(entries, verbose=0):
         if verbose > 2:
             print(f"Body: {entry.Body()}")

-        for property_key in ('ID', 'QUERY', 'TYPE', 'CATEGORY'):
-            if entry.Property(property_key):
-                if 'Property' not in entry_dict:
-                    entry_dict['Property'] = dict()
-                entry_dict['Property'][property_key] = entry.Property(property_key)
-                if verbose > 2:
-                    print(f'Property: {entry_dict["PROPERTY"][property_key]}')
-
-        if entry_dict:
+        entry_dict["Raw"] = f'{entry}'
+
+        # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'

diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py
index 838b19c5..a575b3ee 100644
--- a/src/processor/org_mode/orgnode.py
+++ b/src/processor/org_mode/orgnode.py
@@ -77,7 +77,7 @@ def makelist(filename):
                 deadline_date = ''
                 thisNode.setProperties(propdict)
                 nodelist.append( thisNode )
-            propdict = dict()
+            propdict = {'SOURCE': f'file:{filename}::{ctr}'}
             level = hdng.group(1)
             heading = hdng.group(2)
             bodytext = ""
@@ -325,8 +325,14 @@ class Orgnode(object):
             n = n + ':' + t
          closecolon = ':'
       n = n + closecolon
-# Need to output Scheduled Date, Deadline Date, property tags The
-# following will output the text used to construct the object
-      n = n + "\n" + self.body
+      # Need to output Scheduled Date, Deadline Date, property tags The
+      # following will output the text used to construct the object
+      n = n + "\n"
+      n = n + ":PROPERTIES:\n"
+      for key, value in self.properties.items():
+         n = n + f":{key}: {value}\n"
+      n = n + ":END:\n"
+
+      n = n + self.body

       return n

diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py
index 656d1fb3..17c5367e 100644
--- a/src/search_type/asymmetric.py
+++ b/src/search_type/asymmetric.py
@@ -39,7 +39,7 @@ def extract_entries(notesfile, verbose=0):
                 continue

             note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
+            entries.append([note_string, note["Raw"]])

     if verbose > 0:
         print(f"Loaded {len(entries)} entries from {notesfile}")
@@ -56,7 +56,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
             print(f"Loaded embeddings from {embeddings_file}")
     else:  # Else compute the corpus_embeddings from scratch, which can take a while
-        corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
+        corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
         if verbose > 0:
             print(f"Computed embeddings and save them to {embeddings_file}")
@@ -79,12 +79,12 @@ def query_notes(raw_query, corpus_embeddings, entries, bi_encoder, cross_encoder
     hits = hits[0]  # Get the hits for the first query

     # Filter results using explicit filters
-    hits = explicit_filter(hits, entries, required_words, blocked_words)
+    hits = explicit_filter(hits, [entry[0] for entry in entries], required_words, blocked_words)
     if hits is None or len(hits) == 0:
         return hits

     # Score all retrieved entries using the cross-encoder
-    cross_inp = [[query, entries[hit['corpus_id']]] for hit in hits]
+    cross_inp = [[query, entries[hit['corpus_id']][0]] for hit in hits]
     cross_scores = cross_encoder.predict(cross_inp)

     # Store cross-encoder scores in results dictionary for ranking
@@ -127,20 +127,20 @@ def render_results(hits, entries, count=5, display_biencoder_results=False):
         print(f"Top-{count} Bi-Encoder Retrieval hits")
         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
         for hit in hits[0:count]:
-            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]}")
+            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']][0]}")

     # Output of top hits from re-ranker
     print("\n-------------------------\n")
     print(f"Top-{count} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
     for hit in hits[0:count]:
-        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]}")
+        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']][0]}")


 def collate_results(hits, entries, count=5):
     return [
         {
-            "Entry": entries[hit['corpus_id']],
+            "Entry": entries[hit['corpus_id']][1],
             "Score": f"{hit['cross-score']:.3f}"
         }
         for hit