Move application files under src directory. Update Readmes

- Remove command for calling asymmetric search script directly.
  It doesn't work anymore on calling directly due to internal package
  import issues
This commit is contained in:
Debanjum Singh Solanky
2021-08-17 01:25:12 -07:00
parent c35c6fb0b3
commit af9660f28e
16 changed files with 3 additions and 13 deletions

0
src/__init__.py Normal file
View File

View File

@@ -0,0 +1,44 @@
* Emacs Semantic Search
/An Emacs interface for [[https://github.com/debanjum/semantic-search][semantic-search]]/
** Requirements
- Install and Run [[https://github.com/debanjum/semantic-search][semantic-search]]
** Installation
- Direct Install
- Put ~semantic-search.el~ in your Emacs load path, e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Org-Semantic Search Library
(use-package semantic-search
:load-path "~/.emacs.d/lisp/semantic-search.el"
:bind ("C-c s" . 'semantic-search))
#+end_src
- Use [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Org-Semantic Search Library
(use-package semantic-search
:quelpa (semantic-search :fetcher url :url "https://raw.githubusercontent.com/debanjum/semantic-search/master/interface/emacs/semantic-search.el")
:bind ("C-c s" . 'semantic-search))
#+end_src
** Usage
1. Call ~semantic-search~ using keybinding ~C-c s~ or ~M-x semantic-search~
2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none"

View File

@@ -0,0 +1,103 @@
;;; semantic-search.el --- Semantic search via Emacs
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Version: 0.1
;; Keywords: search, org-mode, outlines
;; URL: http://github.com/debanjum/semantic-search/interface/emacs
;; This file is NOT part of GNU Emacs.
;;; License:
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License
;; as published by the Free Software Foundation; either version 3
;; of the License, or (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; This package provides semantic search on org-mode files
;; It is a wrapper that interfaces with transformer based ML model
;; The models semantic search capabilities are exposed via an HTTP API
;;; Code:
(require 'url)
(require 'json)
(defcustom semantic-search--server-url "http://localhost:8000"
"Location of semantic search API server."
:group 'semantic-search
:type 'string)
(defun semantic-search--extract-entries-as-org (json-response)
  "Render entries from JSON-RESPONSE as org-mode headings.
Each result's Entry field becomes a top-level org heading."
  ;; remove leading (, ) or SPC from extracted entries string
  (replace-regexp-in-string
   "^[\(\) ]" ""
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
            json-response))))
(defun semantic-search--extract-entries-as-ledger (json-response)
  "Render entries from JSON-RESPONSE as ledger entries.
NOTE(review): currently byte-identical to
`semantic-search--extract-entries-as-org'; kept separate so ledger
rendering can diverge later."
  ;; remove leading (, ) or SPC from extracted entries string
  (replace-regexp-in-string
   "^[\(\) ]" ""
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
            json-response))))
(defun semantic-search--buffer-name-to-search-type (buffer-name)
  "Infer the search type from BUFFER-NAME's file extension.
Beancount (.bean) buffers search \"ledger\" entries; every other
buffer, including org files, searches \"notes\"."
  (let ((file-extension (file-name-extension buffer-name)))
    (cond
     ((equal file-extension "bean") "ledger")
     ((equal file-extension "org") "notes")
     (t "notes"))))
(defun semantic-search--construct-api-query (query search-type)
  "Construct the /search API URL for QUERY with SEARCH-TYPE.
QUERY is URL-encoded before being embedded in the request."
  (let ((encoded-query (url-hexify-string query)))
    (format "%s/search?q=%s&t=%s" semantic-search--server-url encoded-query search-type)))
(defun semantic-search (query)
  "Semantic search on org-mode content via semantic-search API.
Queries the API server for QUERY and renders the results in a
read-only *semantic-search* buffer, using a major mode suited to
the search type inferred from the current buffer's file extension."
  (interactive "sQuery: ")
  (let* ((search-type (semantic-search--buffer-name-to-search-type (buffer-name)))
         (url (semantic-search--construct-api-query query search-type))
         (buff (get-buffer-create "*semantic-search*")))
    ;; get json response from api
    (with-current-buffer buff
      (let ((inhibit-read-only t))
        (erase-buffer)
        (url-insert-file-contents url)))
    ;; convert json response to org-mode entries
    (with-current-buffer buff
      (let ((inhibit-read-only t)
            (json-response (json-parse-buffer :object-type 'alist)))
        (erase-buffer)
        (insert
         (cond ((equal search-type "notes") (semantic-search--extract-entries-as-org json-response))
               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response))
               (t (format "%s" json-response)))))
      ;; pick major mode matching rendered content, then lock the buffer
      (cond ((equal search-type "notes") (org-mode))
            (t (fundamental-mode)))
      (read-only-mode t))
    (switch-to-buffer buff)))
(provide 'semantic-search)
;;; semantic-search.el ends here

69
src/main.py Normal file
View File

@@ -0,0 +1,69 @@
from typing import Optional
from fastapi import FastAPI
from search_type import asymmetric
from processor.org_mode.org_to_jsonl import org_to_jsonl
from utils.helpers import is_none_or_empty
import argparse
import pathlib
import uvicorn
app = FastAPI()
@app.get('/search')
def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
    """Search indexed content for the natural-language query.

    Query Params:
        q: natural language query string (required)
        n: maximum number of results to return. Default: 5
        t: type of content to search; only 'notes' is implemented. Default: 'notes'

    Returns collated results as a list of entry/score dicts, or {} when
    the query is missing or the search type is unsupported.
    """
    # Guard: nothing to search for on a missing/empty query
    if not q:
        # fix: was an f-string with no placeholders
        print('No query param (q) passed in API call to initiate search')
        return {}
    user_query = q
    results_count = n
    if t == 'notes':
        # query notes
        # NOTE(review): relies on module globals (entries, corpus_embeddings,
        # bi_encoder, cross_encoder, top_k) initialized in the __main__ block
        hits = asymmetric.query_notes(
            user_query,
            corpus_embeddings,
            entries,
            bi_encoder,
            cross_encoder,
            top_k)
        # collate and return results
        return asymmetric.collate_results(hits, entries, results_count)
    else:
        return {}
@app.get('/regenerate')
def regenerate():
    """Regenerate notes embeddings from the configured org-mode files.

    Re-runs the extract + embed pipeline with regenerate=True, then swaps
    the module-level state read by /search.
    NOTE(review): relies on `args` parsed in the __main__ block below.
    """
    # Extract Entries, Generate Embeddings
    extracted_entries, computed_embeddings, _, _, _ = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, regenerate=True, verbose=args.verbose)
    # Now Update State
    # update state variables after regeneration complete
    # minimize time the application is in inconsistent, partially updated state
    global corpus_embeddings
    global entries
    entries = extracted_entries
    corpus_embeddings = computed_embeddings
    return {'status': 'ok', 'message': 'regeneration completed'}
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
    parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
    args = parser.parse_args()

    # Initialize the module-level search state (entries, embeddings, models)
    # that the /search and /regenerate endpoints above read
    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)

    # Start Application Server
    uvicorn.run(app)

View File

View File

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# Import Modules
from processor.org_mode import orgnode
from utils.helpers import get_absolute_path, is_none_or_empty
import json
import argparse
import pathlib
import glob
import gzip
# Define Functions
def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0):
    """Convert org-mode files selected by paths and/or glob filter to JSONL.

    Writes the JSONL to output_file — gzip-compressed when its suffix is
    .gz, plain when .jsonl (any other suffix is silently not written) —
    and returns the extracted org entries.
    """
    # Input Validation
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
        print("At least one of org-files or org-file-filter is required to be specified")
        exit(1)
    # Get Org Files to Process
    org_files = get_org_files(org_files, org_file_filter, verbose)
    # Extract Entries from specified Org files
    entries = extract_org_entries(org_files)
    # Process Each Entry from All Notes Files
    jsonl_data = convert_org_entries_to_jsonl(entries, verbose=verbose)
    # Compress JSONL formatted Data
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file, verbose=verbose)
    return entries
def dump_jsonl(jsonl_data, output_path, verbose=0):
    "Write the JSONL formatted string to a JSON line file at output_path"
    jsonl_path = get_absolute_path(output_path)
    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.write(jsonl_data)
    if verbose > 0:
        print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    "Write the JSONL formatted string to a gzip compressed file at output_path"
    with gzip.open(get_absolute_path(output_path), 'wt') as gzip_file:
        gzip_file.write(jsonl_data)
    if verbose > 0:
        print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
def load_jsonl(input_path, verbose=0):
    """Read List of JSON objects from JSON line file.

    Returns one parsed dict per non-empty line of the file at input_path.
    """
    data = []
    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # fix: rstrip('\n|\r') treated '|' as a character to strip too
            # (it was written like a regex alternation); strip newlines only
            data.append(json.loads(line.rstrip('\r\n')))
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')
    return data
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
    """Get Org files to process.

    Returns the union of explicitly listed file paths (made absolute) and
    files matching the glob-style org_file_filter pattern.
    """
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(org_file)
                              for org_file
                              in org_files}
    if org_file_filter:
        filtered_org_files = set(glob.glob(get_absolute_path(org_file_filter)))
    all_org_files = absolute_org_files | filtered_org_files
    # warn, but still process, files without a .org extension
    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(files_with_non_org_extensions):
        print(f"[Warning] There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
    if verbose > 0:
        print(f'Processing files: {all_org_files}')
    return all_org_files
def extract_org_entries(org_files):
    "Parse each Org file into a flat list of Orgnode entries"
    entries = []
    for org_file in org_files:
        entries += orgnode.makelist(str(org_file))
    return entries
def convert_org_entries_to_jsonl(entries, verbose=0):
    """Serialize each Org-Mode entry to a JSON object and collate as JSONL.

    Each line holds an object with a Title and, when present, Tags
    (space-joined) and Body fields.
    """
    jsonl_lines = []
    for entry in entries:
        entry_dict = dict()
        entry_dict["Title"] = entry.Heading()
        if verbose > 1:
            print(f"Title: {entry.Heading()}")
        if entry.Tags():
            tags_str = " ".join(entry.Tags())
            entry_dict["Tags"] = tags_str
            if verbose > 1:
                print(f"Tags: {tags_str}")
        if entry.Body():
            entry_dict["Body"] = entry.Body()
            if verbose > 2:
                print(f"Body: {entry.Body()}")
        if entry_dict:
            # Convert Dictionary to JSON and collect as a JSONL line
            jsonl_lines.append(f'{json.dumps(entry_dict, ensure_ascii=False)}\n')
    if verbose > 0:
        print(f"Converted {len(entries)} to jsonl format")
    return ''.join(jsonl_lines)
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Map Org-Mode notes into (compressed) JSONL format")
    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
    args = parser.parse_args()

    # Map notes in Org-Mode files to (compressed) JSONL formatted file
    org_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)

View File

@@ -0,0 +1,332 @@
# Copyright (c) 2010 Charles Cave
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Program written by Charles Cave (charlesweb@optusnet.com.au)
# February - March 2009
# Version 2 - June 2009
# Added support for all tags, TODO priority and checking existence of a tag
# More information at
# http://members.optusnet.com.au/~charles57/GTD
"""
The Orgnode module consists of the Orgnode class for representing a
headline and associated text from an org-mode file, and routines for
constructing data structures of these classes.
"""
import re, sys
import datetime
def makelist(filename):
    """
    Read an org-mode file and return a list of Orgnode objects
    created from this file.

    Exits the program if the file cannot be opened.
    """
    ctr = 0
    try:
        f = open(filename, 'r')
    except IOError:
        # fix: include the offending filename in the error message
        print(f"Unable to open file {filename}")
        print("Program terminating.")
        sys.exit(1)

    todos = {"TODO": "", "WAITING": "", "ACTIVE": "",
             "DONE": "", "CANCELLED": "", "FAILED": ""}  # populated from #+SEQ_TODO line
    level = 0
    heading = ""
    bodytext = ""
    tag1 = ""         # The first tag enclosed in ::
    alltags = []      # list of all tags in headline
    sched_date = ''
    deadline_date = ''
    nodelist = []
    propdict = dict()
    in_properties_drawer = False

    # fix: close the file handle deterministically (was never closed);
    # all regex patterns are now raw strings — '\s', '\(' etc. in plain
    # strings are invalid escape sequences (SyntaxWarning on modern Python)
    with f:
        for line in f:
            ctr += 1
            hdng = re.search(r'^(\*+)\s(.*?)\s*$', line)
            if hdng:
                if heading:  # we are processing a heading line
                    thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
                    if sched_date:
                        thisNode.setScheduled(sched_date)
                        sched_date = ""
                    if deadline_date:
                        thisNode.setDeadline(deadline_date)
                        deadline_date = ''
                    thisNode.setProperties(propdict)
                    nodelist.append(thisNode)
                    propdict = dict()
                level = hdng.group(1)
                heading = hdng.group(2)
                bodytext = ""
                tag1 = ""
                alltags = []  # list of all tags in headline
                tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):([a-zA-Z0-9].*?):$', heading)
                if tagsrch:
                    heading = tagsrch.group(1)
                    tag1 = tagsrch.group(2)
                    alltags.append(tag1)
                    tag2 = tagsrch.group(3)
                    if tag2:
                        for t in tag2.split(':'):
                            if t != '':
                                alltags.append(t)
            else:  # we are processing a non-heading line
                if line[:10] == '#+SEQ_TODO':
                    kwlist = re.findall(r'([A-Z]+)\(', line)
                    for kw in kwlist:
                        todos[kw] = ""
                # Ignore Properties Drawers Completely
                if re.search(':PROPERTIES:', line):
                    in_properties_drawer = True
                    continue
                if in_properties_drawer and re.search(':END:', line):
                    in_properties_drawer = False
                    continue
                # Ignore Clocking Lines
                if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
                    continue
                if not in_properties_drawer and line[:1] != '#':
                    bodytext = bodytext + line
                prop_srch = re.search(r'^\s*:(.*?):\s*(.*?)\s*$', line)
                if prop_srch:
                    propdict[prop_srch.group(1)] = prop_srch.group(2)
                    continue
                sd_re = re.search(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
                if sd_re:
                    sched_date = datetime.date(int(sd_re.group(1)),
                                               int(sd_re.group(2)),
                                               int(sd_re.group(3)))
                dd_re = re.search(r'DEADLINE:\s*<(\d+)\-(\d+)\-(\d+)', line)
                if dd_re:
                    deadline_date = datetime.date(int(dd_re.group(1)),
                                                  int(dd_re.group(2)),
                                                  int(dd_re.group(3)))

    # write out last node
    thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
    thisNode.setProperties(propdict)
    if sched_date:
        thisNode.setScheduled(sched_date)
    if deadline_date:
        thisNode.setDeadline(deadline_date)
    nodelist.append(thisNode)

    # using the list of TODO keywords found in the file
    # process the headings searching for TODO keywords
    for n in nodelist:
        h = n.Heading()
        todoSrch = re.search(r'([A-Z]+)\s(.*?)$', h)
        if todoSrch:
            if todoSrch.group(1) in todos:
                n.setHeading(todoSrch.group(2))
                n.setTodo(todoSrch.group(1))
        prtysrch = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.Heading())
        if prtysrch:
            n.setPriority(prtysrch.group(1))
            n.setHeading(prtysrch.group(2))
    return nodelist
######################
class Orgnode(object):
    """
    Orgnode class represents a headline, tags and text associated
    with the headline.
    """
    def __init__(self, level, headline, body, tag, alltags):
        """
        Create an Orgnode object given the parameters of level (as the
        raw asterisks), headline text (including the TODO tag), and
        first tag. The makelist routine postprocesses the list to
        identify TODO tags and updates headline and todo fields.
        """
        self.level = len(level)      # heading depth = number of leading asterisks
        self.headline = headline
        self.body = body
        self.tag = tag               # The first tag in the list
        self.tags = dict()           # All tags in the headline
        self.todo = ""
        self.prty = ""               # empty of A, B or C
        self.scheduled = ""          # Scheduled date
        self.deadline = ""           # Deadline date
        self.properties = dict()
        for t in alltags:
            self.tags[t] = ''
        # Look for priority in headline and transfer to prty field

    def Heading(self):
        """
        Return the Heading text of the node without the TODO tag
        """
        return self.headline

    def setHeading(self, newhdng):
        """
        Change the heading to the supplied string
        """
        self.headline = newhdng

    def Body(self):
        """
        Returns all lines of text of the body of this node except the
        Property Drawer
        """
        return self.body

    def Level(self):
        """
        Returns an integer corresponding to the level of the node.
        Top level (one asterisk) has a level of 1.
        """
        return self.level

    def Priority(self):
        """
        Returns the priority of this headline: 'A', 'B', 'C' or empty
        string if priority has not been set.
        """
        return self.prty

    def setPriority(self, newprty):
        """
        Change the value of the priority of this headline.
        Values values are '', 'A', 'B', 'C'
        """
        self.prty = newprty

    def Tag(self):
        """
        Returns the value of the first tag.
        For example, :HOME:COMPUTER: would return HOME
        """
        return self.tag

    def Tags(self):
        """
        Returns a view of all tags (dict keys, iterable like a list).
        For example, :HOME:COMPUTER: would yield ['HOME', 'COMPUTER']
        """
        return self.tags.keys()

    def hasTag(self, srch):
        """
        Returns True if the supplied tag is present in this headline
        For example, hasTag('COMPUTER') on headling containing
        :HOME:COMPUTER: would return True.
        """
        return srch in self.tags

    def setTag(self, newtag):
        """
        Change the value of the first tag to the supplied string
        """
        self.tag = newtag

    def setTags(self, taglist):
        """
        Store all the tags found in the headline. The first tag will
        also be stored as if the setTag method was called.
        """
        for t in taglist:
            self.tags[t] = ''

    def Todo(self):
        """
        Return the value of the TODO tag
        """
        return self.todo

    def setTodo(self, value):
        """
        Set the value of the TODO tag to the supplied string
        """
        self.todo = value

    def setProperties(self, dictval):
        """
        Sets all properties using the supplied dictionary of
        name/value pairs
        """
        self.properties = dictval

    def Property(self, keyval):
        """
        Returns the value of the requested property or null if the
        property does not exist.
        """
        return self.properties.get(keyval, "")

    def setScheduled(self, dateval):
        """
        Set the scheduled date using the supplied date object
        """
        self.scheduled = dateval

    def Scheduled(self):
        """
        Return the scheduled date object or null if nonexistent
        """
        return self.scheduled

    def setDeadline(self, dateval):
        """
        Set the deadline (due) date using the supplied date object
        """
        self.deadline = dateval

    def Deadline(self):
        """
        Return the deadline date object or null if nonexistent
        """
        return self.deadline

    def __repr__(self):
        """
        Print the level, heading text and tag of a node and the body
        text as used to construct the node.
        """
        # This method is not completed yet.
        n = ''
        for i in range(0, self.level):
            n = n + '*'
        n = n + ' ' + self.todo + ' '
        if self.prty:
            n = n + '[#' + self.prty + '] '
        n = n + self.headline
        n = "%-60s " % n     # hack - tags will start in column 62
        closecolon = ''
        for t in self.tags.keys():
            n = n + ':' + t
            closecolon = ':'
        n = n + closecolon
        # Need to output Scheduled Date, Deadline Date, property tags The
        # following will output the text used to construct the object
        n = n + "\n" + self.body
        return n

View File

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import time
import gzip
import os
import sys
import re
import torch
import argparse
import pathlib
from utils.helpers import get_absolute_path
from processor.org_mode.org_to_jsonl import org_to_jsonl
def initialize_model():
    "Initialize models for asymmetric semantic search. That is, where query smaller than results"
    bi_encoder = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-6-v3')  # The bi-encoder encodes all entries to use for semantic search
    top_k = 100  # Number of entries we want to retrieve with the bi-encoder
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')  # The cross-encoder re-ranks the results to improve quality
    return bi_encoder, cross_encoder, top_k
def extract_entries(notesfile, verbose=0):
    "Load searchable note strings from the compressed jsonl file at notesfile"
    entries = []
    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
        for line in jsonl:
            note = json.loads(line.strip())
            # Ignore title notes i.e notes with just headings and empty body
            if "Body" not in note or note["Body"].strip() == "":
                continue
            tags = note["Tags"] if "Tags" in note else ""
            entries.append(f'{note["Title"]}\t{tags}\n{note["Body"]}')
    if verbose > 0:
        print(f"Loaded {len(entries)} entries from {notesfile}")
    return entries
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
    """Compute (and Save) Embeddings or Load Pre-Computed Embeddings.

    Loads cached embeddings from embeddings_file unless it is missing or
    regenerate is set, in which case the entries are re-encoded with the
    bi-encoder and the result is written back to embeddings_file.
    """
    # Load pre-computed embeddings from file if exists
    if embeddings_file.exists() and not regenerate:
        corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
        if verbose > 0:
            print(f"Loaded embeddings from {embeddings_file}")
    else:  # Else compute the corpus_embeddings from scratch, which can take a while
        corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
        torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
        if verbose > 0:
            print(f"Computed embeddings and save them to {embeddings_file}")
    return corpus_embeddings
def query_notes(raw_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k=100):
    """Search all notes for entries that answer the query.

    Words prefixed with '+'/'-' in raw_query are treated as explicit
    required/blocked word filters; the rest is the semantic query.
    Returns hit dicts (corpus_id, score, cross-score) ranked primarily by
    cross-encoder score, then by bi-encoder score.
    """
    # Separate natural query from explicit required, blocked words filters
    query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")])
    required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
    blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")])

    # Encode the query using the bi-encoder
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    # Find relevant entries for the query
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    # Filter results using explicit filters
    hits = explicit_filter(hits, entries, required_words, blocked_words)
    if hits is None or len(hits) == 0:
        return hits

    # Score all retrieved entries using the cross-encoder
    cross_inp = [[query, entries[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Store cross-encoder scores in results dictionary for ranking
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Order results by cross encoder score followed by biencoder score
    # (stable sorts: the second sort is the primary ranking key)
    hits.sort(key=lambda x: x['score'], reverse=True)        # sort by biencoder score
    hits.sort(key=lambda x: x['cross-score'], reverse=True)  # sort by cross encoder score
    return hits
def explicit_filter(hits, entries, required_words, blocked_words):
    """Filter hits using the explicit word filters from the query.

    required_words: lowercased words that must appear in a hit's entry
    blocked_words: lowercased words that must not appear in a hit's entry
    Returns the matching hits in their original order; when no filters
    are given, returns hits unchanged.
    """
    # No explicit filters: skip splitting every entry into words
    # (the original computed the word sets before this early return)
    if len(required_words) == 0 and len(blocked_words) == 0:
        return hits

    # Map each hit to the set of lowercased words in its entry.
    # fix: the original pattern contained '\[\(' which only matched the
    # literal two-char sequence "[(" — '[' and '(' alone never split words
    word_delimiters = r',|\.| |\]|\[|\(|\)|\{|\}'
    hits_by_word_set = [(set(word.lower()
                             for word
                             in re.split(word_delimiters, entries[hit['corpus_id']])
                             if word != ""),
                         hit)
                        for hit in hits]

    if len(required_words) > 0:
        return [hit for (words_in_entry, hit) in hits_by_word_set
                if required_words.intersection(words_in_entry) and not blocked_words.intersection(words_in_entry)]
    if len(blocked_words) > 0:
        return [hit for (words_in_entry, hit) in hits_by_word_set
                if not blocked_words.intersection(words_in_entry)]
    return hits
def render_results(hits, entries, count=5, display_biencoder_results=False):
    """Render the Results returned by Search for the Query.

    Prints the top count cross-encoder ranked entries to stdout; when
    display_biencoder_results is set, the bi-encoder ranking is printed
    first for comparison.
    """
    if display_biencoder_results:
        # Output of top hits from bi-encoder
        print("\n-------------------------\n")
        print(f"Top-{count} Bi-Encoder Retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        for hit in hits[0:count]:
            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]}")

    # Output of top hits from re-ranker
    print("\n-------------------------\n")
    print(f"Top-{count} Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:count]:
        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]}")
def collate_results(hits, entries, count=5):
    "Collate the top count hits into Entry/Score dicts for the API response"
    collated = []
    for hit in hits[0:count]:
        collated.append({
            "Entry": entries[hit['corpus_id']],
            "Score": f"{hit['cross-score']:.3f}",
        })
    return collated
def setup(input_files, input_filter, compressed_jsonl, embeddings, regenerate=False, verbose=False):
    """Prepare everything needed to answer search queries.

    Converts org files to JSONL (when missing or regenerate is set),
    loads the entries and computes/loads their embeddings.
    Returns (entries, corpus_embeddings, bi_encoder, cross_encoder, top_k).
    """
    # Initialize Model
    bi_encoder, cross_encoder, top_k = initialize_model()

    # Map notes in Org-Mode files to (compressed) JSONL formatted file
    if not compressed_jsonl.exists() or regenerate:
        org_to_jsonl(input_files, input_filter, compressed_jsonl, verbose)

    # Extract Entries
    entries = extract_entries(compressed_jsonl, verbose)

    # Compute or Load Embeddings
    corpus_embeddings = compute_embeddings(entries, bi_encoder, embeddings, regenerate=regenerate, verbose=verbose)

    return entries, corpus_embeddings, bi_encoder, cross_encoder, top_k
if __name__ == '__main__':
    # Setup Argument Parser
    # NOTE(review): description string looks copied from the org_to_jsonl script — confirm
    parser = argparse.ArgumentParser(description="Map Org-Mode notes into (compressed) JSONL format")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
    parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
    parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
    parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
    args = parser.parse_args()

    # Build search state: entries, embeddings and ranking models
    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)

    # Run User Queries on Entries in Interactive Mode
    while args.interactive:
        # get query from user
        user_query = input("Enter your query: ")
        if user_query == "exit":
            exit(0)
        # query notes
        hits = query_notes(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)
        # render results
        render_results(hits, entries, count=args.results_count)

View File

@@ -0,0 +1,112 @@
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import torch
import argparse
import pathlib
import copy
def initialize_model():
    "Initialize the CLIP model used for text-to-image and image-to-image search"
    # Initialize Model
    torch.set_num_threads(4)
    top_k = 3  # default number of images to retrieve
    model = SentenceTransformer('clip-ViT-B-32')  # Load the CLIP model
    return model, top_k
def extract_entries(image_directory, verbose=False):
    "Collect the jpg images under image_directory"
    image_names = list(image_directory.glob('*.jpg'))
    if verbose:
        print(f'Found {len(image_names)} images in {image_directory}')
    return image_names
def compute_embeddings(image_names, model, embeddings_file, verbose=False):
    """Compute (and Save) Embeddings or Load Pre-Computed Embeddings.

    Loads cached embeddings from embeddings_file when it exists; otherwise
    encodes the images at image_names and caches the result.
    Raises ValueError when there are no images and no cached embeddings.
    """
    # Load pre-computed embeddings from file if exists
    if embeddings_file.exists():
        image_embeddings = torch.load(embeddings_file)
        if verbose:
            print(f"Loaded pre-computed embeddings from {embeddings_file}")
    else:  # Else compute the image_embeddings from scratch, which can take a while
        images = []
        if verbose:
            print(f"Loading the {len(image_names)} images into memory")
        for image_name in image_names:
            images.append(copy.deepcopy(Image.open(image_name)))
        if len(images) == 0:
            # fix: previously `image_embeddings` was left unbound on this path
            # and the final return raised UnboundLocalError; fail clearly instead
            raise ValueError(f"No images to embed and no pre-computed embeddings at {embeddings_file}")
        image_embeddings = model.encode(images, batch_size=128, convert_to_tensor=True, show_progress_bar=True)
        torch.save(image_embeddings, embeddings_file)
        if verbose:
            print(f"Saved computed embeddings to {embeddings_file}")
    return image_embeddings
def search(query, image_embeddings, model, count=3, verbose=False):
    """Find the top count images matching the query by cosine similarity.

    query may be free text or a path to an image file; an existing image
    path triggers image-to-image search instead of text-to-image.
    """
    # Set query to image content if query is a filepath
    if pathlib.Path(query).expanduser().is_file():
        query_imagepath = pathlib.Path(query).expanduser().resolve(strict=True)
        query = copy.deepcopy(Image.open(query_imagepath))
        if verbose:
            print(f"Find Images similar to Image at {query_imagepath}")
    else:
        print(f"Find Images by Text: {query}")

    # Now we encode the query (which can either be an image or a text string)
    query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine-similarity
    # between the query embedding and all image embeddings.
    # It then returns the top_k highest ranked images, which we output
    hits = util.semantic_search(query_embedding, image_embeddings, top_k=count)[0]
    return hits
def render_results(hits, image_names, image_directory, count):
    "Print the filenames of the top `count` hits and open each image in a viewer"
    for result in hits[:count]:
        matched_name = image_names[result['corpus_id']]
        print(matched_name)
        with Image.open(image_directory.joinpath(matched_name)) as img:
            img.show()
if __name__ == '__main__':
    # CLI entry point: semantic (text or image) search over an image directory
    parser = argparse.ArgumentParser(description="Semantic Search on Images")
    parser.add_argument('--image-directory', '-i', required=True, type=pathlib.Path, help="Image directory to query")
    parser.add_argument('--embeddings-file', '-e', default='embeddings.pt', type=pathlib.Path, help="File to save/load model embeddings to/from. Default: ./embeddings.pt")
    parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
    parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
    parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
    args = parser.parse_args()

    # Normalize user-supplied paths to absolute paths before use
    embeddings_file = args.embeddings_file.expanduser().resolve()
    image_directory = args.image_directory.expanduser().resolve(strict=True)

    # Load model, gather image paths, then compute or load their embeddings
    model, count = initialize_model()
    image_names = extract_entries(image_directory, args.verbose)
    image_embeddings = compute_embeddings(image_names, model, embeddings_file, args.verbose)

    # Interactive query loop; type "exit" to quit
    while args.interactive:
        query = input("Enter your query: ")
        if query == "exit":
            exit(0)
        hits = search(query, image_embeddings, model, args.results_count, args.verbose)
        render_results(hits, image_names, image_directory, count=args.results_count)

View File

@@ -0,0 +1,97 @@
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import argparse
import os
def create_index(
        model,
        dataset_path,
        index_path,
        column_name,
        recreate):
    """Load the CSV dataset and return (faiss_index, dataset).

    Reuses an existing on-disk index at `index_path` unless `recreate` is set,
    in which case the embeddings are recomputed and the index rewritten.
    """
    # Load and clean the dataset: drop missing rows, strip the indexed column
    dataset = pd.read_csv(dataset_path).dropna()
    dataset[column_name] = dataset[column_name].str.strip()

    if not recreate and os.path.exists(index_path):
        # Reuse previously built index
        index = faiss.read_index(index_path)
    else:
        # Encode each document in the indexed column into a float32 matrix
        vectors = model.encode(dataset[column_name].to_list(), show_progress_bar=True)
        vectors = np.array([vector for vector in vectors]).astype("float32")
        # L2 index wrapped with an ID map so hits carry the dataset row index
        index = faiss.IndexIDMap(faiss.IndexFlatL2(vectors.shape[1]))
        index.add_with_ids(vectors, dataset.index.values)
        faiss.write_index(index, index_path)

    return index, dataset
def resolve_column(dataset, Id, column):
    "Map each id in the first row of `Id` to the matching `column` values in `dataset`"
    resolved = []
    for idx in Id[0]:
        resolved.append(list(dataset[dataset.index == idx][column]))
    return resolved
def vector_search(query, index, dataset, column_name, num_results=10):
    "Search `index` for `query`; yield (distance, row id, column values) triples"
    query_vector = np.array(query).astype("float32")
    distances, matched_ids = index.search(query_vector, k=num_results)
    resolved = resolve_column(dataset, matched_ids, column_name)
    return zip(distances[0], matched_ids[0], resolved)
if __name__ == '__main__':
    # CLI entry point: rank dataset rows against include/exclude preferences
    parser = argparse.ArgumentParser(description="Find most suitable match based on users exclude, include preferences")
    parser.add_argument('positives', type=str, help="Terms to find closest match to")
    parser.add_argument('--negatives', '-n', type=str, help="Terms to find farthest match from")
    parser.add_argument('--recreate', action='store_true', default=False, help="Recreate index at index_path from dataset at dataset path")
    parser.add_argument('--index', type=str, default="./.faiss_index", help="Path to index for storing vector embeddings")
    parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from")
    parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index")
    parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show")
    parser.add_argument('--model_name', type=str, default='paraphrase-distilroberta-base-v1', help="Specify name of the SentenceTransformer model to use for encoding")
    args = parser.parse_args()

    model = SentenceTransformer(args.model_name)

    if args.positives:
        # Get index, create it from dataset if doesn't exist
        index, dataset = create_index(model, args.dataset, args.index, args.column, args.recreate)

        # Build the query vector from the user's stated preferences:
        # liked terms alone, or liked minus disliked when negatives are given
        if args.negatives:
            liked = np.array(model.encode([args.positives])).astype("float32")
            disliked = np.array(model.encode([args.negatives])).astype("float32")
            preference_vector = np.add(liked, -1 * disliked)
        else:
            preference_vector = model.encode([args.positives])

        # Find and display most suitable matches for users preferences in the dataset
        results = vector_search(preference_vector, index, dataset, args.column, args.num_results)
        print("Most Suitable Matches:")
        for similarity, id_, data in results:
            print(f"Id: {id_}\nSimilarity: {similarity}\n{args.column}: {data[0]}")

0
src/utils/__init__.py Normal file
View File

9
src/utils/helpers.py Normal file
View File

@@ -0,0 +1,9 @@
import pathlib
def is_none_or_empty(item):
    """Return True if item is None or an empty iterable (e.g. '', [], {}, ())."""
    # Fix: use `is None` instead of `== None` — identity comparison is the
    # correct check and cannot be hijacked by a custom __eq__ implementation
    return item is None or (hasattr(item, '__iter__') and len(item) == 0)
def get_absolute_path(filepath):
    "Expand ~ in filepath and return its absolute path as a string"
    expanded = pathlib.Path(filepath).expanduser()
    return str(expanded.absolute())

57
src/utils/install.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import pathlib
import argparse
import os
import stat
def get_absolute(path):
    "Expand ~ in the given pathlib.Path and return it as an absolute Path"
    expanded = path.expanduser()
    return expanded.absolute()
def create_script(filepath, content):
    """Write `content` to `filepath` and make the resulting script executable."""
    absolute_install_path = str(get_absolute(filepath))
    # Fix: the original passed 'w' to get_absolute instead of open(), and
    # wrote the undefined global `run_script_content` instead of `content`
    with open(absolute_install_path, 'w') as run_script:
        run_script.write(content)
    # Make Script Executable (add owner execute bit)
    st = os.stat(absolute_install_path)
    os.chmod(absolute_install_path, st.st_mode | stat.S_IEXEC)
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Setup the semantic search program")
    parser.add_argument('--script-dir', '-s', default="./", type=pathlib.Path, help="The project directory. Default: Current Directory")
    parser.add_argument('--install-dir', '-i', default="./", type=pathlib.Path, help="The directory to install the script. Default: Current Directory")
    parser.add_argument('--model-dir', '-m', default="./", type=pathlib.Path, help="The directory to store the model in. Default: Current Directory")
    args = parser.parse_args()

    run_server_content = f'''#!/bin/bash

# Arrange
eval "$(conda shell.bash hook)"
conda activate semantic-search
cd {get_absolute(args.script_dir)}

# Act
python3 search_types/asymmetric.py -j {get_absolute(args.model_dir)}/notes.jsonl.gz -e {get_absolute(args.model_dir)}/notes_embeddings.pt -n 5 --interactive
'''

    search_cmd_content = f'''#!/bin/bash

# Arrange
eval "$(conda shell.bash hook)"
conda activate semantic-search
cd {get_absolute(args.script_dir)}

# Act
python3 main.py -j {get_absolute(args.model_dir)}/notes.jsonl.gz -e {get_absolute(args.model_dir)}/notes_embeddings.pt
'''

    # Fix: the original referenced the non-existent `args.install_path`
    # (the flag is --install-dir) and both calls had unbalanced parentheses
    # (syntax errors). Join paths with pathlib instead of string concatenation.
    # Create single command to start API server exposing HTTP interface
    create_script(args.install_dir.joinpath("run_server"), run_server_content)
    # Create single command for interactive queries over commandline
    create_script(args.install_dir.joinpath("semantic-search"), search_cmd_content)