Move application files under src directory. Update Readmes

- Remove command for calling asymmetric search script directly.
  It doesn't work anymore on calling directly due to internal package
  import issues
This commit is contained in:
Debanjum Singh Solanky
2021-08-17 01:25:12 -07:00
parent c35c6fb0b3
commit af9660f28e
16 changed files with 3 additions and 13 deletions

0
src/__init__.py Normal file
View File

View File

@@ -0,0 +1,44 @@
* Emacs Semantic Search
/An Emacs interface for [[https://github.com/debanjum/semantic-search][semantic-search]]/
** Requirements
- Install and Run [[https://github.com/debanjum/semantic-search][semantic-search]]
** Installation
- Direct Install
- Put ~semantic-search.el~ in your Emacs load path, e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Org-Semantic Search Library
(use-package semantic-search
:load-path "~/.emacs.d/lisp/semantic-search.el"
:bind ("C-c s" . 'semantic-search))
#+end_src
- Use [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Org-Semantic Search Library
(use-package semantic-search
:quelpa (semantic-search :fetcher url :url "https://raw.githubusercontent.com/debanjum/semantic-search/master/interface/emacs/semantic-search.el")
:bind ("C-c s" . 'semantic-search))
#+end_src
** Usage
1. Call ~semantic-search~ using keybinding ~C-c s~ or ~M-x semantic-search~
2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none"

View File

@@ -0,0 +1,103 @@
;;; semantic-search.el --- Semantic search via Emacs
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Version: 0.1
;; Keywords: search, org-mode, outlines
;; URL: http://github.com/debanjum/semantic-search/interface/emacs
;; This file is NOT part of GNU Emacs.
;;; License:
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License
;; as published by the Free Software Foundation; either version 3
;; of the License, or (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; This package provides semantic search on org-mode files
;; It is a wrapper that interfaces with transformer based ML model
;; The models semantic search capabilities are exposed via an HTTP API
;;; Code:
(require 'url)
(require 'json)
(defcustom semantic-search--server-url "http://localhost:8000"
"Location of semantic search API server."
:group 'semantic-search
:type 'string)
(defun semantic-search--extract-entries-as-org (json-response)
  "Render entries from JSON-RESPONSE as org-mode headings.
Each result's Entry field becomes a top-level org heading."
  ;; remove leading (, ) or SPC from extracted entries string
  (replace-regexp-in-string
   "^[\(\) ]" ""
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
            json-response))))
(defun semantic-search--extract-entries-as-ledger (json-response)
  "Render entries from JSON-RESPONSE as ledger entries.
NOTE(review): currently byte-identical to
`semantic-search--extract-entries-as-org'; kept separate so ledger
rendering can diverge later."
  ;; remove leading (, ) or SPC from extracted entries string
  (replace-regexp-in-string
   "^[\(\) ]" ""
   ;; extract entries from response as single string and convert to entries
   (format "%s"
           (mapcar
            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
            json-response))))
(defun semantic-search--buffer-name-to-search-type (buffer-name)
  "Infer the search type from BUFFER-NAME's file extension.
Beancount (.bean) buffers search \"ledger\" entries; every other
buffer, including org files, searches \"notes\"."
  (let ((file-extension (file-name-extension buffer-name)))
    (cond
     ((equal file-extension "bean") "ledger")
     ((equal file-extension "org") "notes")
     (t "notes"))))
(defun semantic-search--construct-api-query (query search-type)
  "Construct the /search API URL for QUERY with SEARCH-TYPE.
QUERY is URL-encoded before being embedded in the request."
  (let ((encoded-query (url-hexify-string query)))
    (format "%s/search?q=%s&t=%s" semantic-search--server-url encoded-query search-type)))
(defun semantic-search (query)
  "Semantic search on org-mode content via semantic-search API.
Queries the API server for QUERY and renders the results in a
read-only *semantic-search* buffer, using a major mode suited to
the search type inferred from the current buffer's file extension."
  (interactive "sQuery: ")
  (let* ((search-type (semantic-search--buffer-name-to-search-type (buffer-name)))
         (url (semantic-search--construct-api-query query search-type))
         (buff (get-buffer-create "*semantic-search*")))
    ;; get json response from api
    (with-current-buffer buff
      (let ((inhibit-read-only t))
        (erase-buffer)
        (url-insert-file-contents url)))
    ;; convert json response to org-mode entries
    (with-current-buffer buff
      (let ((inhibit-read-only t)
            (json-response (json-parse-buffer :object-type 'alist)))
        (erase-buffer)
        (insert
         (cond ((equal search-type "notes") (semantic-search--extract-entries-as-org json-response))
               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response))
               (t (format "%s" json-response)))))
      ;; pick major mode matching rendered content, then lock the buffer
      (cond ((equal search-type "notes") (org-mode))
            (t (fundamental-mode)))
      (read-only-mode t))
    (switch-to-buffer buff)))
(provide 'semantic-search)
;;; semantic-search.el ends here

69
src/main.py Normal file
View File

@@ -0,0 +1,69 @@
from typing import Optional
from fastapi import FastAPI
from search_type import asymmetric
from processor.org_mode.org_to_jsonl import org_to_jsonl
from utils.helpers import is_none_or_empty
import argparse
import pathlib
import uvicorn
app = FastAPI()
@app.get('/search')
def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
    """Search indexed content for the natural-language query.

    Query Params:
        q: natural language query string (required)
        n: maximum number of results to return. Default: 5
        t: type of content to search; only 'notes' is implemented. Default: 'notes'

    Returns collated results as a list of entry/score dicts, or {} when
    the query is missing or the search type is unsupported.
    """
    # Guard: nothing to search for on a missing/empty query
    if not q:
        # fix: was an f-string with no placeholders
        print('No query param (q) passed in API call to initiate search')
        return {}
    user_query = q
    results_count = n
    if t == 'notes':
        # query notes
        # NOTE(review): relies on module globals (entries, corpus_embeddings,
        # bi_encoder, cross_encoder, top_k) initialized in the __main__ block
        hits = asymmetric.query_notes(
            user_query,
            corpus_embeddings,
            entries,
            bi_encoder,
            cross_encoder,
            top_k)
        # collate and return results
        return asymmetric.collate_results(hits, entries, results_count)
    else:
        return {}
@app.get('/regenerate')
def regenerate():
    """Regenerate notes embeddings from the configured org-mode files.

    Re-runs the extract + embed pipeline with regenerate=True, then swaps
    the module-level state read by /search.
    NOTE(review): relies on `args` parsed in the __main__ block below.
    """
    # Extract Entries, Generate Embeddings
    extracted_entries, computed_embeddings, _, _, _ = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, regenerate=True, verbose=args.verbose)
    # Now Update State
    # update state variables after regeneration complete
    # minimize time the application is in inconsistent, partially updated state
    global corpus_embeddings
    global entries
    entries = extracted_entries
    corpus_embeddings = computed_embeddings
    return {'status': 'ok', 'message': 'regeneration completed'}
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
    parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
    args = parser.parse_args()

    # Initialize the module-level search state (entries, embeddings, models)
    # that the /search and /regenerate endpoints above read
    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)

    # Start Application Server
    uvicorn.run(app)

View File

View File

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# Import Modules
from processor.org_mode import orgnode
from utils.helpers import get_absolute_path, is_none_or_empty
import json
import argparse
import pathlib
import glob
import gzip
# Define Functions
def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0):
    """Convert org-mode files selected by paths and/or glob filter to JSONL.

    Writes the JSONL to output_file — gzip-compressed when its suffix is
    .gz, plain when .jsonl (any other suffix is silently not written) —
    and returns the extracted org entries.
    """
    # Input Validation
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
        print("At least one of org-files or org-file-filter is required to be specified")
        exit(1)
    # Get Org Files to Process
    org_files = get_org_files(org_files, org_file_filter, verbose)
    # Extract Entries from specified Org files
    entries = extract_org_entries(org_files)
    # Process Each Entry from All Notes Files
    jsonl_data = convert_org_entries_to_jsonl(entries, verbose=verbose)
    # Compress JSONL formatted Data
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file, verbose=verbose)
    return entries
def dump_jsonl(jsonl_data, output_path, verbose=0):
    "Write the JSONL formatted string to a JSON line file at output_path"
    jsonl_path = get_absolute_path(output_path)
    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.write(jsonl_data)
    if verbose > 0:
        print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    "Write the JSONL formatted string to a gzip compressed file at output_path"
    with gzip.open(get_absolute_path(output_path), 'wt') as gzip_file:
        gzip_file.write(jsonl_data)
    if verbose > 0:
        print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
def load_jsonl(input_path, verbose=0):
    """Read List of JSON objects from JSON line file.

    Returns one parsed dict per non-empty line of the file at input_path.
    """
    data = []
    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # fix: rstrip('\n|\r') treated '|' as a character to strip too
            # (it was written like a regex alternation); strip newlines only
            data.append(json.loads(line.rstrip('\r\n')))
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')
    return data
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
    """Get Org files to process.

    Returns the union of explicitly listed file paths (made absolute) and
    files matching the glob-style org_file_filter pattern.
    """
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(org_file)
                              for org_file
                              in org_files}
    if org_file_filter:
        filtered_org_files = set(glob.glob(get_absolute_path(org_file_filter)))
    all_org_files = absolute_org_files | filtered_org_files
    # warn, but still process, files without a .org extension
    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(files_with_non_org_extensions):
        print(f"[Warning] There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
    if verbose > 0:
        print(f'Processing files: {all_org_files}')
    return all_org_files
def extract_org_entries(org_files):
    "Parse each Org file into a flat list of Orgnode entries"
    entries = []
    for org_file in org_files:
        entries += orgnode.makelist(str(org_file))
    return entries
def convert_org_entries_to_jsonl(entries, verbose=0):
    """Serialize each Org-Mode entry to a JSON object and collate as JSONL.

    Each line holds an object with a Title and, when present, Tags
    (space-joined) and Body fields.
    """
    jsonl_lines = []
    for entry in entries:
        entry_dict = dict()
        entry_dict["Title"] = entry.Heading()
        if verbose > 1:
            print(f"Title: {entry.Heading()}")
        if entry.Tags():
            tags_str = " ".join(entry.Tags())
            entry_dict["Tags"] = tags_str
            if verbose > 1:
                print(f"Tags: {tags_str}")
        if entry.Body():
            entry_dict["Body"] = entry.Body()
            if verbose > 2:
                print(f"Body: {entry.Body()}")
        if entry_dict:
            # Convert Dictionary to JSON and collect as a JSONL line
            jsonl_lines.append(f'{json.dumps(entry_dict, ensure_ascii=False)}\n')
    if verbose > 0:
        print(f"Converted {len(entries)} to jsonl format")
    return ''.join(jsonl_lines)
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Map Org-Mode notes into (compressed) JSONL format")
    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
    args = parser.parse_args()

    # Map notes in Org-Mode files to (compressed) JSONL formatted file
    org_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)

View File

@@ -0,0 +1,332 @@
# Copyright (c) 2010 Charles Cave
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Program written by Charles Cave (charlesweb@optusnet.com.au)
# February - March 2009
# Version 2 - June 2009
# Added support for all tags, TODO priority and checking existence of a tag
# More information at
# http://members.optusnet.com.au/~charles57/GTD
"""
The Orgnode module consists of the Orgnode class for representing a
headline and associated text from an org-mode file, and routines for
constructing data structures of these classes.
"""
import re, sys
import datetime
def makelist(filename):
    """
    Read an org-mode file and return a list of Orgnode objects
    created from this file.

    Exits the program if the file cannot be opened.
    """
    ctr = 0
    try:
        f = open(filename, 'r')
    except IOError:
        # fix: include the offending filename in the error message
        print(f"Unable to open file {filename}")
        print("Program terminating.")
        sys.exit(1)

    todos = {"TODO": "", "WAITING": "", "ACTIVE": "",
             "DONE": "", "CANCELLED": "", "FAILED": ""}  # populated from #+SEQ_TODO line
    level = 0
    heading = ""
    bodytext = ""
    tag1 = ""         # The first tag enclosed in ::
    alltags = []      # list of all tags in headline
    sched_date = ''
    deadline_date = ''
    nodelist = []
    propdict = dict()
    in_properties_drawer = False

    # fix: close the file handle deterministically (was never closed);
    # all regex patterns are now raw strings — '\s', '\(' etc. in plain
    # strings are invalid escape sequences (SyntaxWarning on modern Python)
    with f:
        for line in f:
            ctr += 1
            hdng = re.search(r'^(\*+)\s(.*?)\s*$', line)
            if hdng:
                if heading:  # we are processing a heading line
                    thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
                    if sched_date:
                        thisNode.setScheduled(sched_date)
                        sched_date = ""
                    if deadline_date:
                        thisNode.setDeadline(deadline_date)
                        deadline_date = ''
                    thisNode.setProperties(propdict)
                    nodelist.append(thisNode)
                    propdict = dict()
                level = hdng.group(1)
                heading = hdng.group(2)
                bodytext = ""
                tag1 = ""
                alltags = []  # list of all tags in headline
                tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):([a-zA-Z0-9].*?):$', heading)
                if tagsrch:
                    heading = tagsrch.group(1)
                    tag1 = tagsrch.group(2)
                    alltags.append(tag1)
                    tag2 = tagsrch.group(3)
                    if tag2:
                        for t in tag2.split(':'):
                            if t != '':
                                alltags.append(t)
            else:  # we are processing a non-heading line
                if line[:10] == '#+SEQ_TODO':
                    kwlist = re.findall(r'([A-Z]+)\(', line)
                    for kw in kwlist:
                        todos[kw] = ""
                # Ignore Properties Drawers Completely
                if re.search(':PROPERTIES:', line):
                    in_properties_drawer = True
                    continue
                if in_properties_drawer and re.search(':END:', line):
                    in_properties_drawer = False
                    continue
                # Ignore Clocking Lines
                if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
                    continue
                if not in_properties_drawer and line[:1] != '#':
                    bodytext = bodytext + line
                prop_srch = re.search(r'^\s*:(.*?):\s*(.*?)\s*$', line)
                if prop_srch:
                    propdict[prop_srch.group(1)] = prop_srch.group(2)
                    continue
                sd_re = re.search(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
                if sd_re:
                    sched_date = datetime.date(int(sd_re.group(1)),
                                               int(sd_re.group(2)),
                                               int(sd_re.group(3)))
                dd_re = re.search(r'DEADLINE:\s*<(\d+)\-(\d+)\-(\d+)', line)
                if dd_re:
                    deadline_date = datetime.date(int(dd_re.group(1)),
                                                  int(dd_re.group(2)),
                                                  int(dd_re.group(3)))

    # write out last node
    thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
    thisNode.setProperties(propdict)
    if sched_date:
        thisNode.setScheduled(sched_date)
    if deadline_date:
        thisNode.setDeadline(deadline_date)
    nodelist.append(thisNode)

    # using the list of TODO keywords found in the file
    # process the headings searching for TODO keywords
    for n in nodelist:
        h = n.Heading()
        todoSrch = re.search(r'([A-Z]+)\s(.*?)$', h)
        if todoSrch:
            if todoSrch.group(1) in todos:
                n.setHeading(todoSrch.group(2))
                n.setTodo(todoSrch.group(1))
        prtysrch = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.Heading())
        if prtysrch:
            n.setPriority(prtysrch.group(1))
            n.setHeading(prtysrch.group(2))
    return nodelist
######################
class Orgnode(object):
    """
    Orgnode class represents a headline, tags and text associated
    with the headline.
    """
    def __init__(self, level, headline, body, tag, alltags):
        """
        Create an Orgnode object given the parameters of level (as the
        raw asterisks), headline text (including the TODO tag), and
        first tag. The makelist routine postprocesses the list to
        identify TODO tags and updates headline and todo fields.
        """
        self.level = len(level)      # heading depth = number of leading asterisks
        self.headline = headline
        self.body = body
        self.tag = tag               # The first tag in the list
        self.tags = dict()           # All tags in the headline
        self.todo = ""
        self.prty = ""               # empty of A, B or C
        self.scheduled = ""          # Scheduled date
        self.deadline = ""           # Deadline date
        self.properties = dict()
        for t in alltags:
            self.tags[t] = ''
        # Look for priority in headline and transfer to prty field

    def Heading(self):
        """
        Return the Heading text of the node without the TODO tag
        """
        return self.headline

    def setHeading(self, newhdng):
        """
        Change the heading to the supplied string
        """
        self.headline = newhdng

    def Body(self):
        """
        Returns all lines of text of the body of this node except the
        Property Drawer
        """
        return self.body

    def Level(self):
        """
        Returns an integer corresponding to the level of the node.
        Top level (one asterisk) has a level of 1.
        """
        return self.level

    def Priority(self):
        """
        Returns the priority of this headline: 'A', 'B', 'C' or empty
        string if priority has not been set.
        """
        return self.prty

    def setPriority(self, newprty):
        """
        Change the value of the priority of this headline.
        Values values are '', 'A', 'B', 'C'
        """
        self.prty = newprty

    def Tag(self):
        """
        Returns the value of the first tag.
        For example, :HOME:COMPUTER: would return HOME
        """
        return self.tag

    def Tags(self):
        """
        Returns a view of all tags (dict keys, iterable like a list).
        For example, :HOME:COMPUTER: would yield ['HOME', 'COMPUTER']
        """
        return self.tags.keys()

    def hasTag(self, srch):
        """
        Returns True if the supplied tag is present in this headline
        For example, hasTag('COMPUTER') on headling containing
        :HOME:COMPUTER: would return True.
        """
        return srch in self.tags

    def setTag(self, newtag):
        """
        Change the value of the first tag to the supplied string
        """
        self.tag = newtag

    def setTags(self, taglist):
        """
        Store all the tags found in the headline. The first tag will
        also be stored as if the setTag method was called.
        """
        for t in taglist:
            self.tags[t] = ''

    def Todo(self):
        """
        Return the value of the TODO tag
        """
        return self.todo

    def setTodo(self, value):
        """
        Set the value of the TODO tag to the supplied string
        """
        self.todo = value

    def setProperties(self, dictval):
        """
        Sets all properties using the supplied dictionary of
        name/value pairs
        """
        self.properties = dictval

    def Property(self, keyval):
        """
        Returns the value of the requested property or null if the
        property does not exist.
        """
        return self.properties.get(keyval, "")

    def setScheduled(self, dateval):
        """
        Set the scheduled date using the supplied date object
        """
        self.scheduled = dateval

    def Scheduled(self):
        """
        Return the scheduled date object or null if nonexistent
        """
        return self.scheduled

    def setDeadline(self, dateval):
        """
        Set the deadline (due) date using the supplied date object
        """
        self.deadline = dateval

    def Deadline(self):
        """
        Return the deadline date object or null if nonexistent
        """
        return self.deadline

    def __repr__(self):
        """
        Print the level, heading text and tag of a node and the body
        text as used to construct the node.
        """
        # This method is not completed yet.
        n = ''
        for i in range(0, self.level):
            n = n + '*'
        n = n + ' ' + self.todo + ' '
        if self.prty:
            n = n + '[#' + self.prty + '] '
        n = n + self.headline
        n = "%-60s " % n     # hack - tags will start in column 62
        closecolon = ''
        for t in self.tags.keys():
            n = n + ':' + t
            closecolon = ':'
        n = n + closecolon
        # Need to output Scheduled Date, Deadline Date, property tags The
        # following will output the text used to construct the object
        n = n + "\n" + self.body
        return n

View File

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import time
import gzip
import os
import sys
import re
import torch
import argparse
import pathlib
from utils.helpers import get_absolute_path
from processor.org_mode.org_to_jsonl import org_to_jsonl
def initialize_model():
    "Initialize models for asymmetric semantic search. That is, where query smaller than results"
    bi_encoder = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-6-v3')  # The bi-encoder encodes all entries to use for semantic search
    top_k = 100  # Number of entries we want to retrieve with the bi-encoder
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')  # The cross-encoder re-ranks the results to improve quality
    return bi_encoder, cross_encoder, top_k
def extract_entries(notesfile, verbose=0):
    "Load searchable note strings from the compressed jsonl file at notesfile"
    entries = []
    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
        for line in jsonl:
            note = json.loads(line.strip())
            # Ignore title notes i.e notes with just headings and empty body
            if "Body" not in note or note["Body"].strip() == "":
                continue
            tags = note["Tags"] if "Tags" in note else ""
            entries.append(f'{note["Title"]}\t{tags}\n{note["Body"]}')
    if verbose > 0:
        print(f"Loaded {len(entries)} entries from {notesfile}")
    return entries
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
    """Compute (and Save) Embeddings or Load Pre-Computed Embeddings.

    Loads cached embeddings from embeddings_file unless it is missing or
    regenerate is set, in which case the entries are re-encoded with the
    bi-encoder and the result is written back to embeddings_file.
    """
    # Load pre-computed embeddings from file if exists
    if embeddings_file.exists() and not regenerate:
        corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
        if verbose > 0:
            print(f"Loaded embeddings from {embeddings_file}")
    else:  # Else compute the corpus_embeddings from scratch, which can take a while
        corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
        torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
        if verbose > 0:
            print(f"Computed embeddings and save them to {embeddings_file}")
    return corpus_embeddings
def query_notes(raw_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k=100):
    """Search all notes for entries that answer the query.

    Words prefixed with '+'/'-' in raw_query are treated as explicit
    required/blocked word filters; the rest is the semantic query.
    Returns hit dicts (corpus_id, score, cross-score) ranked primarily by
    cross-encoder score, then by bi-encoder score.
    """
    # Separate natural query from explicit required, blocked words filters
    query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")])
    required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
    blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")])

    # Encode the query using the bi-encoder
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    # Find relevant entries for the query
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    # Filter results using explicit filters
    hits = explicit_filter(hits, entries, required_words, blocked_words)
    if hits is None or len(hits) == 0:
        return hits

    # Score all retrieved entries using the cross-encoder
    cross_inp = [[query, entries[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Store cross-encoder scores in results dictionary for ranking
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Order results by cross encoder score followed by biencoder score
    # (stable sorts: the second sort is the primary ranking key)
    hits.sort(key=lambda x: x['score'], reverse=True)        # sort by biencoder score
    hits.sort(key=lambda x: x['cross-score'], reverse=True)  # sort by cross encoder score
    return hits
def explicit_filter(hits, entries, required_words, blocked_words):
    """Filter hits using the explicit word filters from the query.

    required_words: lowercased words that must appear in a hit's entry
    blocked_words: lowercased words that must not appear in a hit's entry
    Returns the matching hits in their original order; when no filters
    are given, returns hits unchanged.
    """
    # No explicit filters: skip splitting every entry into words
    # (the original computed the word sets before this early return)
    if len(required_words) == 0 and len(blocked_words) == 0:
        return hits

    # Map each hit to the set of lowercased words in its entry.
    # fix: the original pattern contained '\[\(' which only matched the
    # literal two-char sequence "[(" — '[' and '(' alone never split words
    word_delimiters = r',|\.| |\]|\[|\(|\)|\{|\}'
    hits_by_word_set = [(set(word.lower()
                             for word
                             in re.split(word_delimiters, entries[hit['corpus_id']])
                             if word != ""),
                         hit)
                        for hit in hits]

    if len(required_words) > 0:
        return [hit for (words_in_entry, hit) in hits_by_word_set
                if required_words.intersection(words_in_entry) and not blocked_words.intersection(words_in_entry)]
    if len(blocked_words) > 0:
        return [hit for (words_in_entry, hit) in hits_by_word_set
                if not blocked_words.intersection(words_in_entry)]
    return hits
def render_results(hits, entries, count=5, display_biencoder_results=False):
    """Render the Results returned by Search for the Query.

    Prints the top count cross-encoder ranked entries to stdout; when
    display_biencoder_results is set, the bi-encoder ranking is printed
    first for comparison.
    """
    if display_biencoder_results:
        # Output of top hits from bi-encoder
        print("\n-------------------------\n")
        print(f"Top-{count} Bi-Encoder Retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        for hit in hits[0:count]:
            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]}")

    # Output of top hits from re-ranker
    print("\n-------------------------\n")
    print(f"Top-{count} Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:count]:
        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]}")
def collate_results(hits, entries, count=5):
    "Collate the top count hits into Entry/Score dicts for the API response"
    collated = []
    for hit in hits[0:count]:
        collated.append({
            "Entry": entries[hit['corpus_id']],
            "Score": f"{hit['cross-score']:.3f}",
        })
    return collated
def setup(input_files, input_filter, compressed_jsonl, embeddings, regenerate=False, verbose=False):
    """Prepare everything needed to answer search queries.

    Converts org files to JSONL (when missing or regenerate is set),
    loads the entries and computes/loads their embeddings.
    Returns (entries, corpus_embeddings, bi_encoder, cross_encoder, top_k).
    """
    # Initialize Model
    bi_encoder, cross_encoder, top_k = initialize_model()

    # Map notes in Org-Mode files to (compressed) JSONL formatted file
    if not compressed_jsonl.exists() or regenerate:
        org_to_jsonl(input_files, input_filter, compressed_jsonl, verbose)

    # Extract Entries
    entries = extract_entries(compressed_jsonl, verbose)

    # Compute or Load Embeddings
    corpus_embeddings = compute_embeddings(entries, bi_encoder, embeddings, regenerate=regenerate, verbose=verbose)

    return entries, corpus_embeddings, bi_encoder, cross_encoder, top_k
if __name__ == '__main__':
    # Setup Argument Parser
    # NOTE(review): description string looks copied from the org_to_jsonl script — confirm
    parser = argparse.ArgumentParser(description="Map Org-Mode notes into (compressed) JSONL format")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
    parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
    parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
    parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
    args = parser.parse_args()

    # Build search state: entries, embeddings and ranking models
    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)

    # Run User Queries on Entries in Interactive Mode
    while args.interactive:
        # get query from user
        user_query = input("Enter your query: ")
        if user_query == "exit":
            exit(0)
        # query notes
        hits = query_notes(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)
        # render results
        render_results(hits, entries, count=args.results_count)

View File

@@ -0,0 +1,112 @@
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import torch
import argparse
import pathlib
import copy
def initialize_model():
    "Initialize the CLIP model used for text-to-image and image-to-image search"
    # Initialize Model
    torch.set_num_threads(4)
    top_k = 3  # default number of images to retrieve
    model = SentenceTransformer('clip-ViT-B-32')  # Load the CLIP model
    return model, top_k
def extract_entries(image_directory, verbose=False):
    "Collect the jpg images under image_directory"
    image_names = list(image_directory.glob('*.jpg'))
    if verbose:
        print(f'Found {len(image_names)} images in {image_directory}')
    return image_names
def compute_embeddings(image_names, model, embeddings_file, verbose=False):
    """Compute (and Save) Embeddings or Load Pre-Computed Embeddings.

    Loads cached embeddings from embeddings_file when it exists; otherwise
    encodes the images at image_names and caches the result.
    Raises ValueError when there are no images and no cached embeddings.
    """
    # Load pre-computed embeddings from file if exists
    if embeddings_file.exists():
        image_embeddings = torch.load(embeddings_file)
        if verbose:
            print(f"Loaded pre-computed embeddings from {embeddings_file}")
    else:  # Else compute the image_embeddings from scratch, which can take a while
        images = []
        if verbose:
            print(f"Loading the {len(image_names)} images into memory")
        for image_name in image_names:
            images.append(copy.deepcopy(Image.open(image_name)))
        if len(images) == 0:
            # fix: previously `image_embeddings` was left unbound on this path
            # and the final return raised UnboundLocalError; fail clearly instead
            raise ValueError(f"No images to embed and no pre-computed embeddings at {embeddings_file}")
        image_embeddings = model.encode(images, batch_size=128, convert_to_tensor=True, show_progress_bar=True)
        torch.save(image_embeddings, embeddings_file)
        if verbose:
            print(f"Saved computed embeddings to {embeddings_file}")
    return image_embeddings
def search(query, image_embeddings, model, count=3, verbose=False):
    """Find the top count images matching the query by cosine similarity.

    query may be free text or a path to an image file; an existing image
    path triggers image-to-image search instead of text-to-image.
    """
    # Set query to image content if query is a filepath
    if pathlib.Path(query).expanduser().is_file():
        query_imagepath = pathlib.Path(query).expanduser().resolve(strict=True)
        query = copy.deepcopy(Image.open(query_imagepath))
        if verbose:
            print(f"Find Images similar to Image at {query_imagepath}")
    else:
        print(f"Find Images by Text: {query}")

    # Now we encode the query (which can either be an image or a text string)
    query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine-similarity
    # between the query embedding and all image embeddings.
    # It then returns the top_k highest ranked images, which we output
    hits = util.semantic_search(query_embedding, image_embeddings, top_k=count)[0]
    return hits
def render_results(hits, image_names, image_directory, count):
    "Print the filenames of the top `count` hits and open each image in a viewer"
    for result in hits[:count]:
        matched_name = image_names[result['corpus_id']]
        print(matched_name)
        with Image.open(image_directory.joinpath(matched_name)) as img:
            img.show()
if __name__ == '__main__':
    # CLI entry point: semantic (text or image) search over an image directory
    parser = argparse.ArgumentParser(description="Semantic Search on Images")
    parser.add_argument('--image-directory', '-i', required=True, type=pathlib.Path, help="Image directory to query")
    parser.add_argument('--embeddings-file', '-e', default='embeddings.pt', type=pathlib.Path, help="File to save/load model embeddings to/from. Default: ./embeddings.pt")
    parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
    parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
    parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
    args = parser.parse_args()

    # Normalize user-supplied paths to absolute paths before use
    embeddings_file = args.embeddings_file.expanduser().resolve()
    image_directory = args.image_directory.expanduser().resolve(strict=True)

    # Load model, gather image paths, then compute or load their embeddings
    model, count = initialize_model()
    image_names = extract_entries(image_directory, args.verbose)
    image_embeddings = compute_embeddings(image_names, model, embeddings_file, args.verbose)

    # Interactive query loop; type "exit" to quit
    while args.interactive:
        query = input("Enter your query: ")
        if query == "exit":
            exit(0)
        hits = search(query, image_embeddings, model, args.results_count, args.verbose)
        render_results(hits, image_names, image_directory, count=args.results_count)

View File

@@ -0,0 +1,97 @@
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import argparse
import os
def create_index(
        model,
        dataset_path,
        index_path,
        column_name,
        recreate):
    """Load the CSV dataset and return (faiss_index, dataset).

    Reuses an existing on-disk index at `index_path` unless `recreate` is set,
    in which case the embeddings are recomputed and the index rewritten.
    """
    # Load and clean the dataset: drop missing rows, strip the indexed column
    dataset = pd.read_csv(dataset_path).dropna()
    dataset[column_name] = dataset[column_name].str.strip()

    if not recreate and os.path.exists(index_path):
        # Reuse previously built index
        index = faiss.read_index(index_path)
    else:
        # Encode each document in the indexed column into a float32 matrix
        vectors = model.encode(dataset[column_name].to_list(), show_progress_bar=True)
        vectors = np.array([vector for vector in vectors]).astype("float32")
        # L2 index wrapped with an ID map so hits carry the dataset row index
        index = faiss.IndexIDMap(faiss.IndexFlatL2(vectors.shape[1]))
        index.add_with_ids(vectors, dataset.index.values)
        faiss.write_index(index, index_path)

    return index, dataset
def resolve_column(dataset, Id, column):
    "Map each id in the first row of `Id` to the matching `column` values in `dataset`"
    resolved = []
    for idx in Id[0]:
        resolved.append(list(dataset[dataset.index == idx][column]))
    return resolved
def vector_search(query, index, dataset, column_name, num_results=10):
    "Search `index` for `query`; yield (distance, row id, column values) triples"
    query_vector = np.array(query).astype("float32")
    distances, matched_ids = index.search(query_vector, k=num_results)
    resolved = resolve_column(dataset, matched_ids, column_name)
    return zip(distances[0], matched_ids[0], resolved)
if __name__ == '__main__':
    # CLI entry point: rank dataset rows against include/exclude preferences
    parser = argparse.ArgumentParser(description="Find most suitable match based on users exclude, include preferences")
    parser.add_argument('positives', type=str, help="Terms to find closest match to")
    parser.add_argument('--negatives', '-n', type=str, help="Terms to find farthest match from")
    parser.add_argument('--recreate', action='store_true', default=False, help="Recreate index at index_path from dataset at dataset path")
    parser.add_argument('--index', type=str, default="./.faiss_index", help="Path to index for storing vector embeddings")
    parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from")
    parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index")
    parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show")
    parser.add_argument('--model_name', type=str, default='paraphrase-distilroberta-base-v1', help="Specify name of the SentenceTransformer model to use for encoding")
    args = parser.parse_args()

    model = SentenceTransformer(args.model_name)

    if args.positives:
        # Get index, create it from dataset if doesn't exist
        index, dataset = create_index(model, args.dataset, args.index, args.column, args.recreate)

        # Build the query vector from the user's stated preferences:
        # liked terms alone, or liked minus disliked when negatives are given
        if args.negatives:
            liked = np.array(model.encode([args.positives])).astype("float32")
            disliked = np.array(model.encode([args.negatives])).astype("float32")
            preference_vector = np.add(liked, -1 * disliked)
        else:
            preference_vector = model.encode([args.positives])

        # Find and display most suitable matches for users preferences in the dataset
        results = vector_search(preference_vector, index, dataset, args.column, args.num_results)
        print("Most Suitable Matches:")
        for similarity, id_, data in results:
            print(f"Id: {id_}\nSimilarity: {similarity}\n{args.column}: {data[0]}")

0
src/utils/__init__.py Normal file
View File

9
src/utils/helpers.py Normal file
View File

@@ -0,0 +1,9 @@
import pathlib
def is_none_or_empty(item):
    """Return True if item is None or an empty iterable (e.g. '', [], {}, ())."""
    # Fix: use `is None` instead of `== None` — identity comparison is the
    # correct check and cannot be hijacked by a custom __eq__ implementation
    return item is None or (hasattr(item, '__iter__') and len(item) == 0)
def get_absolute_path(filepath):
    "Expand ~ in filepath and return its absolute path as a string"
    expanded = pathlib.Path(filepath).expanduser()
    return str(expanded.absolute())

57
src/utils/install.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import pathlib
import argparse
import os
import stat
def get_absolute(path):
    "Expand ~ in the given pathlib.Path and return it as an absolute Path"
    expanded = path.expanduser()
    return expanded.absolute()
def create_script(filepath, content):
    """Write `content` to `filepath` and make the resulting script executable."""
    absolute_install_path = str(get_absolute(filepath))
    # Fix: the original passed 'w' to get_absolute instead of open(), and
    # wrote the undefined global `run_script_content` instead of `content`
    with open(absolute_install_path, 'w') as run_script:
        run_script.write(content)
    # Make Script Executable (add owner execute bit)
    st = os.stat(absolute_install_path)
    os.chmod(absolute_install_path, st.st_mode | stat.S_IEXEC)
if __name__ == '__main__':
    # Setup Argument Parser
    parser = argparse.ArgumentParser(description="Setup the semantic search program")
    parser.add_argument('--script-dir', '-s', default="./", type=pathlib.Path, help="The project directory. Default: Current Directory")
    parser.add_argument('--install-dir', '-i', default="./", type=pathlib.Path, help="The directory to install the script. Default: Current Directory")
    parser.add_argument('--model-dir', '-m', default="./", type=pathlib.Path, help="The directory to store the model in. Default: Current Directory")
    args = parser.parse_args()

    run_server_content = f'''#!/bin/bash

# Arrange
eval "$(conda shell.bash hook)"
conda activate semantic-search
cd {get_absolute(args.script_dir)}

# Act
python3 search_types/asymmetric.py -j {get_absolute(args.model_dir)}/notes.jsonl.gz -e {get_absolute(args.model_dir)}/notes_embeddings.pt -n 5 --interactive
'''

    search_cmd_content = f'''#!/bin/bash

# Arrange
eval "$(conda shell.bash hook)"
conda activate semantic-search
cd {get_absolute(args.script_dir)}

# Act
python3 main.py -j {get_absolute(args.model_dir)}/notes.jsonl.gz -e {get_absolute(args.model_dir)}/notes_embeddings.pt
'''

    # Fix: the original referenced the non-existent `args.install_path`
    # (the flag is --install-dir) and both calls had unbalanced parentheses
    # (syntax errors). Join paths with pathlib instead of string concatenation.
    # Create single command to start API server exposing HTTP interface
    create_script(args.install_dir.joinpath("run_server"), run_server_content)
    # Create single command for interactive queries over commandline
    create_script(args.install_dir.joinpath("semantic-search"), search_cmd_content)