Merge branch 'master' of github.com:debanjum/semantic-search into add-summarize-capability-to-chat-bot

- Fix openai_api_key being set in ConversationProcessorConfig
- Merge addition of config UI and config instantiation updates
This commit is contained in:
Debanjum Singh Solanky
2021-12-20 13:26:35 +05:30
19 changed files with 424 additions and 210 deletions

View File

@@ -1,26 +1,49 @@
# Standard Packages
import sys
import json
import sys, json, yaml
from typing import Optional
# External Packages
import uvicorn
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
# Internal Packages
from src.search_type import asymmetric, symmetric_ledger, image_search
from src.utils.helpers import get_absolute_path, get_from_dict
from src.utils.cli import cli
from src.utils.config import SearchType, SearchModels, TextSearchConfig, ImageSearchConfig, SearchConfig, ProcessorConfig, ConversationProcessorConfig
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from src.utils.rawconfig import FullConfig
from src.processor.conversation.gpt import converse, message_to_log, message_to_prompt, understand, summarize
# Application Global State
config = FullConfig()
model = SearchModels()
search_config = SearchConfig()
processor_config = ProcessorConfig()
processor_config = ProcessorConfigModel()
config_file = ""
verbose = 0
app = FastAPI()
app.mount("/views", StaticFiles(directory="views"), name="views")
templates = Jinja2Templates(directory="views/")
@app.get('/ui', response_class=HTMLResponse)
def ui(request: Request):
return templates.TemplateResponse("config.html", context={'request': request})
@app.get('/config', response_model=FullConfig)
def config_data():
return config
@app.post('/config')
async def config_data(updated_config: FullConfig):
global config
config = updated_config
with open(config_file, 'w') as outfile:
yaml.dump(yaml.safe_load(config.json(by_alias=True)), outfile)
outfile.close()
return config
@app.get('/search')
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
@@ -60,7 +83,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
return image_search.collate_results(
hits,
model.image_search.image_names,
search_config.image.input_directory,
config.content_type.image.input_directory,
results_count)
else:
@@ -69,22 +92,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
@app.get('/regenerate')
def regenerate(t: Optional[SearchType] = None):
if (t == SearchType.Notes or t == None) and search_config.notes:
# Extract Entries, Generate Embeddings
model.notes_search = asymmetric.setup(search_config.notes, regenerate=True)
if (t == SearchType.Music or t == None) and search_config.music:
# Extract Entries, Generate Song Embeddings
model.music_search = asymmetric.setup(search_config.music, regenerate=True)
if (t == SearchType.Ledger or t == None) and search_config.ledger:
# Extract Entries, Generate Embeddings
model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=True)
if (t == SearchType.Image or t == None) and search_config.image:
# Extract Images, Generate Embeddings
model.image_search = image_search.setup(search_config.image, regenerate=True)
initialize_search(config, regenerate=True, t=t)
return {'status': 'ok', 'message': 'regeneration completed'}
@@ -111,37 +119,40 @@ def chat(q: str):
return {'status': 'ok', 'response': gpt_response}
def initialize_search(config, regenerate, verbose):
def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None):
model = SearchModels()
search_config = SearchConfig()
# Initialize Org Notes Search
search_config.notes = TextSearchConfig.create_from_dictionary(config, ('content-type', 'org'), verbose)
if search_config.notes:
model.notes_search = asymmetric.setup(search_config.notes, regenerate=regenerate)
if (t == SearchType.Notes or t == None) and config.content_type.org:
# Extract Entries, Generate Notes Embeddings
model.notes_search = asymmetric.setup(config.content_type.org, regenerate=regenerate, verbose=verbose)
# Initialize Org Music Search
search_config.music = TextSearchConfig.create_from_dictionary(config, ('content-type', 'music'), verbose)
if search_config.music:
model.music_search = asymmetric.setup(search_config.music, regenerate=regenerate)
if (t == SearchType.Music or t == None) and config.content_type.music:
# Extract Entries, Generate Music Embeddings
model.music_search = asymmetric.setup(config.content_type.music, regenerate=regenerate, verbose=verbose)
# Initialize Ledger Search
search_config.ledger = TextSearchConfig.create_from_dictionary(config, ('content-type', 'ledger'), verbose)
if search_config.ledger:
model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=regenerate)
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
# Extract Entries, Generate Ledger Embeddings
model.ledger_search = symmetric_ledger.setup(config.content_type.ledger, regenerate=regenerate, verbose=verbose)
# Initialize Image Search
search_config.image = ImageSearchConfig.create_from_dictionary(config, ('content-type', 'image'), verbose)
if search_config.image:
model.image_search = image_search.setup(search_config.image, regenerate=regenerate)
if (t == SearchType.Image or t == None) and config.content_type.image:
# Extract Entries, Generate Image Embeddings
model.image_search = image_search.setup(config.content_type.image, regenerate=regenerate, verbose=verbose)
return model, search_config
return model
def initialize_processor(config, verbose):
def initialize_processor(config: FullConfig):
if not config.processor:
return
processor_config = ProcessorConfigModel()
# Initialize Conversation Processor
processor_config = ProcessorConfig()
processor_config.conversation = ConversationProcessorConfig.create_from_dictionary(config, ('processor', 'conversation'), verbose)
processor_config.conversation = ConversationProcessorConfigModel(config.processor.conversation, verbose)
conversation_logfile = processor_config.conversation.conversation_logfile
if processor_config.conversation.verbose:
@@ -195,11 +206,20 @@ if __name__ == '__main__':
# Load config from CLI
args = cli(sys.argv[1:])
# Initialize Search from Config
model, search_config = initialize_search(args.config, args.regenerate, args.verbose)
# Stores the file path to the config file.
config_file = args.config_file
# Store the verbose flag
verbose = args.verbose
# Store the raw config data.
config = args.config
# Initialize the search model from Config
model = initialize_search(args.config, args.regenerate)
# Initialize Processor from Config
processor_config = initialize_processor(args.config, args.verbose)
processor_config = initialize_processor(args.config)
# Start Application Server
if args.socket:

View File

@@ -14,7 +14,8 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.utils.config import TextSearchModel, TextSearchConfig
from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig
def initialize_model():
@@ -58,7 +59,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
if verbose > 0:
print(f"Computed embeddings and save them to {embeddings_file}")
print(f"Computed embeddings and saved them to {embeddings_file}")
return corpus_embeddings
@@ -148,22 +149,22 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
def setup(config: TextSearchConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model()
# Map notes in Org-Mode files to (compressed) JSONL formatted file
if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)
# Extract Entries
entries = extract_entries(config.compressed_jsonl, config.verbose)
entries = extract_entries(config.compressed_jsonl, verbose)
top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus
# Compute or Load Embeddings
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)
if __name__ == '__main__':

View File

@@ -10,9 +10,10 @@ from tqdm import trange
import torch
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.utils.helpers import resolve_absolute_path
import src.utils.exiftool as exiftool
from src.utils.config import ImageSearchModel, ImageSearchConfig
from src.utils.config import ImageSearchModel
from src.utils.rawconfig import ImageSearchConfig
def initialize_model():
@@ -153,13 +154,13 @@ def collate_results(hits, image_names, image_directory, count=5):
in hits[0:count]]
def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
def setup(config: ImageSearchConfig, regenerate: bool, verbose: bool=False) -> ImageSearchModel:
# Initialize Model
encoder = initialize_model()
# Extract Entries
image_directory = resolve_absolute_path(config.input_directory, strict=True)
image_names = extract_entries(image_directory, config.verbose)
image_names = extract_entries(image_directory, verbose)
# Compute or Load Embeddings
embeddings_file = resolve_absolute_path(config.embeddings_file)
@@ -170,13 +171,13 @@ def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
batch_size=config.batch_size,
regenerate=regenerate,
use_xmp_metadata=config.use_xmp_metadata,
verbose=config.verbose)
verbose=verbose)
return ImageSearchModel(image_names,
image_embeddings,
image_metadata_embeddings,
encoder,
config.verbose)
verbose)
if __name__ == '__main__':

View File

@@ -1,9 +1,6 @@
# Standard Packages
import json
import time
import gzip
import os
import sys
import re
import argparse
import pathlib
@@ -15,11 +12,12 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.utils.config import TextSearchModel, TextSearchConfig
from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig
def initialize_model():
"Initialize model for symetric semantic search. That is, where query of similar size to results"
"Initialize model for symmetric semantic search. That is, where query of similar size to results"
torch.set_num_threads(4)
bi_encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2') # The encoder encodes all entries to use for semantic search
top_k = 30 # Number of entries we want to retrieve with the bi-encoder
@@ -55,7 +53,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
if verbose > 0:
print(f"Computed embeddings and save them to {embeddings_file}")
print(f"Computed embeddings and saved them to {embeddings_file}")
return corpus_embeddings
@@ -143,22 +141,22 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
def setup(config: TextSearchConfig, regenerate: bool, verbose: bool) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model()
# Map notes in Org-Mode files to (compressed) JSONL formatted file
if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)
# Extract Entries
entries = extract_entries(config.compressed_jsonl, config.verbose)
entries = extract_entries(config.compressed_jsonl, verbose)
top_k = min(len(entries), top_k)
# Compute or Load Embeddings
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)
if __name__ == '__main__':

View File

@@ -1,12 +1,14 @@
# Standard Packages
import argparse
import pathlib
import json
# External Packages
import yaml
# Internal Packages
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, get_from_dict, merge_dicts
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, merge_dicts
from src.utils.rawconfig import FullConfig
def cli(args=None):
if is_none_or_empty(args):
@@ -35,12 +37,15 @@ def cli(args=None):
with open(get_absolute_path(args.config_file), 'r', encoding='utf-8') as config_file:
config_from_file = yaml.safe_load(config_file)
args.config = merge_dicts(priority_dict=config_from_file, default_dict=args.config)
args.config = FullConfig.parse_obj(args.config)
else:
args.config = FullConfig.parse_obj(args.config)
if args.org_files:
args.config['content-type']['org']['input-files'] = args.org_files
args.config.content_type.org.input_files = args.org_files
if args.org_filter:
args.config['content-type']['org']['input-filter'] = args.org_filter
args.config.content_type.org.input_filter = args.org_filter
return args

View File

@@ -4,7 +4,7 @@ from dataclasses import dataclass
from pathlib import Path
# Internal Packages
from src.utils.helpers import get_from_dict
from src.utils.rawconfig import ConversationProcessorConfig
class SearchType(str, Enum):
@@ -42,80 +42,15 @@ class SearchModels():
image_search: ImageSearchModel = None
class TextSearchConfig():
def __init__(self, input_files, input_filter, compressed_jsonl, embeddings_file, verbose):
self.input_files = input_files
self.input_filter = input_filter
self.compressed_jsonl = Path(compressed_jsonl)
self.embeddings_file = Path(embeddings_file)
class ConversationProcessorConfigModel():
def __init__(self, processor_config: ConversationProcessorConfig, verbose: bool):
self.openai_api_key = processor_config.openai_api_key
self.conversation_logfile = Path(processor_config.conversation_logfile)
self.chat_session = ''
self.meta_log = []
self.verbose = verbose
def create_from_dictionary(config, key_tree, verbose):
text_config = get_from_dict(config, *key_tree)
search_enabled = text_config and ('input-files' in text_config or 'input-filter' in text_config)
if not search_enabled:
return None
return TextSearchConfig(
input_files = text_config['input-files'],
input_filter = text_config['input-filter'],
compressed_jsonl = Path(text_config['compressed-jsonl']),
embeddings_file = Path(text_config['embeddings-file']),
verbose = verbose)
class ImageSearchConfig():
def __init__(self, input_directory, embeddings_file, batch_size, use_xmp_metadata, verbose):
self.input_directory = input_directory
self.embeddings_file = Path(embeddings_file)
self.batch_size = batch_size
self.use_xmp_metadata = use_xmp_metadata
self.verbose = verbose
def create_from_dictionary(config, key_tree, verbose):
image_config = get_from_dict(config, *key_tree)
search_enabled = image_config and 'input-directory' in image_config
if not search_enabled:
return None
return ImageSearchConfig(
input_directory = Path(image_config['input-directory']),
embeddings_file = Path(image_config['embeddings-file']),
batch_size = image_config['batch-size'],
use_xmp_metadata = {'yes': True, 'no': False}[image_config['use-xmp-metadata']],
verbose = verbose)
@dataclass
class SearchConfig():
notes: TextSearchConfig = None
ledger: TextSearchConfig = None
music: TextSearchConfig = None
image: ImageSearchConfig = None
class ConversationProcessorConfig():
def __init__(self, conversation_logfile, chat_session, meta_log, openai_api_key, verbose):
self.openai_api_key = openai_api_key
self.conversation_logfile = conversation_logfile
self.chat_session = chat_session
self.meta_log = meta_log
self.verbose = verbose
def create_from_dictionary(config, key_tree, verbose):
conversation_config = get_from_dict(config, *key_tree)
if not conversation_config:
return None
return ConversationProcessorConfig(
openai_api_key = conversation_config['openai-api-key'],
chat_session = '',
meta_log = [],
conversation_logfile = Path(conversation_config['conversation-logfile']),
verbose = verbose)
@dataclass
class ProcessorConfig():
conversation: ConversationProcessorConfig = None
class ProcessorConfigModel():
conversation: ConversationProcessorConfigModel = None

View File

@@ -4,6 +4,8 @@ import pathlib
def is_none_or_empty(item):
return item == None or (hasattr(item, '__iter__') and len(item) == 0)
def to_snake_case_from_dash(item: str):
return item.replace('_', '-')
def get_absolute_path(filepath):
return str(pathlib.Path(filepath).expanduser().absolute())

62
src/utils/rawconfig.py Normal file
View File

@@ -0,0 +1,62 @@
# System Packages
from pathlib import Path
from typing import List, Optional
# External Packages
from pydantic import BaseModel
# Internal Packages
from src.utils.helpers import to_snake_case_from_dash
class ConfigBase(BaseModel):
class Config:
alias_generator = to_snake_case_from_dash
allow_population_by_field_name = True
class SearchConfig(ConfigBase):
input_files: Optional[List[str]]
input_filter: Optional[str]
embeddings_file: Optional[Path]
class TextSearchConfig(ConfigBase):
compressed_jsonl: Optional[Path]
input_files: Optional[List[str]]
input_filter: Optional[str]
embeddings_file: Optional[Path]
class ImageSearchConfig(ConfigBase):
use_xmp_metadata: Optional[str]
batch_size: Optional[int]
input_directory: Optional[Path]
input_filter: Optional[str]
embeddings_file: Optional[Path]
class ContentTypeConfig(ConfigBase):
org: Optional[TextSearchConfig]
ledger: Optional[TextSearchConfig]
image: Optional[ImageSearchConfig]
music: Optional[TextSearchConfig]
class AsymmetricConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
class ImageSearchTypeConfig(ConfigBase):
encoder: Optional[str]
class SearchTypeConfig(ConfigBase):
asymmetric: Optional[AsymmetricConfig]
image: Optional[ImageSearchTypeConfig]
class ConversationProcessorConfig(ConfigBase):
openai_api_key: Optional[str]
conversation_logfile: Optional[str]
conversation_history: Optional[str]
class ProcessorConfigModel(ConfigBase):
conversation: Optional[ConversationProcessorConfig]
class FullConfig(ConfigBase):
content_type: Optional[ContentTypeConfig]
search_type: Optional[SearchTypeConfig]
processor: Optional[ProcessorConfigModel]