Wrap lock acquire/release in try/except/finally when updating the content index and search models, so the lock is always released on error and cannot cause a deadlock
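The fix applies the standard pattern of pairing lock.acquire() with a release in a finally block, so an exception during re-indexing can never leave the lock held. A minimal sketch of that pattern, assuming a plain threading.Lock (state.search_index_lock in the file below may be a Lock or RLock; the pattern is the same), with rebuild_index as a hypothetical stand-in for the indexing work:

    import logging
    import threading

    logger = logging.getLogger(__name__)
    lock = threading.Lock()

    def rebuild_index():
        ...  # stand-in for the actual re-indexing work; may raise

    def refresh_index():
        try:
            lock.acquire()
            rebuild_index()
        except Exception as e:
            logger.error(f"Index update failed: {e}")
        finally:
            lock.release()  # always runs, even if rebuild_index() raised, so later refreshes cannot deadlock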
# Standard Packages
import sys
import logging
import json
from enum import Enum
from typing import Optional
import requests

# External Packages
import schedule
from fastapi.staticfiles import StaticFiles

# Internal Packages
from khoj.processor.conversation.gpt import summarize
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import (
    ContentIndex,
    SearchType,
    SearchModels,
    ProcessorConfigModel,
    ConversationProcessorConfigModel,
)
from khoj.utils.helpers import LRU, resolve_absolute_path, merge_dicts
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ContentConfig
from khoj.search_filter.date_filter import DateFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.search_filter.file_filter import FileFilter


logger = logging.getLogger(__name__)

def configure_server(args, required=False):
    if args.config is None:
        if required:
            logger.error(
                f"Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}."
            )
            sys.exit(1)
        else:
            logger.warning(
                f"Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
            )
            return
    else:
        state.config = args.config

    # Initialize Processor from Config
    state.processor_config = configure_processor(args.config.processor)

    # Initialize Search Models from Config
    try:
        state.search_index_lock.acquire()
        state.SearchType = configure_search_types(state.config)
        state.search_models = configure_search(state.search_models, state.config.search_type)
    except Exception as e:
        logger.error(f"🚨 Error configuring search models on app load: {e}")
    finally:
        state.search_index_lock.release()

    # Initialize Content from Config
    if state.search_models:
        try:
            state.search_index_lock.acquire()
            state.content_index = configure_content(
                state.content_index, state.config.content_type, state.search_models, args.regenerate
            )
        except Exception as e:
            logger.error(f"🚨 Error configuring content index on app load: {e}")
        finally:
            state.search_index_lock.release()

def configure_routes(app):
    # Import routers here so that search types are configured before the API routes are loaded
    from khoj.routers.api import api
    from khoj.routers.api_beta import api_beta
    from khoj.routers.web_client import web_client

    app.mount("/static", StaticFiles(directory=constants.web_directory), name="static")
    app.include_router(api, prefix="/api")
    app.include_router(api_beta, prefix="/api/beta")
    app.include_router(web_client)


if not state.demo:

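    # Periodic refresh of the content index. The lock is acquired and released
    # inside try/finally so a failed refresh cannot leave it held and deadlock
    # later scheduled or user-triggered index updates.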
    @schedule.repeat(schedule.every(61).minutes)
    def update_search_index():
        try:
            state.search_index_lock.acquire()
            state.content_index = configure_content(
                state.content_index, state.config.content_type, state.search_models, regenerate=False
            )
            logger.info("📬 Content index updated via Scheduler")
        except Exception as e:
            logger.error(f"🚨 Error updating content index via Scheduler: {e}")
        finally:
            state.search_index_lock.release()

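# Builds the SearchType enum dynamically: the core types defined in khoj.utils.config
# are merged with any configured plugin names. For example, with a hypothetical plugin
# named "recipes" configured, the returned enum would expose SearchType.recipes
# alongside the core members.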
def configure_search_types(config: FullConfig):
    # Extract core search types
    core_search_types = {e.name: e.value for e in SearchType}
    # Extract configured plugin search types
    plugin_search_types = {}
    if config.content_type and config.content_type.plugins:
        plugin_search_types = {plugin_type: plugin_type for plugin_type in config.content_type.plugins.keys()}

    # Dynamically generate search type enum by merging core search types with configured plugin search types
    return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))


def configure_search(search_models: SearchModels, search_config: SearchConfig) -> Optional[SearchModels]:
    # Run Validation Checks
    if search_config is None:
        logger.warning("🚨 No Search type is configured.")
        return None
    if search_models is None:
        search_models = SearchModels()

    # Initialize Search Models
    if search_config.asymmetric:
        logger.info("🔍 📜 Setting up text search model")
        search_models.text_search = text_search.initialize_model(search_config.asymmetric)

    if search_config.image:
        logger.info("🔍 🌄 Setting up image search model")
        search_models.image_search = image_search.initialize_model(search_config.image)

    return search_models

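# (Re)builds the content index. Passing t limits the rebuild to a single search type
# (e.g. only Org notes); t=None rebuilds every configured content type. Setup failures
# are re-raised, so callers wrap this in try/except (and hold the index lock).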
def configure_content(
    content_index: Optional[ContentIndex],
    content_config: Optional[ContentConfig],
    search_models: SearchModels,
    regenerate: bool,
    t: Optional[state.SearchType] = None,
) -> Optional[ContentIndex]:
    # Run Validation Checks
    if content_config is None:
        logger.warning("🚨 No Content type is configured.")
        return None
    if content_index is None:
        content_index = ContentIndex()

    try:
        # Initialize Org Notes Search
        if (t == state.SearchType.Org or t == None) and content_config.org and search_models.text_search:
            logger.info("🦄 Setting up search for orgmode notes")
            # Extract Entries, Generate Notes Embeddings
            content_index.org = text_search.setup(
                OrgToJsonl,
                content_config.org,
                search_models.text_search.bi_encoder,
                regenerate=regenerate,
                filters=[DateFilter(), WordFilter(), FileFilter()],
            )

        # Initialize Markdown Search
        if (t == state.SearchType.Markdown or t == None) and content_config.markdown and search_models.text_search:
            logger.info("💎 Setting up search for markdown notes")
            # Extract Entries, Generate Markdown Embeddings
            content_index.markdown = text_search.setup(
                MarkdownToJsonl,
                content_config.markdown,
                search_models.text_search.bi_encoder,
                regenerate=regenerate,
                filters=[DateFilter(), WordFilter(), FileFilter()],
            )

        # Initialize PDF Search
        if (t == state.SearchType.Pdf or t == None) and content_config.pdf and search_models.text_search:
            logger.info("🖨️ Setting up search for pdf")
            # Extract Entries, Generate PDF Embeddings
            content_index.pdf = text_search.setup(
                PdfToJsonl,
                content_config.pdf,
                search_models.text_search.bi_encoder,
                regenerate=regenerate,
                filters=[DateFilter(), WordFilter(), FileFilter()],
            )

        # Initialize Image Search
        if (t == state.SearchType.Image or t == None) and content_config.image and search_models.image_search:
            logger.info("🌄 Setting up search for images")
            # Extract Entries, Generate Image Embeddings
            content_index.image = image_search.setup(
                content_config.image, search_models.image_search.image_encoder, regenerate=regenerate
            )

        if (t == state.SearchType.Github or t == None) and content_config.github and search_models.text_search:
            logger.info("🐙 Setting up search for github")
            # Extract Entries, Generate Github Embeddings
            content_index.github = text_search.setup(
                GithubToJsonl,
                content_config.github,
                search_models.text_search.bi_encoder,
                regenerate=regenerate,
                filters=[DateFilter(), WordFilter(), FileFilter()],
            )

        # Initialize External Plugin Search
        if (t == None or t in state.SearchType) and content_config.plugins and search_models.text_search:
            logger.info("🔌 Setting up search for plugins")
            content_index.plugins = {}
            for plugin_type, plugin_config in content_config.plugins.items():
                content_index.plugins[plugin_type] = text_search.setup(
                    JsonlToJsonl,
                    plugin_config,
                    search_models.text_search.bi_encoder,
                    regenerate=regenerate,
                    filters=[DateFilter(), WordFilter(), FileFilter()],
                )

        # Initialize Notion Search
        if (t == None or t in state.SearchType) and content_config.notion and search_models.text_search:
            logger.info("🔌 Setting up search for notion")
            content_index.notion = text_search.setup(
                NotionToJsonl,
                content_config.notion,
                search_models.text_search.bi_encoder,
                regenerate=regenerate,
                filters=[DateFilter(), WordFilter(), FileFilter()],
            )

    except Exception as e:
        logger.error("🚨 Failed to setup search")
        raise e

    # Invalidate Query Cache
    state.query_cache = LRU()

    return content_index

def configure_processor(processor_config: ProcessorConfig):
    if not processor_config:
        return

    processor = ProcessorConfigModel()

    # Initialize Conversation Processor
    if processor_config.conversation:
        logger.info("💬 Setting up conversation processor")
        processor.conversation = configure_conversation_processor(processor_config.conversation)

    return processor

def configure_conversation_processor(conversation_processor_config):
    conversation_processor = ConversationProcessorConfigModel(conversation_processor_config)
    conversation_logfile = resolve_absolute_path(conversation_processor.conversation_logfile)

    if conversation_logfile.is_file():
        # Load Metadata Logs from Conversation Logfile
        with conversation_logfile.open("r") as f:
            conversation_processor.meta_log = json.load(f)
        logger.debug(f"Loaded conversation logs from {conversation_logfile}")
    else:
        # Initialize Conversation Logs
        conversation_processor.meta_log = {}
        conversation_processor.chat_session = []

    return conversation_processor

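# Periodically summarize the active chat session and append it to the conversation log
# on disk. "session-start" and "session-end" are indices into the chat log: the new
# session starts where the previous one ended and ends at the current chat length.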
@schedule.repeat(schedule.every(17).minutes)
def save_chat_session():
    # No need to create empty log file
    if not (
        state.processor_config
        and state.processor_config.conversation
        and state.processor_config.conversation.meta_log
        and state.processor_config.conversation.chat_session
    ):
        return

    # Summarize Conversation Logs for this Session
    chat_session = state.processor_config.conversation.chat_session
    openai_api_key = state.processor_config.conversation.openai_api_key
    conversation_log = state.processor_config.conversation.meta_log
    chat_model = state.processor_config.conversation.chat_model
    session = {
        "summary": summarize(chat_session, model=chat_model, api_key=openai_api_key),
        "session-start": conversation_log.get("session", [{"session-end": 0}])[-1]["session-end"],
        "session-end": len(conversation_log["chat"]),
    }
    if "session" in conversation_log:
        conversation_log["session"].append(session)
    else:
        conversation_log["session"] = [session]

    # Save Conversation Metadata Logs to Disk
    conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile)
    conversation_logfile.parent.mkdir(parents=True, exist_ok=True)  # create conversation directory if it doesn't exist
    with open(conversation_logfile, "w+", encoding="utf-8") as logfile:
        json.dump(conversation_log, logfile, indent=2)

    state.processor_config.conversation.chat_session = []
    logger.info("📩 Saved current chat session to conversation logs")

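# Periodically push collected telemetry to the telemetry server. Non-JSON-serializable
# field values are coerced to strings first; the telemetry buffer is cleared only if the
# upload succeeds (the else branch), so it is retained for the next attempt on failure.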
@schedule.repeat(schedule.every(59).minutes)
def upload_telemetry():
    if not state.config or not state.config.app or not state.config.app.should_log_telemetry or not state.telemetry:
        message = "📡 No telemetry to upload" if not state.telemetry else "📡 Telemetry logging disabled"
        logger.debug(message)
        return

    try:
        logger.debug(f"📡 Upload usage telemetry to {constants.telemetry_server}:\n{state.telemetry}")
        for log in state.telemetry:
            for field in log:
                # Check if the value for the field is JSON serializable
                try:
                    json.dumps(log[field])
                except TypeError:
                    log[field] = str(log[field])
        requests.post(constants.telemetry_server, json=state.telemetry)
    except Exception as e:
        logger.error(f"📡 Error uploading telemetry: {e}")
    else:
        state.telemetry = []