Add a lock around chat operations so the offline model is not bombarded with concurrent requests and does not monopolize compute resources

- This also resolves #367
@@ -10,6 +10,7 @@ from gpt4all import GPT4All
 from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
 from khoj.processor.conversation import prompts
 from khoj.utils.constants import empty_escape_sequences
+from khoj.utils import state
 
 logger = logging.getLogger(__name__)
 
@@ -58,7 +59,11 @@ def extract_questions_offline(
         next_christmas_date=next_christmas_date,
     )
     message = system_prompt + example_questions
-    response = gpt4all_model.generate(message, max_tokens=200, top_k=2, temp=0, n_batch=128)
+    state.chat_lock.acquire()
+    try:
+        response = gpt4all_model.generate(message, max_tokens=200, top_k=2, temp=0, n_batch=128)
+    finally:
+        state.chat_lock.release()
 
     # Extract, Clean Message from GPT's Response
     try:
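Since `threading.Lock` is a context manager, the acquire/try/finally sequence above can also be written as a `with` block, which releases the lock whether `generate()` returns normally or raises. A minimal equivalent sketch, reusing the names from the diff:

# Equivalent guard in context-manager form: __exit__ releases the lock
# on both normal return and exception.
with state.chat_lock:
    response = gpt4all_model.generate(message, max_tokens=200, top_k=2, temp=0, n_batch=128)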
@@ -162,6 +167,10 @@ def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
     templated_user_message = prompts.general_conversation_llamav2.format(query=user_message.content)
     prompted_message = templated_system_message + chat_history + templated_user_message
     response_iterator = model.generate(prompted_message, streaming=True, max_tokens=1000, n_batch=256)
-    for response in response_iterator:
-        g.send(response)
+    state.chat_lock.acquire()
+    try:
+        for response in response_iterator:
+            g.send(response)
+    finally:
+        state.chat_lock.release()
     g.close()
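Note that the lock wraps the iteration, not the call to `model.generate()`: with `streaming=True` the tokens are actually produced while the iterator is consumed, so this is where the compute happens. The `g` object is khoj's ThreadedGenerator, which `llm_thread` uses as a send/close channel from a worker thread back to the streaming HTTP response. As a rough illustration of how such a bridge is commonly built on `queue.Queue` (a hypothetical sketch, not khoj's actual implementation):

import queue
import threading

class QueueBackedGenerator:
    """Hypothetical stand-in for khoj's ThreadedGenerator: a producer
    thread pushes tokens with send(); a consumer iterates until close()."""

    _DONE = object()  # sentinel marking end of stream

    def __init__(self):
        self._queue: queue.Queue = queue.Queue()

    def send(self, item):
        self._queue.put(item)  # producer thread pushes one token

    def close(self):
        self._queue.put(self._DONE)  # signal the consumer to stop

    def __iter__(self):
        while True:
            item = self._queue.get()  # blocks until a token arrives
            if item is self._DONE:
                return
            yield item

def producer(g: QueueBackedGenerator):
    for token in ("Hello", ",", " world"):
        g.send(token)
    g.close()

g = QueueBackedGenerator()
threading.Thread(target=producer, args=(g,)).start()
print("".join(g))  # prints: Hello, world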
@@ -25,6 +25,7 @@ port: int = None
 cli_args: List[str] = None
 query_cache = LRU()
 config_lock = threading.Lock()
+chat_lock = threading.Lock()
 SearchType = utils_config.SearchType
 telemetry: List[Dict[str, str]] = []
 previous_query: str = None
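Because `chat_lock` is defined at module level alongside `config_lock`, every request handler that imports `state` shares the same lock object, so wrapping each generate call in it serializes offline inference across threads. A self-contained demo of that serialization, where `slow_generate` is a hypothetical stand-in for the GPT4All call:

import threading
import time

chat_lock = threading.Lock()  # one module-level lock shared by all handlers

def slow_generate(prompt: str) -> str:
    time.sleep(0.1)  # stand-in for a compute-heavy generate() call
    return f"response to {prompt!r}"

def handle_chat(prompt: str, results: list) -> None:
    with chat_lock:  # only one inference runs at a time
        results.append(slow_generate(prompt))

results: list = []
threads = [threading.Thread(target=handle_chat, args=(f"q{i}", results)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(f"{len(results)} responses, produced one at a time")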