mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 13:21:18 +00:00
add a lock around chat operations to prevent the offline model from getting bombarded and stealing a bunch of compute resources
- This also solves #367
This commit is contained in:
@@ -10,6 +10,7 @@ from gpt4all import GPT4All
 from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
 from khoj.processor.conversation import prompts
 from khoj.utils.constants import empty_escape_sequences
+from khoj.utils import state

 logger = logging.getLogger(__name__)

@@ -58,7 +59,11 @@ def extract_questions_offline(
         next_christmas_date=next_christmas_date,
     )
     message = system_prompt + example_questions
-    response = gpt4all_model.generate(message, max_tokens=200, top_k=2, temp=0, n_batch=128)
+    state.chat_lock.acquire()
+    try:
+        response = gpt4all_model.generate(message, max_tokens=200, top_k=2, temp=0, n_batch=128)
+    finally:
+        state.chat_lock.release()

     # Extract, Clean Message from GPT's Response
     try:
@@ -162,6 +167,10 @@ def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
     templated_user_message = prompts.general_conversation_llamav2.format(query=user_message.content)
     prompted_message = templated_system_message + chat_history + templated_user_message
     response_iterator = model.generate(prompted_message, streaming=True, max_tokens=1000, n_batch=256)
-    for response in response_iterator:
-        g.send(response)
+    state.chat_lock.acquire()
+    try:
+        for response in response_iterator:
+            g.send(response)
+    finally:
+        state.chat_lock.release()
     g.close()

||||
@@ -25,6 +25,7 @@ port: int = None
 cli_args: List[str] = None
 query_cache = LRU()
 config_lock = threading.Lock()
+chat_lock = threading.Lock()
 SearchType = utils_config.SearchType
 telemetry: List[Dict[str, str]] = []
 previous_query: str = None

||||
Reference in New Issue
Block a user