diff --git a/docs/chat.md b/docs/chat.md
index b900d052..2efd7b1b 100644
--- a/docs/chat.md
+++ b/docs/chat.md
@@ -10,7 +10,8 @@
 Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.

 > **System Requirements**:
-> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
+> - Minimum **8 GB of RAM**; **16 GB of VRAM** recommended
+> - Minimum **5 GB of Disk** available
 > - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
 > - A Mac M1+ or [Vulkan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times

diff --git a/pyproject.toml b/pyproject.toml
index f4ae57f4..a5163d5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,8 +62,8 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 4.2.5",
     "authlib == 1.2.1",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.3",
diff --git a/src/interface/obsidian/src/chat_modal.ts b/src/interface/obsidian/src/chat_modal.ts
index ddf266fa..a8008048 100644
--- a/src/interface/obsidian/src/chat_modal.ts
+++ b/src/interface/obsidian/src/chat_modal.ts
@@ -38,7 +38,7 @@ export class KhojChatModal extends Modal {
         await this.getChatHistory();

         // Add chat input field
-        contentEl.createEl("input",
+        const chatInput = contentEl.createEl("input",
             {
                 attr: {
                     type: "text",
@@ -48,10 +48,11 @@
                     class: "khoj-chat-input option"
                 }
             })
-            .addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
+        chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

         // Scroll to bottom of modal, till the send message input box
         this.modalEl.scrollTop = this.modalEl.scrollHeight;
+        chatInput.focus();
     }

     generateReference(messageEl: any, reference: string, index: number) {
diff --git a/src/khoj/main.py b/src/khoj/main.py
index d434c461..33029b94 100644
--- a/src/khoj/main.py
+++ b/src/khoj/main.py
@@ -122,6 +122,7 @@ def set_state(args):
     state.demo = args.demo
     state.anonymous_mode = args.anonymous_mode
     state.khoj_version = version("khoj-assistant")
+    state.chat_on_gpu = args.chat_on_gpu


 def start_server(app, host=None, port=None, socket=None):
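
For reviewers, a hedged sanity check (not part of this patch) that the new default GGUF model loads under gpt4all >= 2.0.0. It assumes the gpt4all 2.x Python API, where GPT4All(model_name) downloads the model on first use:

    # Assumption: gpt4all 2.x bindings; generate() accepts max_tokens there.
    from gpt4all import GPT4All

    # Downloads mistral-7b-instruct-v0.1.Q4_0.gguf if not already cached
    model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")
    print(model.generate("Say hello", max_tokens=16))
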
+""" +import logging +from packaging import version + +from khoj.utils.yaml import load_config_from_file, save_config_to_file + + +logger = logging.getLogger(__name__) + + +def migrate_offline_chat_default_model(args): + schema_version = "0.12.4" + raw_config = load_config_from_file(args.config_file) + previous_version = raw_config.get("version") + + if "processor" not in raw_config: + return args + if raw_config["processor"] is None: + return args + if "conversation" not in raw_config["processor"]: + return args + if "offline-chat" not in raw_config["processor"]["conversation"]: + return args + if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]: + return args + + if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"): + logger.info( + f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF" + ) + raw_config["version"] = schema_version + + # Update offline chat model to mistral in GGUF format to use latest GPT4All + offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] + if offline_chat_model.endswith(".bin"): + raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf" + + save_config_to_file(raw_config, args.config_file) + return args diff --git a/src/khoj/migrations/migrate_server_pg.py b/src/khoj/migrations/migrate_server_pg.py index 0f6e518b..d19780f9 100644 --- a/src/khoj/migrations/migrate_server_pg.py +++ b/src/khoj/migrations/migrate_server_pg.py @@ -9,7 +9,7 @@ processor: conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json max-prompt-size: null offline-chat: - chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin + chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf enable-offline-chat: false openai: api-key: sk-blah @@ -46,7 +46,7 @@ processor: - chat-model: gpt-3.5-turbo tokenizer: null type: openai - - chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin + - chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf tokenizer: null type: offline search-type: diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index 7e92d002..04a004f0 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) def extract_questions_offline( text: str, - model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", + model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf", loaded_model: Union[Any, None] = None, conversation_log={}, use_history: bool = True, @@ -123,7 +123,7 @@ def converse_offline( references, user_query, conversation_log={}, - model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", + model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf", loaded_model: Union[Any, None] = None, completion_func=None, conversation_command=ConversationCommand.Default, diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index cd9bc9e2..0b876b26 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -1,5 +1,6 @@ import logging +from khoj.utils import state logger = logging.getLogger(__name__) @@ -16,8 +17,13 @@ def download_model(model_name: str): # Decide whether to load model to GPU or CPU try: - # Check if machine has GPU and GPU has enough free memory to load the chat model - device = "gpu" if 
gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu" + # Try load chat model to GPU if: + # 1. Loading chat model to GPU isn't disabled via CLI and + # 2. Machine has GPU + # 3. GPU has enough free memory to load the chat model + device = ( + "gpu" if state.chat_on_gpu and gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu" + ) except ValueError: device = "cpu" diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 83d51f2d..b0d401fa 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -20,9 +20,11 @@ model_to_prompt_size = { "gpt-4": 8192, "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548, "gpt-3.5-turbo-16k": 15000, + "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548, } model_to_tokenizer = { "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer", + "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1", } diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 06c3786a..c0928d5e 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -14,6 +14,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema from khoj.migrations.migrate_offline_model import migrate_offline_model from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema +from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model from khoj.migrations.migrate_server_pg import migrate_server_pg @@ -38,6 +39,9 @@ def cli(args=None): help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock", ) parser.add_argument("--version", "-V", action="store_true", help="Print the installed Khoj version and exit") + parser.add_argument( + "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model" + ) parser.add_argument("--demo", action="store_true", default=False, help="Run Khoj in demo mode") parser.add_argument( "--anonymous-mode", @@ -50,6 +54,9 @@ def cli(args=None): logger.debug(f"Ignoring unknown commandline args: {remaining_args}") + # Set default values for arguments + args.chat_on_gpu = not args.disable_chat_on_gpu + args.version_no = version("khoj-assistant") if args.version: # Show version of khoj installed and exit @@ -76,6 +83,7 @@ def run_migrations(args): migrate_processor_conversation_schema, migrate_offline_model, migrate_offline_chat_schema, + migrate_offline_chat_default_model, migrate_server_pg, ] for migration in migrations: diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index a469951f..48525ab2 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -84,7 +84,7 @@ class OpenAIProcessorConfig(ConfigBase): class OfflineChatProcessorConfig(ConfigBase): enable_offline_chat: Optional[bool] = False - chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf" class ConversationProcessorConfig(ConfigBase): diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index e92f19a7..1b830245 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -33,5 +33,6 @@ SearchType = utils_config.SearchType telemetry: List[Dict[str, str]] = [] demo: bool = False khoj_version: str = None -anonymous_mode: bool = False device = get_device() +chat_on_gpu: bool 
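
Pieced together from the hunks above, the new --disable-chat-on-gpu flag flows from CLI parsing to device selection roughly like this (condensed from this patch; gpu_is_usable abbreviates the gpt4all list_gpu check):

    args.chat_on_gpu = not args.disable_chat_on_gpu  # khoj/utils/cli.py
    state.chat_on_gpu = args.chat_on_gpu             # khoj/main.py
    # khoj/processor/conversation/gpt4all/utils.py
    device = "gpu" if state.chat_on_gpu and gpu_is_usable else "cpu"
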
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index a469951f..48525ab2 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -84,7 +84,7 @@ class OpenAIProcessorConfig(ConfigBase):

 class OfflineChatProcessorConfig(ConfigBase):
     enable_offline_chat: Optional[bool] = False
-    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 class ConversationProcessorConfig(ConfigBase):
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index e92f19a7..1b830245 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -33,5 +33,6 @@ SearchType = utils_config.SearchType
 telemetry: List[Dict[str, str]] = []
 demo: bool = False
 khoj_version: str = None
-anonymous_mode: bool = False
 device = get_device()
+chat_on_gpu: bool = True
+anonymous_mode: bool = False
diff --git a/tests/conftest.py b/tests/conftest.py
index a5f23dd2..80feaaac 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -169,7 +169,7 @@ def md_content_config():
     return markdown_config


-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def chat_client(search_config: SearchConfig, default_user2: KhojUser):
     # Initialize app state
     state.config.search_type = search_config
@@ -211,7 +211,7 @@ def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):

     # Initialize Processor from Config
     if os.getenv("OPENAI_API_KEY"):
-        OpenAIProcessorConversationConfigFactory(user=default_user2)
+        OpenAIProcessorConversationConfigFactory()

     state.anonymous_mode = True
diff --git a/tests/data/config.yml b/tests/data/config.yml
index c544eebe..2d642a09 100644
--- a/tests/data/config.yml
+++ b/tests/data/config.yml
@@ -14,4 +14,4 @@ search-type:
   asymmetric:
     cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
     encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
-version: 0.10.1
+version: 0.14.0
diff --git a/tests/helpers.py b/tests/helpers.py
index 968b1247..3aa7c435 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -37,7 +37,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):

     max_prompt_size = 2000
     tokenizer = None
-    chat_model = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
     model_type = "offline"
diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py
index 76ed26e7..782b54f2 100644
--- a/tests/test_gpt4all_chat_actors.py
+++ b/tests/test_gpt4all_chat_actors.py
@@ -24,7 +24,7 @@
 from khoj.processor.conversation.gpt4all.utils import download_model
 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 @pytest.fixture(scope="session")
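
Finally, a hedged sketch of how a session-scoped fixture in tests/test_gpt4all_chat_actors.py plausibly uses the renamed constant. The fixture body is an assumption based on the imports shown above; only MODEL_NAME and the session scope are confirmed by this patch:

    import pytest

    from khoj.processor.conversation.gpt4all.utils import download_model

    MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"

    @pytest.fixture(scope="session")
    def loaded_model():
        # Downloads the GGUF model once per test session and reuses it
        return download_model(MODEL_NAME)
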