Improve docker builds for local hosting (#476)

* Remove GPT4All dependency in pyproject.toml and use multiplatform builds in the dockerization setup in GH actions
* Move configure_search method into indexer
* Add conditional installation for gpt4all
* Add hint to go to localhost:42110 in the docs. Addresses #477
This commit is contained in:
sabaimran
2023-09-08 17:07:26 -07:00
committed by GitHub
parent dccfae3853
commit 343854752c
10 changed files with 122 additions and 42 deletions

View File

@@ -11,18 +11,16 @@ import schedule
from fastapi.staticfiles import StaticFiles
# Internal Packages
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import (
SearchType,
SearchModels,
ProcessorConfigModel,
ConversationProcessorConfigModel,
)
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
from khoj.utils.fs_syncer import collect_files
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ConversationProcessorConfig
from khoj.routers.indexer import configure_content, load_content
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
from khoj.routers.indexer import configure_content, load_content, configure_search
logger = logging.getLogger(__name__)
@@ -136,26 +134,6 @@ def configure_search_types(config: FullConfig):
return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Initialize the search models described by *search_config*.

    Returns None when no search configuration is available; otherwise returns
    the (possibly freshly created) SearchModels container with any configured
    text/image models loaded onto it.
    """
    # Nothing to configure without a search configuration
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None

    models = search_models if search_models is not None else SearchModels()

    # Text (asymmetric semantic) search model, when configured
    if search_config.asymmetric:
        logger.info("🔍 📜 Setting up text search model")
        models.text_search = text_search.initialize_model(search_config.asymmetric)

    # Image search model, when configured
    if search_config.image:
        logger.info("🔍 🌄 Setting up image search model")
        models.image_search = image_search.initialize_model(search_config.image)

    return models
def configure_processor(
processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None
):

View File

@@ -100,3 +100,7 @@ def poll_task_scheduler():
timer_thread.daemon = True
timer_thread.start()
schedule.run_pending()
# Script entry point: start the scheduler/server loop when run directly.
if __name__ == "__main__":
    run()

View File

@@ -1,12 +1,10 @@
from typing import Iterator, Union, List
from typing import Iterator, Union, List, Any
from datetime import datetime
import logging
from threading import Thread
from langchain.schema import ChatMessage
from gpt4all import GPT4All
from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
from khoj.processor.conversation import prompts
from khoj.utils.constants import empty_escape_sequences
@@ -19,7 +17,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline(
text: str,
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
loaded_model: Union[GPT4All, None] = None,
loaded_model: Union[Any, None] = None,
conversation_log={},
use_history: bool = True,
should_extract_questions: bool = True,
@@ -27,6 +25,15 @@ def extract_questions_offline(
"""
Infer search queries to retrieve relevant notes to answer user query
"""
try:
from gpt4all import GPT4All
except ModuleNotFoundError as e:
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e
# Assert that loaded_model is either None or of type GPT4All
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
all_questions = text.split("? ")
all_questions = [q + "?" for q in all_questions[:-1]] + [all_questions[-1]]
@@ -117,13 +124,20 @@ def converse_offline(
user_query,
conversation_log={},
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
loaded_model: Union[GPT4All, None] = None,
loaded_model: Union[Any, None] = None,
completion_func=None,
conversation_command=ConversationCommand.Default,
) -> Union[ThreadedGenerator, Iterator[str]]:
"""
Converse with user using Llama
"""
try:
from gpt4all import GPT4All
except ModuleNotFoundError as e:
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
gpt4all_model = loaded_model or GPT4All(model)
# Initialize Variables
compiled_references_message = "\n\n".join({f"{item}" for item in references})
@@ -152,7 +166,14 @@ def converse_offline(
return g
def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
def llm_thread(g, messages: List[ChatMessage], model: Any):
try:
from gpt4all import GPT4All
except ModuleNotFoundError as e:
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e
assert isinstance(model, GPT4All), "model should be of type GPT4All"
user_message = messages[-1]
system_message = messages[0]
conversation_history = messages[1:-1]

View File

@@ -3,7 +3,6 @@ import logging
import requests
import hashlib
from gpt4all import GPT4All
from tqdm import tqdm
from khoj.processor.conversation.gpt4all import model_metadata
@@ -22,6 +21,12 @@ def get_md5_checksum(filename: str):
def download_model(model_name: str):
try:
from gpt4all import GPT4All
except ModuleNotFoundError as e:
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e
url = model_metadata.model_name_to_url.get(model_name)
if not url:
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")

View File

@@ -1,6 +1,7 @@
# Standard Packages
import logging
import sys
import json
from typing import Optional, Union, Dict
# External Packages
@@ -8,7 +9,7 @@ from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
from pydantic import BaseModel
# Internal Packages
from khoj.utils import state
from khoj.utils import state, constants
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
@@ -18,11 +19,14 @@ from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.utils.rawconfig import ContentConfig, TextContentConfig
from khoj.search_type import text_search, image_search
from khoj.utils.yaml import save_config_to_file_updated_state
from khoj.utils.config import SearchModels
from khoj.utils.constants import default_config
from khoj.utils.helpers import LRU, get_file_type
from khoj.utils.rawconfig import (
ContentConfig,
FullConfig,
SearchConfig,
)
from khoj.search_filter.date_filter import DateFilter
from khoj.search_filter.word_filter import WordFilter
@@ -111,6 +115,28 @@ async def index_batch(
plaintext=plaintext_files,
)
if state.config == None:
logger.info("First run, initializing state.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
processor=None,
)
state.config = default_full_config
default_content_config = ContentConfig(
org=None,
markdown=None,
pdf=None,
image=None,
github=None,
notion=None,
plaintext=None,
plugins=None,
)
state.config.content_type = default_content_config
save_config_to_file_updated_state()
configure_search(state.search_models, state.config.search_type)
# Extract required fields from config
state.content_index = configure_content(
state.content_index,
@@ -129,6 +155,26 @@ async def index_batch(
return Response(content="OK", status_code=200)
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Initialize text/image search models from *search_config*.

    Returns None when no search configuration is available; otherwise returns
    the SearchModels container (created here if *search_models* was None) with
    any configured models loaded.
    """
    # Run Validation Checks
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None
    if search_models is None:
        search_models = SearchModels()

    # Initialize Search Models
    if search_config.asymmetric:
        logger.info("🔍 📜 Setting up text search model")
        search_models.text_search = text_search.initialize_model(search_config.asymmetric)

    if search_config.image:
        logger.info("🔍 🌄 Setting up image search model")
        search_models.image_search = image_search.initialize_model(search_config.image)

    return search_models
def configure_content(
content_index: Optional[ContentIndex],
content_config: Optional[ContentConfig],
@@ -138,6 +184,9 @@ def configure_content(
t: Optional[Union[state.SearchType, str]] = None,
full_corpus: bool = True,
) -> Optional[ContentIndex]:
def has_valid_text_config(config: TextContentConfig):
return config.input_files or config.input_filter
# Run Validation Checks
if content_config is None:
logger.warning("🚨 No Content configuration available.")
@@ -158,7 +207,7 @@ def configure_content(
# Initialize Org Notes Search
if (
(t == None or t == state.SearchType.Org.value)
and (content_config.org or files["org"])
and ((content_config.org and has_valid_text_config(content_config.org)) or files["org"])
and search_models.text_search
):
if content_config.org == None:
@@ -187,7 +236,7 @@ def configure_content(
# Initialize Markdown Search
if (
(t == None or t == state.SearchType.Markdown.value)
and (content_config.markdown or files["markdown"])
and ((content_config.markdown and has_valid_text_config(content_config.markdown)) or files["markdown"])
and search_models.text_search
and files["markdown"]
):
@@ -218,7 +267,7 @@ def configure_content(
# Initialize PDF Search
if (
(t == None or t == state.SearchType.Pdf.value)
and (content_config.pdf or files["pdf"])
and ((content_config.pdf and has_valid_text_config(content_config.pdf)) or files["pdf"])
and search_models.text_search
and files["pdf"]
):
@@ -249,7 +298,7 @@ def configure_content(
# Initialize Plaintext Search
if (
(t == None or t == state.SearchType.Plaintext.value)
and (content_config.plaintext or files["plaintext"])
and ((content_config.plaintext and has_valid_text_config(content_config.plaintext)) or files["plaintext"])
and search_models.text_search
and files["plaintext"]
):