mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 05:39:11 +00:00
Improve docker builds for local hosting (#476)
* Remove GPT4All dependency in pyproject.toml and use multiplatform builds in the dockerization setup in GH actions
* Move configure_search method into indexer
* Add conditional installation for gpt4all
* Add hint to go to localhost:42110 in the docs. Addresses #477
This commit is contained in:
@@ -11,18 +11,16 @@ import schedule
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
# Internal Packages
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import (
|
||||
SearchType,
|
||||
SearchModels,
|
||||
ProcessorConfigModel,
|
||||
ConversationProcessorConfigModel,
|
||||
)
|
||||
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
|
||||
from khoj.utils.fs_syncer import collect_files
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ConversationProcessorConfig
|
||||
from khoj.routers.indexer import configure_content, load_content
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
|
||||
from khoj.routers.indexer import configure_content, load_content, configure_search
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -136,26 +134,6 @@ def configure_search_types(config: FullConfig):
|
||||
return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))
|
||||
|
||||
|
||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Set up the search models enabled in the given search configuration.

    Logs a warning and returns None when `search_config` is None. Otherwise
    a SearchModels container is created if `search_models` is None, the text
    and/or image search model is initialized on it for each configured
    section, and the container is returned.
    """
    # Guard: nothing can be initialized without a search configuration
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None

    # Reuse the caller's container when one was passed in
    models = SearchModels() if search_models is None else search_models

    # Text search model (driven by the asymmetric search settings)
    text_model_config = search_config.asymmetric
    if text_model_config:
        logger.info("🔍 📜 Setting up text search model")
        models.text_search = text_search.initialize_model(text_model_config)

    # Image search model
    image_model_config = search_config.image
    if image_model_config:
        logger.info("🔍 🌄 Setting up image search model")
        models.image_search = image_search.initialize_model(image_model_config)

    return models
|
||||
|
||||
|
||||
def configure_processor(
|
||||
processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None
|
||||
):
|
||||
|
||||
@@ -100,3 +100,7 @@ def poll_task_scheduler():
|
||||
timer_thread.daemon = True
|
||||
timer_thread.start()
|
||||
schedule.run_pending()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
from typing import Iterator, Union, List
|
||||
from typing import Iterator, Union, List, Any
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from threading import Thread
|
||||
|
||||
from langchain.schema import ChatMessage
|
||||
|
||||
from gpt4all import GPT4All
|
||||
|
||||
from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
|
||||
from khoj.processor.conversation import prompts
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
@@ -19,7 +17,7 @@ logger = logging.getLogger(__name__)
|
||||
def extract_questions_offline(
|
||||
text: str,
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
loaded_model: Union[GPT4All, None] = None,
|
||||
loaded_model: Union[Any, None] = None,
|
||||
conversation_log={},
|
||||
use_history: bool = True,
|
||||
should_extract_questions: bool = True,
|
||||
@@ -27,6 +25,15 @@ def extract_questions_offline(
|
||||
"""
|
||||
Infer search queries to retrieve relevant notes to answer user query
|
||||
"""
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
# Assert that loaded_model is either None or of type GPT4All
|
||||
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
|
||||
|
||||
all_questions = text.split("? ")
|
||||
all_questions = [q + "?" for q in all_questions[:-1]] + [all_questions[-1]]
|
||||
|
||||
@@ -117,13 +124,20 @@ def converse_offline(
|
||||
user_query,
|
||||
conversation_log={},
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
loaded_model: Union[GPT4All, None] = None,
|
||||
loaded_model: Union[Any, None] = None,
|
||||
completion_func=None,
|
||||
conversation_command=ConversationCommand.Default,
|
||||
) -> Union[ThreadedGenerator, Iterator[str]]:
|
||||
"""
|
||||
Converse with user using Llama
|
||||
"""
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
|
||||
gpt4all_model = loaded_model or GPT4All(model)
|
||||
# Initialize Variables
|
||||
compiled_references_message = "\n\n".join({f"{item}" for item in references})
|
||||
@@ -152,7 +166,14 @@ def converse_offline(
|
||||
return g
|
||||
|
||||
|
||||
def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
|
||||
def llm_thread(g, messages: List[ChatMessage], model: Any):
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
assert isinstance(model, GPT4All), "model should be of type GPT4All"
|
||||
user_message = messages[-1]
|
||||
system_message = messages[0]
|
||||
conversation_history = messages[1:-1]
|
||||
|
||||
@@ -3,7 +3,6 @@ import logging
|
||||
import requests
|
||||
import hashlib
|
||||
|
||||
from gpt4all import GPT4All
|
||||
from tqdm import tqdm
|
||||
|
||||
from khoj.processor.conversation.gpt4all import model_metadata
|
||||
@@ -22,6 +21,12 @@ def get_md5_checksum(filename: str):
|
||||
|
||||
|
||||
def download_model(model_name: str):
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
url = model_metadata.model_name_to_url.get(model_name)
|
||||
if not url:
|
||||
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Standard Packages
|
||||
import logging
|
||||
import sys
|
||||
import json
|
||||
from typing import Optional, Union, Dict
|
||||
|
||||
# External Packages
|
||||
@@ -8,7 +9,7 @@ from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import state
|
||||
from khoj.utils import state, constants
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
@@ -18,11 +19,14 @@ from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.utils.rawconfig import ContentConfig, TextContentConfig
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.constants import default_config
|
||||
from khoj.utils.helpers import LRU, get_file_type
|
||||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
FullConfig,
|
||||
SearchConfig,
|
||||
)
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.search_filter.word_filter import WordFilter
|
||||
@@ -111,6 +115,28 @@ async def index_batch(
|
||||
plaintext=plaintext_files,
|
||||
)
|
||||
|
||||
if state.config == None:
|
||||
logger.info("First run, initializing state.")
|
||||
default_full_config = FullConfig(
|
||||
content_type=None,
|
||||
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
|
||||
processor=None,
|
||||
)
|
||||
state.config = default_full_config
|
||||
default_content_config = ContentConfig(
|
||||
org=None,
|
||||
markdown=None,
|
||||
pdf=None,
|
||||
image=None,
|
||||
github=None,
|
||||
notion=None,
|
||||
plaintext=None,
|
||||
plugins=None,
|
||||
)
|
||||
state.config.content_type = default_content_config
|
||||
save_config_to_file_updated_state()
|
||||
configure_search(state.search_models, state.config.search_type)
|
||||
|
||||
# Extract required fields from config
|
||||
state.content_index = configure_content(
|
||||
state.content_index,
|
||||
@@ -129,6 +155,26 @@ async def index_batch(
|
||||
return Response(content="OK", status_code=200)
|
||||
|
||||
|
||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Set up the search models enabled in the given search configuration.

    Logs a warning and returns None when `search_config` is None. Otherwise
    a SearchModels container is created if `search_models` is None, the text
    and/or image search model is initialized on it for each configured
    section, and the container is returned.
    """
    # Guard: nothing can be initialized without a search configuration
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None

    # Reuse the caller's container when one was passed in
    models = SearchModels() if search_models is None else search_models

    # Text search model (driven by the asymmetric search settings)
    text_model_config = search_config.asymmetric
    if text_model_config:
        logger.info("🔍 📜 Setting up text search model")
        models.text_search = text_search.initialize_model(text_model_config)

    # Image search model
    image_model_config = search_config.image
    if image_model_config:
        logger.info("🔍 🌄 Setting up image search model")
        models.image_search = image_search.initialize_model(image_model_config)

    return models
|
||||
|
||||
|
||||
def configure_content(
|
||||
content_index: Optional[ContentIndex],
|
||||
content_config: Optional[ContentConfig],
|
||||
@@ -138,6 +184,9 @@ def configure_content(
|
||||
t: Optional[Union[state.SearchType, str]] = None,
|
||||
full_corpus: bool = True,
|
||||
) -> Optional[ContentIndex]:
|
||||
def has_valid_text_config(config: TextContentConfig):
    """Return the config's input files if truthy, else its input filter.

    The result is truthy only when the text content config names at least
    one of `input_files` or `input_filter`.
    """
    input_files = config.input_files
    if input_files:
        return input_files
    return config.input_filter
|
||||
|
||||
# Run Validation Checks
|
||||
if content_config is None:
|
||||
logger.warning("🚨 No Content configuration available.")
|
||||
@@ -158,7 +207,7 @@ def configure_content(
|
||||
# Initialize Org Notes Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Org.value)
|
||||
and (content_config.org or files["org"])
|
||||
and ((content_config.org and has_valid_text_config(content_config.org)) or files["org"])
|
||||
and search_models.text_search
|
||||
):
|
||||
if content_config.org == None:
|
||||
@@ -187,7 +236,7 @@ def configure_content(
|
||||
# Initialize Markdown Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Markdown.value)
|
||||
and (content_config.markdown or files["markdown"])
|
||||
and ((content_config.markdown and has_valid_text_config(content_config.markdown)) or files["markdown"])
|
||||
and search_models.text_search
|
||||
and files["markdown"]
|
||||
):
|
||||
@@ -218,7 +267,7 @@ def configure_content(
|
||||
# Initialize PDF Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Pdf.value)
|
||||
and (content_config.pdf or files["pdf"])
|
||||
and ((content_config.pdf and has_valid_text_config(content_config.pdf)) or files["pdf"])
|
||||
and search_models.text_search
|
||||
and files["pdf"]
|
||||
):
|
||||
@@ -249,7 +298,7 @@ def configure_content(
|
||||
# Initialize Plaintext Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Plaintext.value)
|
||||
and (content_config.plaintext or files["plaintext"])
|
||||
and ((content_config.plaintext and has_valid_text_config(content_config.plaintext)) or files["plaintext"])
|
||||
and search_models.text_search
|
||||
and files["plaintext"]
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user