diff --git a/src/khoj/configure.py b/src/khoj/configure.py index a72f15d5..40d1eeb5 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -50,13 +50,11 @@ from khoj.database.adapters import ( ) from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel -from khoj.routers.api_content import configure_content, configure_search +from khoj.routers.api_content import configure_content from khoj.routers.twilio import is_twilio_enabled from khoj.utils import constants, state from khoj.utils.config import SearchType -from khoj.utils.fs_syncer import collect_files -from khoj.utils.helpers import is_none_or_empty, telemetry_disabled -from khoj.utils.rawconfig import FullConfig +from khoj.utils.helpers import is_none_or_empty logger = logging.getLogger(__name__) @@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend): return AuthCredentials(), UnauthenticatedUser() -def initialize_server(config: Optional[FullConfig]): - try: - configure_server(config, init=True) - except Exception as e: - logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True) - raise e - - def clean_connections(func): """ A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed @@ -260,19 +250,7 @@ def clean_connections(func): return func_wrapper -def configure_server( - config: FullConfig, - regenerate: bool = False, - search_type: Optional[SearchType] = None, - init=False, - user: KhojUser = None, -): - # Update Config - if config == None: - logger.info(f"Initializing with default config.") - config = FullConfig() - state.config = config - +def initialize_server(): if ConversationAdapters.has_valid_ai_model_api(): ai_model_api = ConversationAdapters.get_ai_model_api() state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url) @@ -309,43 +287,33 @@ def configure_server( ) state.SearchType = configure_search_types() - state.search_models = configure_search(state.search_models, state.config.search_type) - setup_default_agent(user) + setup_default_agent() - message = ( - "📡 Telemetry disabled" - if telemetry_disabled(state.config.app, state.telemetry_disabled) - else "📡 Telemetry enabled" - ) + message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled" logger.info(message) - if not init: - initialize_content(user, regenerate, search_type) - except Exception as e: logger.error(f"Failed to load some search models: {e}", exc_info=True) -def setup_default_agent(user: KhojUser): - AgentAdapters.create_default_agent(user) +def setup_default_agent(): + AgentAdapters.create_default_agent() def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None): # Initialize Content from Config - if state.search_models: - try: - logger.info("📬 Updating content index...") - all_files = collect_files(user=user) - status = configure_content( - user, - all_files, - regenerate, - search_type, - ) - if not status: - raise RuntimeError("Failed to update content index") - except Exception as e: - raise e + try: + logger.info("📬 Updating content index...") + status = configure_content( + user, + {}, + regenerate, + search_type, + ) + if not status: + raise RuntimeError("Failed to update content index") + except Exception as e: + raise e def configure_routes(app): @@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False): def update_content_index(): for user in get_all_users(): - all_files = collect_files(user=user) - success = configure_content(user, all_files) + success = configure_content(user, {}) if not success: raise RuntimeError("Failed to update content index") logger.info("📪 Content index updated via Scheduler") @@ -464,7 +431,7 @@ def configure_search_types(): @schedule.repeat(schedule.every(2).minutes) @clean_connections def upload_telemetry(): - if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry: + if state.telemetry_disabled or not state.telemetry: return try: diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 76d7578b..6d92b8e9 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -788,8 +788,8 @@ class AgentAdapters: return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first() @staticmethod - def create_default_agent(user: KhojUser): - default_chat_model = ConversationAdapters.get_default_chat_model(user) + def create_default_agent(): + default_chat_model = ConversationAdapters.get_default_chat_model(user=None) if default_chat_model is None: logger.info("No default conversation config found, skipping default agent creation") return None diff --git a/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py b/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py new file mode 100644 index 00000000..ad3409cf --- /dev/null +++ b/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 5.1.10 on 2025-07-25 23:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"), + ] + + operations = [ + migrations.RemoveField( + model_name="localorgconfig", + name="user", + ), + migrations.RemoveField( + model_name="localpdfconfig", + name="user", + ), + migrations.RemoveField( + model_name="localplaintextconfig", + name="user", + ), + migrations.DeleteModel( + name="LocalMarkdownConfig", + ), + migrations.DeleteModel( + name="LocalOrgConfig", + ), + migrations.DeleteModel( + name="LocalPdfConfig", + ), + migrations.DeleteModel( + name="LocalPlaintextConfig", + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 03b43376..1ed58572 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel): super().save(*args, **kwargs) -class LocalOrgConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalMarkdownConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalPdfConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalPlaintextConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - class SearchModelConfig(DbBaseModel): class ModelType(models.TextChoices): TEXT = "text" diff --git a/src/khoj/main.py b/src/khoj/main.py index d5918a14..f42ae135 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -138,10 +138,10 @@ def run(should_start_server=True): initialization(not args.non_interactive) # Create app directory, if it doesn't exist - state.config_file.parent.mkdir(parents=True, exist_ok=True) + state.log_file.parent.mkdir(parents=True, exist_ok=True) # Set Log File - fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8") + fh = logging.FileHandler(state.log_file, encoding="utf-8") fh.setLevel(logging.DEBUG) logger.addHandler(fh) @@ -194,7 +194,7 @@ def run(should_start_server=True): # Configure Middleware configure_middleware(app, state.ssl_config) - initialize_server(args.config) + initialize_server() # If the server is started through gunicorn (external to the script), don't start the server if should_start_server: @@ -204,8 +204,7 @@ def run(should_start_server=True): def set_state(args): - state.config_file = args.config_file - state.config = args.config + state.log_file = args.log_file state.verbose = args.verbose state.host = args.host state.port = args.port diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index 31f99f84..63ed50c6 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -20,7 +20,6 @@ magika = Magika() class GithubToEntries(TextToEntries): def __init__(self, config: GithubConfig): - super().__init__(config) raw_repos = config.githubrepoconfig.all() repos = [] for repo in raw_repos: diff --git a/src/khoj/processor/content/notion/notion_to_entries.py b/src/khoj/processor/content/notion/notion_to_entries.py index 1e1ab4d3..23b96f63 100644 --- a/src/khoj/processor/content/notion/notion_to_entries.py +++ b/src/khoj/processor/content/notion/notion_to_entries.py @@ -47,7 +47,6 @@ class NotionBlockType(Enum): class NotionToEntries(TextToEntries): def __init__(self, config: NotionConfig): - super().__init__(config) self.config = NotionContentConfig( token=config.token, ) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 0ceda11d..0369d273 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -27,7 +27,6 @@ logger = logging.getLogger(__name__) class TextToEntries(ABC): def __init__(self, config: Any = None): self.embeddings_model = state.embeddings_model - self.config = config self.date_filter = DateFilter() @abstractmethod diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index b1562e64..44c2f2b7 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -87,22 +87,14 @@ def update( force: Optional[bool] = False, ): user = request.user.object - if not state.config: - error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}." - logger.warning(error_msg) - raise HTTPException(status_code=500, detail=error_msg) try: initialize_content(user=user, regenerate=force, search_type=t) except Exception as e: - error_msg = f"🚨 Failed to update server via API: {e}" + error_msg = f"🚨 Failed to update server indexed content via API: {e}" logger.error(error_msg, exc_info=True) raise HTTPException(status_code=500, detail=error_msg) else: - components = [] - if state.search_models: - components.append("Search models") - components_msg = ", ".join(components) - logger.info(f"📪 {components_msg} updated via API") + logger.info(f"📪 Server indexed content updated via API") update_telemetry_state( request=request, diff --git a/src/khoj/routers/api_content.py b/src/khoj/routers/api_content.py index 4f9cc407..c2732ec8 100644 --- a/src/khoj/routers/api_content.py +++ b/src/khoj/routers/api_content.py @@ -27,16 +27,7 @@ from khoj.database.adapters import ( get_user_notion_config, ) from khoj.database.models import Entry as DbEntry -from khoj.database.models import ( - GithubConfig, - GithubRepoConfig, - KhojUser, - LocalMarkdownConfig, - LocalOrgConfig, - LocalPdfConfig, - LocalPlaintextConfig, - NotionConfig, -) +from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig from khoj.processor.content.docx.docx_to_entries import DocxToEntries from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries from khoj.routers.helpers import ( @@ -47,17 +38,9 @@ from khoj.routers.helpers import ( get_user_config, update_telemetry_state, ) -from khoj.utils import constants, state -from khoj.utils.config import SearchModels -from khoj.utils.rawconfig import ( - ContentConfig, - FullConfig, - GithubContentConfig, - NotionContentConfig, - SearchConfig, -) +from khoj.utils import state +from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig from khoj.utils.state import SearchType -from khoj.utils.yaml import save_config_to_file_updated_state logger = logging.getLogger(__name__) @@ -192,8 +175,6 @@ async def set_content_github( updated_config: Union[GithubContentConfig, None], client: Optional[str] = None, ): - _initialize_config() - user = request.user.object try: @@ -225,8 +206,6 @@ async def set_content_notion( updated_config: Union[NotionContentConfig, None], client: Optional[str] = None, ): - _initialize_config() - user = request.user.object try: @@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None): configured_content_types = set(EntryAdapters.get_unique_file_types(user)) configured_content_types |= {"all"} - if state.config and state.config.content_type: - for ctype in state.config.content_type.model_dump(exclude_none=True): - configured_content_types.add(ctype) - return list(configured_content_types & all_content_types) @@ -606,28 +581,6 @@ async def indexer( docx=index_files["docx"], ) - if state.config == None: - logger.info("📬 Initializing content index on first run.") - default_full_config = FullConfig( - content_type=None, - search_type=SearchConfig.model_validate(constants.default_config["search-type"]), - processor=None, - ) - state.config = default_full_config - default_content_config = ContentConfig( - org=None, - markdown=None, - pdf=None, - docx=None, - image=None, - github=None, - notion=None, - plaintext=None, - ) - state.config.content_type = default_content_config - save_config_to_file_updated_state() - configure_search(state.search_models, state.config.search_type) - loop = asyncio.get_event_loop() success = await loop.run_in_executor( None, @@ -674,14 +627,6 @@ async def indexer( return Response(content=indexed_filenames, status_code=200) -def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]: - # Run Validation Checks - if search_models is None: - search_models = SearchModels() - - return search_models - - def map_config_to_object(content_source: str): if content_source == DbEntry.EntrySource.GITHUB: return GithubConfig @@ -689,56 +634,3 @@ def map_config_to_object(content_source: str): return NotionConfig if content_source == DbEntry.EntrySource.COMPUTER: return "Computer" - - -async def map_config_to_db(config: FullConfig, user: KhojUser): - if config.content_type: - if config.content_type.org: - await LocalOrgConfig.objects.filter(user=user).adelete() - await LocalOrgConfig.objects.acreate( - input_files=config.content_type.org.input_files, - input_filter=config.content_type.org.input_filter, - index_heading_entries=config.content_type.org.index_heading_entries, - user=user, - ) - if config.content_type.markdown: - await LocalMarkdownConfig.objects.filter(user=user).adelete() - await LocalMarkdownConfig.objects.acreate( - input_files=config.content_type.markdown.input_files, - input_filter=config.content_type.markdown.input_filter, - index_heading_entries=config.content_type.markdown.index_heading_entries, - user=user, - ) - if config.content_type.pdf: - await LocalPdfConfig.objects.filter(user=user).adelete() - await LocalPdfConfig.objects.acreate( - input_files=config.content_type.pdf.input_files, - input_filter=config.content_type.pdf.input_filter, - index_heading_entries=config.content_type.pdf.index_heading_entries, - user=user, - ) - if config.content_type.plaintext: - await LocalPlaintextConfig.objects.filter(user=user).adelete() - await LocalPlaintextConfig.objects.acreate( - input_files=config.content_type.plaintext.input_files, - input_filter=config.content_type.plaintext.input_filter, - index_heading_entries=config.content_type.plaintext.index_heading_entries, - user=user, - ) - if config.content_type.github: - await adapters.set_user_github_config( - user=user, - pat_token=config.content_type.github.pat_token, - repos=config.content_type.github.repos, - ) - if config.content_type.notion: - await adapters.set_notion_config( - user=user, - token=config.content_type.notion.token, - ) - - -def _initialize_config(): - if state.config is None: - state.config = FullConfig() - state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"]) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 7833d3d8..8dcda86b 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -218,7 +218,6 @@ def update_telemetry_state( telemetry_type=telemetry_type, api=api, client=client, - app_config=state.config.app, disable_telemetry_env=state.telemetry_disabled, properties=user_state, ) @@ -2726,7 +2725,8 @@ def configure_content( search_type = t.value if t else None - no_documents = all([not files.get(file_type) for file_type in files]) + # Check if client sent any documents of the supported types + no_client_sent_documents = all([not files.get(file_type) for file_type in files]) if files is None: logger.warning(f"🚨 No files to process for {search_type} search.") @@ -2800,7 +2800,8 @@ def configure_content( success = False try: - if no_documents: + # Run server side indexing of user Github docs if no client sent documents + if no_client_sent_documents: github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() if ( search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value @@ -2820,7 +2821,8 @@ def configure_content( success = False try: - if no_documents: + # Run server side indexing of user Notion docs if no client sent documents + if no_client_sent_documents: # Initialize Notion Search notion_config = NotionConfig.objects.filter(user=user).first() if ( diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 66cdda74..a016f9a4 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -1,26 +1,19 @@ import argparse import logging -import os import pathlib from importlib.metadata import version logger = logging.getLogger(__name__) -from khoj.utils.helpers import is_env_var_true, resolve_absolute_path -from khoj.utils.yaml import parse_config_from_file - def cli(args=None): # Setup Argument Parser for the Commandline Interface parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain") parser.add_argument( - "--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" - ) - parser.add_argument( - "--regenerate", - action="store_true", - default=False, - help="Regenerate model embeddings from source files. Default: false", + "--log-file", + default="~/.khoj/khoj.log", + type=pathlib.Path, + help="File path for server logs. Default: ~/.khoj/khoj.log", ) parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0") parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1") @@ -37,7 +30,7 @@ def cli(args=None): "--anonymous-mode", action="store_true", default=False, - help="Run Khoj in anonymous mode. This does not require any login for connecting users.", + help="Run Khoj in single user mode with no login required. Useful for personal use or testing.", ) parser.add_argument( "--non-interactive", @@ -57,15 +50,4 @@ def cli(args=None): print(args.version_no) exit(0) - # Normalize config_file path to absolute path - args.config_file = resolve_absolute_path(args.config_file) - - if not args.config_file.exists(): - args.config = None - else: - args = run_migrations(args) - args.config = parse_config_from_file(args.config_file) - if is_env_var_true("KHOJ_TELEMETRY_DISABLE"): - args.config.app.should_log_telemetry = False - return args diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index d1b6f20a..c9cd1c43 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -1,20 +1,7 @@ # System Packages from __future__ import annotations # to avoid quoting type hints -import logging -from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, List, Optional, Union - -import torch - -logger = logging.getLogger(__name__) - - -if TYPE_CHECKING: - from sentence_transformers import CrossEncoder - - from khoj.utils.models import BaseEncoder class SearchType(str, Enum): @@ -27,36 +14,3 @@ class SearchType(str, Enum): Notion = "notion" Plaintext = "plaintext" Docx = "docx" - - -class ProcessorType(str, Enum): - Conversation = "conversation" - - -@dataclass -class TextContent: - enabled: bool - - -@dataclass -class ImageContent: - image_names: List[str] - image_embeddings: torch.Tensor - image_metadata_embeddings: torch.Tensor - - -@dataclass -class TextSearchModel: - bi_encoder: BaseEncoder - cross_encoder: Optional[CrossEncoder] = None - top_k: Optional[int] = 15 - - -@dataclass -class ImageSearchModel: - image_encoder: BaseEncoder - - -@dataclass -class SearchModels: - text_search: Optional[TextSearchModel] = None diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py deleted file mode 100644 index 67e91bc9..00000000 --- a/src/khoj/utils/fs_syncer.py +++ /dev/null @@ -1,252 +0,0 @@ -import glob -import logging -import os -from pathlib import Path -from typing import Optional - -from bs4 import BeautifulSoup -from magika import Magika - -from khoj.database.models import ( - KhojUser, - LocalMarkdownConfig, - LocalOrgConfig, - LocalPdfConfig, - LocalPlaintextConfig, -) -from khoj.utils.config import SearchType -from khoj.utils.helpers import get_absolute_path, is_none_or_empty -from khoj.utils.rawconfig import TextContentConfig - -logger = logging.getLogger(__name__) -magika = Magika() - - -def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict: - files: dict[str, dict] = {"docx": {}, "image": {}} - - if search_type == SearchType.All or search_type == SearchType.Org: - org_config = LocalOrgConfig.objects.filter(user=user).first() - files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {} - if search_type == SearchType.All or search_type == SearchType.Markdown: - markdown_config = LocalMarkdownConfig.objects.filter(user=user).first() - files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {} - if search_type == SearchType.All or search_type == SearchType.Plaintext: - plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first() - files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {} - if search_type == SearchType.All or search_type == SearchType.Pdf: - pdf_config = LocalPdfConfig.objects.filter(user=user).first() - files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {} - files["image"] = {} - files["docx"] = {} - return files - - -def construct_config_from_db(db_config) -> TextContentConfig: - return TextContentConfig( - input_files=db_config.input_files, - input_filter=db_config.input_filter, - index_heading_entries=db_config.index_heading_entries, - ) - - -def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: - def is_plaintextfile(file: str): - "Check if file is plaintext file" - # Check if file path exists - content_group = magika.identify_path(Path(file)).output.group - # Use file extension to decide plaintext if file content is not identifiable - valid_text_file_extensions = ("txt", "md", "markdown", "org" "mbox", "rst", "html", "htm", "xml") - return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"] - - def extract_html_content(html_content: str): - "Extract content from HTML" - soup = BeautifulSoup(html_content, "html.parser") - return soup.get_text(strip=True, separator="\n") - - # Extract required fields from config - input_files, input_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(input_files) and is_none_or_empty(input_filters): - logger.debug("At least one of input-files or input-file-filter is required to be specified") - return {} - - # Get all plain text files to process - absolute_plaintext_files, filtered_plaintext_files = set(), set() - if input_files: - absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files} - if input_filters: - filtered_plaintext_files = { - filtered_file - for plaintext_file_filter in input_filters - for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) - - files_with_no_plaintext_extensions = { - target_files for target_files in all_target_files if not is_plaintextfile(target_files) - } - if any(files_with_no_plaintext_extensions): - logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}") - all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions) - - logger.debug(f"Processing files: {all_target_files}") - - filename_to_content_map = {} - for file in all_target_files: - with open(file, "r", encoding="utf8") as f: - try: - plaintext_content = f.read() - if file.endswith(("html", "htm", "xml")): - plaintext_content = extract_html_content(plaintext_content) - filename_to_content_map[file] = plaintext_content - except Exception as e: - logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_org_files(config: TextContentConfig): - # Extract required fields from config - org_files, org_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters): - logger.debug("At least one of org-files or org-file-filter is required to be specified") - return {} - - # Get Org files to process - absolute_org_files, filtered_org_files = set(), set() - if org_files: - absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} - if org_file_filters: - filtered_org_files = { - filtered_file - for org_file_filter in org_file_filters - for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_org_files = sorted(absolute_org_files | filtered_org_files) - - files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")} - if any(files_with_non_org_extensions): - logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}") - - logger.debug(f"Processing files: {all_org_files}") - - filename_to_content_map = {} - for file in all_org_files: - with open(file, "r", encoding="utf8") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as org. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_markdown_files(config: TextContentConfig): - # Extract required fields from config - markdown_files, markdown_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters): - logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") - return {} - - # Get markdown files to process - absolute_markdown_files, filtered_markdown_files = set(), set() - if markdown_files: - absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} - - if markdown_file_filters: - filtered_markdown_files = { - filtered_file - for markdown_file_filter in markdown_file_filters - for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) - - files_with_non_markdown_extensions = { - md_file for md_file in all_markdown_files if not md_file.endswith(".md") and not md_file.endswith(".markdown") - } - - if any(files_with_non_markdown_extensions): - logger.warning( - f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}" - ) - - logger.debug(f"Processing files: {all_markdown_files}") - - filename_to_content_map = {} - for file in all_markdown_files: - with open(file, "r", encoding="utf8") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as markdown. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_pdf_files(config: TextContentConfig): - # Extract required fields from config - pdf_files, pdf_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters): - logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified") - return {} - - # Get PDF files to process - absolute_pdf_files, filtered_pdf_files = set(), set() - if pdf_files: - absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} - if pdf_file_filters: - filtered_pdf_files = { - filtered_file - for pdf_file_filter in pdf_file_filters - for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) - - files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")} - - if any(files_with_non_pdf_extensions): - logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}") - - logger.debug(f"Processing files: {all_pdf_files}") - - filename_to_content_map = {} - for file in all_pdf_files: - with open(file, "rb") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as PDF. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 508901de..523ec007 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -47,7 +47,6 @@ if TYPE_CHECKING: from sentence_transformers import CrossEncoder, SentenceTransformer from khoj.utils.models import BaseEncoder - from khoj.utils.rawconfig import AppConfig logger = logging.getLogger(__name__) @@ -267,23 +266,16 @@ def get_server_id(): return server_id -def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool: - if telemetry_disable_env is True: - return True - return not app_config or not app_config.should_log_telemetry - - def log_telemetry( telemetry_type: str, api: str = None, client: Optional[str] = None, - app_config: Optional[AppConfig] = None, disable_telemetry_env: bool = False, properties: dict = None, ): """Log basic app usage telemetry like client, os, api called""" # Do not log usage telemetry, if telemetry is disabled via app config - if telemetry_disabled(app_config, disable_telemetry_env): + if disable_telemetry_env: return [] if properties.get("server_id") is None: diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py index 336b228d..8023b3ed 100644 --- a/src/khoj/utils/initialization.py +++ b/src/khoj/utils/initialization.py @@ -147,24 +147,6 @@ def initialization(interactive: bool = True): logger.info("🗣️ Chat model configuration complete") - # Set up offline speech to text model - use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ") - if use_offline_speech2text_model == "y": - logger.info("🗣️ Setting up offline speech to text model") - # Delete any existing speech to text model options. There can only be one. - SpeechToTextModelOptions.objects.all().delete() - - default_offline_speech2text_model = "base" - offline_speech2text_model = input( - f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): " - ) - offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model - SpeechToTextModelOptions.objects.create( - model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE - ) - - logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}") - def _setup_chat_model_provider( model_type: ChatModel.ModelType, default_chat_models: list, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 0148511a..5377577b 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel): conversation_id: str -class TextConfigBase(ConfigBase): - compressed_jsonl: Path - embeddings_file: Path - - -class TextContentConfig(ConfigBase): - input_files: Optional[List[Path]] = None - input_filter: Optional[List[str]] = None - index_heading_entries: Optional[bool] = False - - class GithubRepoConfig(ConfigBase): name: str owner: str @@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase): token: str -class ContentConfig(ConfigBase): - org: Optional[TextContentConfig] = None - markdown: Optional[TextContentConfig] = None - pdf: Optional[TextContentConfig] = None - plaintext: Optional[TextContentConfig] = None - github: Optional[GithubContentConfig] = None - notion: Optional[NotionContentConfig] = None - image: Optional[TextContentConfig] = None - docx: Optional[TextContentConfig] = None - - -class ImageSearchConfig(ConfigBase): - encoder: str - encoder_type: Optional[str] = None - model_directory: Optional[Path] = None - - class Config: - protected_namespaces = () - - -class SearchConfig(ConfigBase): - image: Optional[ImageSearchConfig] = None - - -class OpenAIProcessorConfig(ConfigBase): - api_key: str - chat_model: Optional[str] = "gpt-4o-mini" - - -class ConversationProcessorConfig(ConfigBase): - openai: Optional[OpenAIProcessorConfig] = None - max_prompt_size: Optional[int] = None - tokenizer: Optional[str] = None - - -class ProcessorConfig(ConfigBase): - conversation: Optional[ConversationProcessorConfig] = None - - -class AppConfig(ConfigBase): - should_log_telemetry: bool = True - - -class FullConfig(ConfigBase): - content_type: Optional[ContentConfig] = None - search_type: Optional[SearchConfig] = None - processor: Optional[ProcessorConfig] = None - app: Optional[AppConfig] = AppConfig() - version: Optional[str] = None - - class SearchResponse(ConfigBase): entry: str score: float diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index 6acd3d65..3b65a85b 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -12,18 +12,14 @@ from whisper import Whisper from khoj.database.models import ProcessLock from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.utils import config as utils_config -from khoj.utils.config import SearchModels from khoj.utils.helpers import LRU, get_device, is_env_var_true -from khoj.utils.rawconfig import FullConfig # Application Global State -config = FullConfig() -search_models = SearchModels() embeddings_model: Dict[str, EmbeddingsModel] = None cross_encoder_model: Dict[str, CrossEncoderModel] = None openai_client: OpenAI = None whisper_model: Whisper = None -config_file: Path = None +log_file: Path = None verbose: int = 0 host: str = None port: int = None diff --git a/src/khoj/utils/yaml.py b/src/khoj/utils/yaml.py index f658e1eb..43b139e5 100644 --- a/src/khoj/utils/yaml.py +++ b/src/khoj/utils/yaml.py @@ -1,47 +1,8 @@ -from pathlib import Path - import yaml -from khoj.utils import state -from khoj.utils.rawconfig import FullConfig - # Do not emit tags when dumping to YAML yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment] -def save_config_to_file_updated_state(): - with open(state.config_file, "w") as outfile: - yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile) - outfile.close() - return state.config - - -def save_config_to_file(yaml_config: dict, yaml_config_file: Path): - "Write config to YML file" - # Create output directory, if it doesn't exist - yaml_config_file.parent.mkdir(parents=True, exist_ok=True) - - with open(yaml_config_file, "w", encoding="utf-8") as config_file: - yaml.safe_dump(yaml_config, config_file, allow_unicode=True) - - -def load_config_from_file(yaml_config_file: Path) -> dict: - "Read config from YML file" - config_from_file = None - with open(yaml_config_file, "r", encoding="utf-8") as config_file: - config_from_file = yaml.safe_load(config_file) - return config_from_file - - -def parse_config_from_string(yaml_config: dict) -> FullConfig: - "Parse and validate config in YML string" - return FullConfig.model_validate(yaml_config) - - -def parse_config_from_file(yaml_config_file): - "Parse and validate config in YML file" - return parse_config_from_string(load_config_from_file(yaml_config_file)) - - def yaml_dump(data): return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False) diff --git a/tests/conftest.py b/tests/conftest.py index 77c86dfa..097a0ab0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ from khoj.utils import fs_syncer, state from khoj.utils.config import SearchModels from khoj.utils.constants import web_directory from khoj.utils.helpers import resolve_absolute_path -from khoj.utils.rawconfig import ContentConfig, ImageSearchConfig, SearchConfig +from khoj.utils.rawconfig import ContentConfig, SearchConfig from tests.helpers import ( AiModelApiFactory, ChatModelFactory, @@ -69,12 +69,6 @@ def search_config() -> SearchConfig: model_dir.mkdir(parents=True, exist_ok=True) search_config = SearchConfig() - search_config.image = ImageSearchConfig( - encoder="sentence-transformers/clip-ViT-B-32", - model_directory=model_dir / "image/", - encoder_type=None, - ) - return search_config @@ -301,7 +295,6 @@ def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUs @pytest.mark.django_db def chat_client_builder(search_config, user, index_content=True, require_auth=False): # Initialize app state - state.config.search_type = search_config state.SearchType = configure_search_types() if index_content: @@ -349,7 +342,6 @@ def large_kb_chat_client_builder(search_config, user): import tempfile # Initialize app state - state.config.search_type = search_config state.SearchType = configure_search_types() # Create temporary directory for large number of test files @@ -470,12 +462,8 @@ def fastapi_app(): @pytest.fixture(scope="function") def client( - content_config: ContentConfig, - search_config: SearchConfig, api_user: KhojApiUser, ): - state.config.content_type = content_config - state.config.search_type = search_config state.SearchType = configure_search_types() state.embeddings_model = dict() state.embeddings_model["default"] = EmbeddingsModel() diff --git a/tests/test_client.py b/tests/test_client.py index 00507851..b7341d0e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -283,10 +283,6 @@ def test_get_api_config_types(client, sample_org_data, default_user: KhojUser): def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): # Arrange state.anonymous_mode = True - if state.config and state.config.content_type: - state.config.content_type = None - state.search_models = configure_search_types() - configure_routes(fastapi_app) client = TestClient(fastapi_app)