From d9d24dd638fa19b34ccf52f3dabda9e308b5b38f Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 3 Jul 2025 15:27:06 -0700 Subject: [PATCH] Drop old code to sync files on server filesystem. Clean cli, init paths This stale code was originally used to index files on server file system directly by server. We currently push files to sync via API. Server side syncing of remote content like Github and Notion is still supported. But old, unused code for server side sync of files on server fs is being cleaned out. New --log-file cli args allows specifying where khoj server should store logs on fs. This replaces the --config-file cli arg that was only being used as a proxy for deciding where to store the log file. - TODO - Tests are broken. They were relying on the server side content syncing for test setup --- src/khoj/configure.py | 75 ++---- src/khoj/database/adapters/__init__.py | 4 +- ...093_remove_localorgconfig_user_and_more.py | 36 +++ src/khoj/database/models/__init__.py | 28 -- src/khoj/main.py | 9 +- .../content/github/github_to_entries.py | 1 - .../content/notion/notion_to_entries.py | 1 - src/khoj/processor/content/text_to_entries.py | 1 - src/khoj/routers/api.py | 12 +- src/khoj/routers/api_content.py | 114 +------- src/khoj/routers/helpers.py | 10 +- src/khoj/utils/cli.py | 28 +- src/khoj/utils/config.py | 46 ---- src/khoj/utils/fs_syncer.py | 252 ------------------ src/khoj/utils/helpers.py | 10 +- src/khoj/utils/initialization.py | 18 -- src/khoj/utils/rawconfig.py | 62 ----- src/khoj/utils/state.py | 6 +- src/khoj/utils/yaml.py | 39 --- tests/conftest.py | 14 +- tests/test_client.py | 4 - 21 files changed, 82 insertions(+), 688 deletions(-) create mode 100644 src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py delete mode 100644 src/khoj/utils/fs_syncer.py diff --git a/src/khoj/configure.py b/src/khoj/configure.py index a72f15d5..40d1eeb5 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -50,13 +50,11 @@ from khoj.database.adapters import ( ) from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel -from khoj.routers.api_content import configure_content, configure_search +from khoj.routers.api_content import configure_content from khoj.routers.twilio import is_twilio_enabled from khoj.utils import constants, state from khoj.utils.config import SearchType -from khoj.utils.fs_syncer import collect_files -from khoj.utils.helpers import is_none_or_empty, telemetry_disabled -from khoj.utils.rawconfig import FullConfig +from khoj.utils.helpers import is_none_or_empty logger = logging.getLogger(__name__) @@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend): return AuthCredentials(), UnauthenticatedUser() -def initialize_server(config: Optional[FullConfig]): - try: - configure_server(config, init=True) - except Exception as e: - logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True) - raise e - - def clean_connections(func): """ A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed @@ -260,19 +250,7 @@ def clean_connections(func): return func_wrapper -def configure_server( - config: FullConfig, - regenerate: bool = False, - search_type: Optional[SearchType] = None, - init=False, - user: KhojUser = None, -): - # Update Config - if config == None: - logger.info(f"Initializing with default config.") - config = FullConfig() - state.config = config - +def initialize_server(): if ConversationAdapters.has_valid_ai_model_api(): ai_model_api = ConversationAdapters.get_ai_model_api() state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url) @@ -309,43 +287,33 @@ def configure_server( ) state.SearchType = configure_search_types() - state.search_models = configure_search(state.search_models, state.config.search_type) - setup_default_agent(user) + setup_default_agent() - message = ( - "📡 Telemetry disabled" - if telemetry_disabled(state.config.app, state.telemetry_disabled) - else "📡 Telemetry enabled" - ) + message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled" logger.info(message) - if not init: - initialize_content(user, regenerate, search_type) - except Exception as e: logger.error(f"Failed to load some search models: {e}", exc_info=True) -def setup_default_agent(user: KhojUser): - AgentAdapters.create_default_agent(user) +def setup_default_agent(): + AgentAdapters.create_default_agent() def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None): # Initialize Content from Config - if state.search_models: - try: - logger.info("📬 Updating content index...") - all_files = collect_files(user=user) - status = configure_content( - user, - all_files, - regenerate, - search_type, - ) - if not status: - raise RuntimeError("Failed to update content index") - except Exception as e: - raise e + try: + logger.info("📬 Updating content index...") + status = configure_content( + user, + {}, + regenerate, + search_type, + ) + if not status: + raise RuntimeError("Failed to update content index") + except Exception as e: + raise e def configure_routes(app): @@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False): def update_content_index(): for user in get_all_users(): - all_files = collect_files(user=user) - success = configure_content(user, all_files) + success = configure_content(user, {}) if not success: raise RuntimeError("Failed to update content index") logger.info("📪 Content index updated via Scheduler") @@ -464,7 +431,7 @@ def configure_search_types(): @schedule.repeat(schedule.every(2).minutes) @clean_connections def upload_telemetry(): - if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry: + if state.telemetry_disabled or not state.telemetry: return try: diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 76d7578b..6d92b8e9 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -788,8 +788,8 @@ class AgentAdapters: return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first() @staticmethod - def create_default_agent(user: KhojUser): - default_chat_model = ConversationAdapters.get_default_chat_model(user) + def create_default_agent(): + default_chat_model = ConversationAdapters.get_default_chat_model(user=None) if default_chat_model is None: logger.info("No default conversation config found, skipping default agent creation") return None diff --git a/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py b/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py new file mode 100644 index 00000000..ad3409cf --- /dev/null +++ b/src/khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 5.1.10 on 2025-07-25 23:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"), + ] + + operations = [ + migrations.RemoveField( + model_name="localorgconfig", + name="user", + ), + migrations.RemoveField( + model_name="localpdfconfig", + name="user", + ), + migrations.RemoveField( + model_name="localplaintextconfig", + name="user", + ), + migrations.DeleteModel( + name="LocalMarkdownConfig", + ), + migrations.DeleteModel( + name="LocalOrgConfig", + ), + migrations.DeleteModel( + name="LocalPdfConfig", + ), + migrations.DeleteModel( + name="LocalPlaintextConfig", + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 03b43376..1ed58572 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel): super().save(*args, **kwargs) -class LocalOrgConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalMarkdownConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalPdfConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - -class LocalPlaintextConfig(DbBaseModel): - input_files = models.JSONField(default=list, null=True) - input_filter = models.JSONField(default=list, null=True) - index_heading_entries = models.BooleanField(default=False) - user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) - - class SearchModelConfig(DbBaseModel): class ModelType(models.TextChoices): TEXT = "text" diff --git a/src/khoj/main.py b/src/khoj/main.py index d5918a14..f42ae135 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -138,10 +138,10 @@ def run(should_start_server=True): initialization(not args.non_interactive) # Create app directory, if it doesn't exist - state.config_file.parent.mkdir(parents=True, exist_ok=True) + state.log_file.parent.mkdir(parents=True, exist_ok=True) # Set Log File - fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8") + fh = logging.FileHandler(state.log_file, encoding="utf-8") fh.setLevel(logging.DEBUG) logger.addHandler(fh) @@ -194,7 +194,7 @@ def run(should_start_server=True): # Configure Middleware configure_middleware(app, state.ssl_config) - initialize_server(args.config) + initialize_server() # If the server is started through gunicorn (external to the script), don't start the server if should_start_server: @@ -204,8 +204,7 @@ def run(should_start_server=True): def set_state(args): - state.config_file = args.config_file - state.config = args.config + state.log_file = args.log_file state.verbose = args.verbose state.host = args.host state.port = args.port diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index 31f99f84..63ed50c6 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -20,7 +20,6 @@ magika = Magika() class GithubToEntries(TextToEntries): def __init__(self, config: GithubConfig): - super().__init__(config) raw_repos = config.githubrepoconfig.all() repos = [] for repo in raw_repos: diff --git a/src/khoj/processor/content/notion/notion_to_entries.py b/src/khoj/processor/content/notion/notion_to_entries.py index 1e1ab4d3..23b96f63 100644 --- a/src/khoj/processor/content/notion/notion_to_entries.py +++ b/src/khoj/processor/content/notion/notion_to_entries.py @@ -47,7 +47,6 @@ class NotionBlockType(Enum): class NotionToEntries(TextToEntries): def __init__(self, config: NotionConfig): - super().__init__(config) self.config = NotionContentConfig( token=config.token, ) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 0ceda11d..0369d273 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -27,7 +27,6 @@ logger = logging.getLogger(__name__) class TextToEntries(ABC): def __init__(self, config: Any = None): self.embeddings_model = state.embeddings_model - self.config = config self.date_filter = DateFilter() @abstractmethod diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index b1562e64..44c2f2b7 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -87,22 +87,14 @@ def update( force: Optional[bool] = False, ): user = request.user.object - if not state.config: - error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}." - logger.warning(error_msg) - raise HTTPException(status_code=500, detail=error_msg) try: initialize_content(user=user, regenerate=force, search_type=t) except Exception as e: - error_msg = f"🚨 Failed to update server via API: {e}" + error_msg = f"🚨 Failed to update server indexed content via API: {e}" logger.error(error_msg, exc_info=True) raise HTTPException(status_code=500, detail=error_msg) else: - components = [] - if state.search_models: - components.append("Search models") - components_msg = ", ".join(components) - logger.info(f"📪 {components_msg} updated via API") + logger.info(f"📪 Server indexed content updated via API") update_telemetry_state( request=request, diff --git a/src/khoj/routers/api_content.py b/src/khoj/routers/api_content.py index 4f9cc407..c2732ec8 100644 --- a/src/khoj/routers/api_content.py +++ b/src/khoj/routers/api_content.py @@ -27,16 +27,7 @@ from khoj.database.adapters import ( get_user_notion_config, ) from khoj.database.models import Entry as DbEntry -from khoj.database.models import ( - GithubConfig, - GithubRepoConfig, - KhojUser, - LocalMarkdownConfig, - LocalOrgConfig, - LocalPdfConfig, - LocalPlaintextConfig, - NotionConfig, -) +from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig from khoj.processor.content.docx.docx_to_entries import DocxToEntries from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries from khoj.routers.helpers import ( @@ -47,17 +38,9 @@ from khoj.routers.helpers import ( get_user_config, update_telemetry_state, ) -from khoj.utils import constants, state -from khoj.utils.config import SearchModels -from khoj.utils.rawconfig import ( - ContentConfig, - FullConfig, - GithubContentConfig, - NotionContentConfig, - SearchConfig, -) +from khoj.utils import state +from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig from khoj.utils.state import SearchType -from khoj.utils.yaml import save_config_to_file_updated_state logger = logging.getLogger(__name__) @@ -192,8 +175,6 @@ async def set_content_github( updated_config: Union[GithubContentConfig, None], client: Optional[str] = None, ): - _initialize_config() - user = request.user.object try: @@ -225,8 +206,6 @@ async def set_content_notion( updated_config: Union[NotionContentConfig, None], client: Optional[str] = None, ): - _initialize_config() - user = request.user.object try: @@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None): configured_content_types = set(EntryAdapters.get_unique_file_types(user)) configured_content_types |= {"all"} - if state.config and state.config.content_type: - for ctype in state.config.content_type.model_dump(exclude_none=True): - configured_content_types.add(ctype) - return list(configured_content_types & all_content_types) @@ -606,28 +581,6 @@ async def indexer( docx=index_files["docx"], ) - if state.config == None: - logger.info("📬 Initializing content index on first run.") - default_full_config = FullConfig( - content_type=None, - search_type=SearchConfig.model_validate(constants.default_config["search-type"]), - processor=None, - ) - state.config = default_full_config - default_content_config = ContentConfig( - org=None, - markdown=None, - pdf=None, - docx=None, - image=None, - github=None, - notion=None, - plaintext=None, - ) - state.config.content_type = default_content_config - save_config_to_file_updated_state() - configure_search(state.search_models, state.config.search_type) - loop = asyncio.get_event_loop() success = await loop.run_in_executor( None, @@ -674,14 +627,6 @@ async def indexer( return Response(content=indexed_filenames, status_code=200) -def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]: - # Run Validation Checks - if search_models is None: - search_models = SearchModels() - - return search_models - - def map_config_to_object(content_source: str): if content_source == DbEntry.EntrySource.GITHUB: return GithubConfig @@ -689,56 +634,3 @@ def map_config_to_object(content_source: str): return NotionConfig if content_source == DbEntry.EntrySource.COMPUTER: return "Computer" - - -async def map_config_to_db(config: FullConfig, user: KhojUser): - if config.content_type: - if config.content_type.org: - await LocalOrgConfig.objects.filter(user=user).adelete() - await LocalOrgConfig.objects.acreate( - input_files=config.content_type.org.input_files, - input_filter=config.content_type.org.input_filter, - index_heading_entries=config.content_type.org.index_heading_entries, - user=user, - ) - if config.content_type.markdown: - await LocalMarkdownConfig.objects.filter(user=user).adelete() - await LocalMarkdownConfig.objects.acreate( - input_files=config.content_type.markdown.input_files, - input_filter=config.content_type.markdown.input_filter, - index_heading_entries=config.content_type.markdown.index_heading_entries, - user=user, - ) - if config.content_type.pdf: - await LocalPdfConfig.objects.filter(user=user).adelete() - await LocalPdfConfig.objects.acreate( - input_files=config.content_type.pdf.input_files, - input_filter=config.content_type.pdf.input_filter, - index_heading_entries=config.content_type.pdf.index_heading_entries, - user=user, - ) - if config.content_type.plaintext: - await LocalPlaintextConfig.objects.filter(user=user).adelete() - await LocalPlaintextConfig.objects.acreate( - input_files=config.content_type.plaintext.input_files, - input_filter=config.content_type.plaintext.input_filter, - index_heading_entries=config.content_type.plaintext.index_heading_entries, - user=user, - ) - if config.content_type.github: - await adapters.set_user_github_config( - user=user, - pat_token=config.content_type.github.pat_token, - repos=config.content_type.github.repos, - ) - if config.content_type.notion: - await adapters.set_notion_config( - user=user, - token=config.content_type.notion.token, - ) - - -def _initialize_config(): - if state.config is None: - state.config = FullConfig() - state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"]) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 7833d3d8..8dcda86b 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -218,7 +218,6 @@ def update_telemetry_state( telemetry_type=telemetry_type, api=api, client=client, - app_config=state.config.app, disable_telemetry_env=state.telemetry_disabled, properties=user_state, ) @@ -2726,7 +2725,8 @@ def configure_content( search_type = t.value if t else None - no_documents = all([not files.get(file_type) for file_type in files]) + # Check if client sent any documents of the supported types + no_client_sent_documents = all([not files.get(file_type) for file_type in files]) if files is None: logger.warning(f"🚨 No files to process for {search_type} search.") @@ -2800,7 +2800,8 @@ def configure_content( success = False try: - if no_documents: + # Run server side indexing of user Github docs if no client sent documents + if no_client_sent_documents: github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() if ( search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value @@ -2820,7 +2821,8 @@ def configure_content( success = False try: - if no_documents: + # Run server side indexing of user Notion docs if no client sent documents + if no_client_sent_documents: # Initialize Notion Search notion_config = NotionConfig.objects.filter(user=user).first() if ( diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 66cdda74..a016f9a4 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -1,26 +1,19 @@ import argparse import logging -import os import pathlib from importlib.metadata import version logger = logging.getLogger(__name__) -from khoj.utils.helpers import is_env_var_true, resolve_absolute_path -from khoj.utils.yaml import parse_config_from_file - def cli(args=None): # Setup Argument Parser for the Commandline Interface parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain") parser.add_argument( - "--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" - ) - parser.add_argument( - "--regenerate", - action="store_true", - default=False, - help="Regenerate model embeddings from source files. Default: false", + "--log-file", + default="~/.khoj/khoj.log", + type=pathlib.Path, + help="File path for server logs. Default: ~/.khoj/khoj.log", ) parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0") parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1") @@ -37,7 +30,7 @@ def cli(args=None): "--anonymous-mode", action="store_true", default=False, - help="Run Khoj in anonymous mode. This does not require any login for connecting users.", + help="Run Khoj in single user mode with no login required. Useful for personal use or testing.", ) parser.add_argument( "--non-interactive", @@ -57,15 +50,4 @@ def cli(args=None): print(args.version_no) exit(0) - # Normalize config_file path to absolute path - args.config_file = resolve_absolute_path(args.config_file) - - if not args.config_file.exists(): - args.config = None - else: - args = run_migrations(args) - args.config = parse_config_from_file(args.config_file) - if is_env_var_true("KHOJ_TELEMETRY_DISABLE"): - args.config.app.should_log_telemetry = False - return args diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index d1b6f20a..c9cd1c43 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -1,20 +1,7 @@ # System Packages from __future__ import annotations # to avoid quoting type hints -import logging -from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, List, Optional, Union - -import torch - -logger = logging.getLogger(__name__) - - -if TYPE_CHECKING: - from sentence_transformers import CrossEncoder - - from khoj.utils.models import BaseEncoder class SearchType(str, Enum): @@ -27,36 +14,3 @@ class SearchType(str, Enum): Notion = "notion" Plaintext = "plaintext" Docx = "docx" - - -class ProcessorType(str, Enum): - Conversation = "conversation" - - -@dataclass -class TextContent: - enabled: bool - - -@dataclass -class ImageContent: - image_names: List[str] - image_embeddings: torch.Tensor - image_metadata_embeddings: torch.Tensor - - -@dataclass -class TextSearchModel: - bi_encoder: BaseEncoder - cross_encoder: Optional[CrossEncoder] = None - top_k: Optional[int] = 15 - - -@dataclass -class ImageSearchModel: - image_encoder: BaseEncoder - - -@dataclass -class SearchModels: - text_search: Optional[TextSearchModel] = None diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py deleted file mode 100644 index 67e91bc9..00000000 --- a/src/khoj/utils/fs_syncer.py +++ /dev/null @@ -1,252 +0,0 @@ -import glob -import logging -import os -from pathlib import Path -from typing import Optional - -from bs4 import BeautifulSoup -from magika import Magika - -from khoj.database.models import ( - KhojUser, - LocalMarkdownConfig, - LocalOrgConfig, - LocalPdfConfig, - LocalPlaintextConfig, -) -from khoj.utils.config import SearchType -from khoj.utils.helpers import get_absolute_path, is_none_or_empty -from khoj.utils.rawconfig import TextContentConfig - -logger = logging.getLogger(__name__) -magika = Magika() - - -def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict: - files: dict[str, dict] = {"docx": {}, "image": {}} - - if search_type == SearchType.All or search_type == SearchType.Org: - org_config = LocalOrgConfig.objects.filter(user=user).first() - files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {} - if search_type == SearchType.All or search_type == SearchType.Markdown: - markdown_config = LocalMarkdownConfig.objects.filter(user=user).first() - files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {} - if search_type == SearchType.All or search_type == SearchType.Plaintext: - plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first() - files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {} - if search_type == SearchType.All or search_type == SearchType.Pdf: - pdf_config = LocalPdfConfig.objects.filter(user=user).first() - files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {} - files["image"] = {} - files["docx"] = {} - return files - - -def construct_config_from_db(db_config) -> TextContentConfig: - return TextContentConfig( - input_files=db_config.input_files, - input_filter=db_config.input_filter, - index_heading_entries=db_config.index_heading_entries, - ) - - -def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: - def is_plaintextfile(file: str): - "Check if file is plaintext file" - # Check if file path exists - content_group = magika.identify_path(Path(file)).output.group - # Use file extension to decide plaintext if file content is not identifiable - valid_text_file_extensions = ("txt", "md", "markdown", "org" "mbox", "rst", "html", "htm", "xml") - return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"] - - def extract_html_content(html_content: str): - "Extract content from HTML" - soup = BeautifulSoup(html_content, "html.parser") - return soup.get_text(strip=True, separator="\n") - - # Extract required fields from config - input_files, input_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(input_files) and is_none_or_empty(input_filters): - logger.debug("At least one of input-files or input-file-filter is required to be specified") - return {} - - # Get all plain text files to process - absolute_plaintext_files, filtered_plaintext_files = set(), set() - if input_files: - absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files} - if input_filters: - filtered_plaintext_files = { - filtered_file - for plaintext_file_filter in input_filters - for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) - - files_with_no_plaintext_extensions = { - target_files for target_files in all_target_files if not is_plaintextfile(target_files) - } - if any(files_with_no_plaintext_extensions): - logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}") - all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions) - - logger.debug(f"Processing files: {all_target_files}") - - filename_to_content_map = {} - for file in all_target_files: - with open(file, "r", encoding="utf8") as f: - try: - plaintext_content = f.read() - if file.endswith(("html", "htm", "xml")): - plaintext_content = extract_html_content(plaintext_content) - filename_to_content_map[file] = plaintext_content - except Exception as e: - logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_org_files(config: TextContentConfig): - # Extract required fields from config - org_files, org_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters): - logger.debug("At least one of org-files or org-file-filter is required to be specified") - return {} - - # Get Org files to process - absolute_org_files, filtered_org_files = set(), set() - if org_files: - absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} - if org_file_filters: - filtered_org_files = { - filtered_file - for org_file_filter in org_file_filters - for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_org_files = sorted(absolute_org_files | filtered_org_files) - - files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")} - if any(files_with_non_org_extensions): - logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}") - - logger.debug(f"Processing files: {all_org_files}") - - filename_to_content_map = {} - for file in all_org_files: - with open(file, "r", encoding="utf8") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as org. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_markdown_files(config: TextContentConfig): - # Extract required fields from config - markdown_files, markdown_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters): - logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") - return {} - - # Get markdown files to process - absolute_markdown_files, filtered_markdown_files = set(), set() - if markdown_files: - absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} - - if markdown_file_filters: - filtered_markdown_files = { - filtered_file - for markdown_file_filter in markdown_file_filters - for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) - - files_with_non_markdown_extensions = { - md_file for md_file in all_markdown_files if not md_file.endswith(".md") and not md_file.endswith(".markdown") - } - - if any(files_with_non_markdown_extensions): - logger.warning( - f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}" - ) - - logger.debug(f"Processing files: {all_markdown_files}") - - filename_to_content_map = {} - for file in all_markdown_files: - with open(file, "r", encoding="utf8") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as markdown. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map - - -def get_pdf_files(config: TextContentConfig): - # Extract required fields from config - pdf_files, pdf_file_filters = ( - config.input_files, - config.input_filter, - ) - - # Input Validation - if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters): - logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified") - return {} - - # Get PDF files to process - absolute_pdf_files, filtered_pdf_files = set(), set() - if pdf_files: - absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} - if pdf_file_filters: - filtered_pdf_files = { - filtered_file - for pdf_file_filter in pdf_file_filters - for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) - if os.path.isfile(filtered_file) - } - - all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) - - files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")} - - if any(files_with_non_pdf_extensions): - logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}") - - logger.debug(f"Processing files: {all_pdf_files}") - - filename_to_content_map = {} - for file in all_pdf_files: - with open(file, "rb") as f: - try: - filename_to_content_map[file] = f.read() - except Exception as e: - logger.warning(f"Unable to read file: {file} as PDF. Skipping file.") - logger.warning(e, exc_info=True) - - return filename_to_content_map diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 508901de..523ec007 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -47,7 +47,6 @@ if TYPE_CHECKING: from sentence_transformers import CrossEncoder, SentenceTransformer from khoj.utils.models import BaseEncoder - from khoj.utils.rawconfig import AppConfig logger = logging.getLogger(__name__) @@ -267,23 +266,16 @@ def get_server_id(): return server_id -def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool: - if telemetry_disable_env is True: - return True - return not app_config or not app_config.should_log_telemetry - - def log_telemetry( telemetry_type: str, api: str = None, client: Optional[str] = None, - app_config: Optional[AppConfig] = None, disable_telemetry_env: bool = False, properties: dict = None, ): """Log basic app usage telemetry like client, os, api called""" # Do not log usage telemetry, if telemetry is disabled via app config - if telemetry_disabled(app_config, disable_telemetry_env): + if disable_telemetry_env: return [] if properties.get("server_id") is None: diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py index 336b228d..8023b3ed 100644 --- a/src/khoj/utils/initialization.py +++ b/src/khoj/utils/initialization.py @@ -147,24 +147,6 @@ def initialization(interactive: bool = True): logger.info("🗣️ Chat model configuration complete") - # Set up offline speech to text model - use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ") - if use_offline_speech2text_model == "y": - logger.info("🗣️ Setting up offline speech to text model") - # Delete any existing speech to text model options. There can only be one. - SpeechToTextModelOptions.objects.all().delete() - - default_offline_speech2text_model = "base" - offline_speech2text_model = input( - f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): " - ) - offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model - SpeechToTextModelOptions.objects.create( - model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE - ) - - logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}") - def _setup_chat_model_provider( model_type: ChatModel.ModelType, default_chat_models: list, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 0148511a..5377577b 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel): conversation_id: str -class TextConfigBase(ConfigBase): - compressed_jsonl: Path - embeddings_file: Path - - -class TextContentConfig(ConfigBase): - input_files: Optional[List[Path]] = None - input_filter: Optional[List[str]] = None - index_heading_entries: Optional[bool] = False - - class GithubRepoConfig(ConfigBase): name: str owner: str @@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase): token: str -class ContentConfig(ConfigBase): - org: Optional[TextContentConfig] = None - markdown: Optional[TextContentConfig] = None - pdf: Optional[TextContentConfig] = None - plaintext: Optional[TextContentConfig] = None - github: Optional[GithubContentConfig] = None - notion: Optional[NotionContentConfig] = None - image: Optional[TextContentConfig] = None - docx: Optional[TextContentConfig] = None - - -class ImageSearchConfig(ConfigBase): - encoder: str - encoder_type: Optional[str] = None - model_directory: Optional[Path] = None - - class Config: - protected_namespaces = () - - -class SearchConfig(ConfigBase): - image: Optional[ImageSearchConfig] = None - - -class OpenAIProcessorConfig(ConfigBase): - api_key: str - chat_model: Optional[str] = "gpt-4o-mini" - - -class ConversationProcessorConfig(ConfigBase): - openai: Optional[OpenAIProcessorConfig] = None - max_prompt_size: Optional[int] = None - tokenizer: Optional[str] = None - - -class ProcessorConfig(ConfigBase): - conversation: Optional[ConversationProcessorConfig] = None - - -class AppConfig(ConfigBase): - should_log_telemetry: bool = True - - -class FullConfig(ConfigBase): - content_type: Optional[ContentConfig] = None - search_type: Optional[SearchConfig] = None - processor: Optional[ProcessorConfig] = None - app: Optional[AppConfig] = AppConfig() - version: Optional[str] = None - - class SearchResponse(ConfigBase): entry: str score: float diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index 6acd3d65..3b65a85b 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -12,18 +12,14 @@ from whisper import Whisper from khoj.database.models import ProcessLock from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.utils import config as utils_config -from khoj.utils.config import SearchModels from khoj.utils.helpers import LRU, get_device, is_env_var_true -from khoj.utils.rawconfig import FullConfig # Application Global State -config = FullConfig() -search_models = SearchModels() embeddings_model: Dict[str, EmbeddingsModel] = None cross_encoder_model: Dict[str, CrossEncoderModel] = None openai_client: OpenAI = None whisper_model: Whisper = None -config_file: Path = None +log_file: Path = None verbose: int = 0 host: str = None port: int = None diff --git a/src/khoj/utils/yaml.py b/src/khoj/utils/yaml.py index f658e1eb..43b139e5 100644 --- a/src/khoj/utils/yaml.py +++ b/src/khoj/utils/yaml.py @@ -1,47 +1,8 @@ -from pathlib import Path - import yaml -from khoj.utils import state -from khoj.utils.rawconfig import FullConfig - # Do not emit tags when dumping to YAML yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment] -def save_config_to_file_updated_state(): - with open(state.config_file, "w") as outfile: - yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile) - outfile.close() - return state.config - - -def save_config_to_file(yaml_config: dict, yaml_config_file: Path): - "Write config to YML file" - # Create output directory, if it doesn't exist - yaml_config_file.parent.mkdir(parents=True, exist_ok=True) - - with open(yaml_config_file, "w", encoding="utf-8") as config_file: - yaml.safe_dump(yaml_config, config_file, allow_unicode=True) - - -def load_config_from_file(yaml_config_file: Path) -> dict: - "Read config from YML file" - config_from_file = None - with open(yaml_config_file, "r", encoding="utf-8") as config_file: - config_from_file = yaml.safe_load(config_file) - return config_from_file - - -def parse_config_from_string(yaml_config: dict) -> FullConfig: - "Parse and validate config in YML string" - return FullConfig.model_validate(yaml_config) - - -def parse_config_from_file(yaml_config_file): - "Parse and validate config in YML file" - return parse_config_from_string(load_config_from_file(yaml_config_file)) - - def yaml_dump(data): return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False) diff --git a/tests/conftest.py b/tests/conftest.py index 77c86dfa..097a0ab0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ from khoj.utils import fs_syncer, state from khoj.utils.config import SearchModels from khoj.utils.constants import web_directory from khoj.utils.helpers import resolve_absolute_path -from khoj.utils.rawconfig import ContentConfig, ImageSearchConfig, SearchConfig +from khoj.utils.rawconfig import ContentConfig, SearchConfig from tests.helpers import ( AiModelApiFactory, ChatModelFactory, @@ -69,12 +69,6 @@ def search_config() -> SearchConfig: model_dir.mkdir(parents=True, exist_ok=True) search_config = SearchConfig() - search_config.image = ImageSearchConfig( - encoder="sentence-transformers/clip-ViT-B-32", - model_directory=model_dir / "image/", - encoder_type=None, - ) - return search_config @@ -301,7 +295,6 @@ def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUs @pytest.mark.django_db def chat_client_builder(search_config, user, index_content=True, require_auth=False): # Initialize app state - state.config.search_type = search_config state.SearchType = configure_search_types() if index_content: @@ -349,7 +342,6 @@ def large_kb_chat_client_builder(search_config, user): import tempfile # Initialize app state - state.config.search_type = search_config state.SearchType = configure_search_types() # Create temporary directory for large number of test files @@ -470,12 +462,8 @@ def fastapi_app(): @pytest.fixture(scope="function") def client( - content_config: ContentConfig, - search_config: SearchConfig, api_user: KhojApiUser, ): - state.config.content_type = content_config - state.config.search_type = search_config state.SearchType = configure_search_types() state.embeddings_model = dict() state.embeddings_model["default"] = EmbeddingsModel() diff --git a/tests/test_client.py b/tests/test_client.py index 00507851..b7341d0e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -283,10 +283,6 @@ def test_get_api_config_types(client, sample_org_data, default_user: KhojUser): def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): # Arrange state.anonymous_mode = True - if state.config and state.config.content_type: - state.config.content_type = None - state.search_models = configure_search_types() - configure_routes(fastapi_app) client = TestClient(fastapi_app)