Drop old code to sync files on server filesystem. Clean cli, init paths

This stale code was originally used to index files on server file
system directly by server. We currently push files to sync via API.

Server-side syncing of remote content like Github and Notion is still
supported, but the old, unused code for server-side sync of files on the
server filesystem is being removed.

The new --log-file cli arg allows specifying where the khoj server should
store logs on the filesystem. It replaces the --config-file cli arg, which
was only being used as a proxy for deciding where to store the log file.

- TODO
  - Tests are broken. They were relying on the server side content
    syncing for test setup
This commit is contained in:
Debanjum
2025-07-03 15:27:06 -07:00
parent b1f2737c9a
commit d9d24dd638
21 changed files with 82 additions and 688 deletions

View File

@@ -50,13 +50,11 @@ from khoj.database.adapters import (
)
from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content, configure_search
from khoj.routers.api_content import configure_content
from khoj.routers.twilio import is_twilio_enabled
from khoj.utils import constants, state
from khoj.utils.config import SearchType
from khoj.utils.fs_syncer import collect_files
from khoj.utils.helpers import is_none_or_empty, telemetry_disabled
from khoj.utils.rawconfig import FullConfig
from khoj.utils.helpers import is_none_or_empty
logger = logging.getLogger(__name__)
@@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend):
return AuthCredentials(), UnauthenticatedUser()
def initialize_server(config: Optional[FullConfig]):
try:
configure_server(config, init=True)
except Exception as e:
logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)
raise e
def clean_connections(func):
"""
A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed
@@ -260,19 +250,7 @@ def clean_connections(func):
return func_wrapper
def configure_server(
config: FullConfig,
regenerate: bool = False,
search_type: Optional[SearchType] = None,
init=False,
user: KhojUser = None,
):
# Update Config
if config == None:
logger.info(f"Initializing with default config.")
config = FullConfig()
state.config = config
def initialize_server():
if ConversationAdapters.has_valid_ai_model_api():
ai_model_api = ConversationAdapters.get_ai_model_api()
state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url)
@@ -309,43 +287,33 @@ def configure_server(
)
state.SearchType = configure_search_types()
state.search_models = configure_search(state.search_models, state.config.search_type)
setup_default_agent(user)
setup_default_agent()
message = (
"📡 Telemetry disabled"
if telemetry_disabled(state.config.app, state.telemetry_disabled)
else "📡 Telemetry enabled"
)
message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled"
logger.info(message)
if not init:
initialize_content(user, regenerate, search_type)
except Exception as e:
logger.error(f"Failed to load some search models: {e}", exc_info=True)
def setup_default_agent(user: KhojUser):
AgentAdapters.create_default_agent(user)
def setup_default_agent():
AgentAdapters.create_default_agent()
def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None):
# Initialize Content from Config
if state.search_models:
try:
logger.info("📬 Updating content index...")
all_files = collect_files(user=user)
status = configure_content(
user,
all_files,
regenerate,
search_type,
)
if not status:
raise RuntimeError("Failed to update content index")
except Exception as e:
raise e
try:
logger.info("📬 Updating content index...")
status = configure_content(
user,
{},
regenerate,
search_type,
)
if not status:
raise RuntimeError("Failed to update content index")
except Exception as e:
raise e
def configure_routes(app):
@@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False):
def update_content_index():
for user in get_all_users():
all_files = collect_files(user=user)
success = configure_content(user, all_files)
success = configure_content(user, {})
if not success:
raise RuntimeError("Failed to update content index")
logger.info("📪 Content index updated via Scheduler")
@@ -464,7 +431,7 @@ def configure_search_types():
@schedule.repeat(schedule.every(2).minutes)
@clean_connections
def upload_telemetry():
if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry:
if state.telemetry_disabled or not state.telemetry:
return
try:

View File

@@ -788,8 +788,8 @@ class AgentAdapters:
return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first()
@staticmethod
def create_default_agent(user: KhojUser):
default_chat_model = ConversationAdapters.get_default_chat_model(user)
def create_default_agent():
default_chat_model = ConversationAdapters.get_default_chat_model(user=None)
if default_chat_model is None:
logger.info("No default conversation config found, skipping default agent creation")
return None

View File

@@ -0,0 +1,36 @@
# Generated by Django 5.1.10 on 2025-07-25 23:30
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the Local*Config models used for server-side filesystem sync.

    The commit removes server-side indexing of files on the server
    filesystem, so the per-user local content config tables (org, markdown,
    pdf, plaintext) are no longer needed. The user foreign keys are removed
    first, then the models themselves are deleted.
    """

    dependencies = [
        ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"),
    ]

    operations = [
        # Drop user FKs before deleting the models that carry them
        migrations.RemoveField(
            model_name="localorgconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localpdfconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localplaintextconfig",
            name="user",
        ),
        migrations.DeleteModel(
            name="LocalMarkdownConfig",
        ),
        migrations.DeleteModel(
            name="LocalOrgConfig",
        ),
        migrations.DeleteModel(
            name="LocalPdfConfig",
        ),
        migrations.DeleteModel(
            name="LocalPlaintextConfig",
        ),
    ]

View File

@@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel):
super().save(*args, **kwargs)
class LocalOrgConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalMarkdownConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPdfConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPlaintextConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class SearchModelConfig(DbBaseModel):
class ModelType(models.TextChoices):
TEXT = "text"

View File

@@ -138,10 +138,10 @@ def run(should_start_server=True):
initialization(not args.non_interactive)
# Create app directory, if it doesn't exist
state.config_file.parent.mkdir(parents=True, exist_ok=True)
state.log_file.parent.mkdir(parents=True, exist_ok=True)
# Set Log File
fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8")
fh = logging.FileHandler(state.log_file, encoding="utf-8")
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
@@ -194,7 +194,7 @@ def run(should_start_server=True):
# Configure Middleware
configure_middleware(app, state.ssl_config)
initialize_server(args.config)
initialize_server()
# If the server is started through gunicorn (external to the script), don't start the server
if should_start_server:
@@ -204,8 +204,7 @@ def run(should_start_server=True):
def set_state(args):
state.config_file = args.config_file
state.config = args.config
state.log_file = args.log_file
state.verbose = args.verbose
state.host = args.host
state.port = args.port

View File

@@ -20,7 +20,6 @@ magika = Magika()
class GithubToEntries(TextToEntries):
def __init__(self, config: GithubConfig):
super().__init__(config)
raw_repos = config.githubrepoconfig.all()
repos = []
for repo in raw_repos:

View File

@@ -47,7 +47,6 @@ class NotionBlockType(Enum):
class NotionToEntries(TextToEntries):
def __init__(self, config: NotionConfig):
super().__init__(config)
self.config = NotionContentConfig(
token=config.token,
)

View File

@@ -27,7 +27,6 @@ logger = logging.getLogger(__name__)
class TextToEntries(ABC):
def __init__(self, config: Any = None):
self.embeddings_model = state.embeddings_model
self.config = config
self.date_filter = DateFilter()
@abstractmethod

View File

@@ -87,22 +87,14 @@ def update(
force: Optional[bool] = False,
):
user = request.user.object
if not state.config:
error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}."
logger.warning(error_msg)
raise HTTPException(status_code=500, detail=error_msg)
try:
initialize_content(user=user, regenerate=force, search_type=t)
except Exception as e:
error_msg = f"🚨 Failed to update server via API: {e}"
error_msg = f"🚨 Failed to update server indexed content via API: {e}"
logger.error(error_msg, exc_info=True)
raise HTTPException(status_code=500, detail=error_msg)
else:
components = []
if state.search_models:
components.append("Search models")
components_msg = ", ".join(components)
logger.info(f"📪 {components_msg} updated via API")
logger.info(f"📪 Server indexed content updated via API")
update_telemetry_state(
request=request,

View File

@@ -27,16 +27,7 @@ from khoj.database.adapters import (
get_user_notion_config,
)
from khoj.database.models import Entry as DbEntry
from khoj.database.models import (
GithubConfig,
GithubRepoConfig,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
NotionConfig,
)
from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.routers.helpers import (
@@ -47,17 +38,9 @@ from khoj.routers.helpers import (
get_user_config,
update_telemetry_state,
)
from khoj.utils import constants, state
from khoj.utils.config import SearchModels
from khoj.utils.rawconfig import (
ContentConfig,
FullConfig,
GithubContentConfig,
NotionContentConfig,
SearchConfig,
)
from khoj.utils import state
from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig
from khoj.utils.state import SearchType
from khoj.utils.yaml import save_config_to_file_updated_state
logger = logging.getLogger(__name__)
@@ -192,8 +175,6 @@ async def set_content_github(
updated_config: Union[GithubContentConfig, None],
client: Optional[str] = None,
):
_initialize_config()
user = request.user.object
try:
@@ -225,8 +206,6 @@ async def set_content_notion(
updated_config: Union[NotionContentConfig, None],
client: Optional[str] = None,
):
_initialize_config()
user = request.user.object
try:
@@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None):
configured_content_types = set(EntryAdapters.get_unique_file_types(user))
configured_content_types |= {"all"}
if state.config and state.config.content_type:
for ctype in state.config.content_type.model_dump(exclude_none=True):
configured_content_types.add(ctype)
return list(configured_content_types & all_content_types)
@@ -606,28 +581,6 @@ async def indexer(
docx=index_files["docx"],
)
if state.config == None:
logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
processor=None,
)
state.config = default_full_config
default_content_config = ContentConfig(
org=None,
markdown=None,
pdf=None,
docx=None,
image=None,
github=None,
notion=None,
plaintext=None,
)
state.config.content_type = default_content_config
save_config_to_file_updated_state()
configure_search(state.search_models, state.config.search_type)
loop = asyncio.get_event_loop()
success = await loop.run_in_executor(
None,
@@ -674,14 +627,6 @@ async def indexer(
return Response(content=indexed_filenames, status_code=200)
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_models is None:
search_models = SearchModels()
return search_models
def map_config_to_object(content_source: str):
if content_source == DbEntry.EntrySource.GITHUB:
return GithubConfig
@@ -689,56 +634,3 @@ def map_config_to_object(content_source: str):
return NotionConfig
if content_source == DbEntry.EntrySource.COMPUTER:
return "Computer"
async def map_config_to_db(config: FullConfig, user: KhojUser):
if config.content_type:
if config.content_type.org:
await LocalOrgConfig.objects.filter(user=user).adelete()
await LocalOrgConfig.objects.acreate(
input_files=config.content_type.org.input_files,
input_filter=config.content_type.org.input_filter,
index_heading_entries=config.content_type.org.index_heading_entries,
user=user,
)
if config.content_type.markdown:
await LocalMarkdownConfig.objects.filter(user=user).adelete()
await LocalMarkdownConfig.objects.acreate(
input_files=config.content_type.markdown.input_files,
input_filter=config.content_type.markdown.input_filter,
index_heading_entries=config.content_type.markdown.index_heading_entries,
user=user,
)
if config.content_type.pdf:
await LocalPdfConfig.objects.filter(user=user).adelete()
await LocalPdfConfig.objects.acreate(
input_files=config.content_type.pdf.input_files,
input_filter=config.content_type.pdf.input_filter,
index_heading_entries=config.content_type.pdf.index_heading_entries,
user=user,
)
if config.content_type.plaintext:
await LocalPlaintextConfig.objects.filter(user=user).adelete()
await LocalPlaintextConfig.objects.acreate(
input_files=config.content_type.plaintext.input_files,
input_filter=config.content_type.plaintext.input_filter,
index_heading_entries=config.content_type.plaintext.index_heading_entries,
user=user,
)
if config.content_type.github:
await adapters.set_user_github_config(
user=user,
pat_token=config.content_type.github.pat_token,
repos=config.content_type.github.repos,
)
if config.content_type.notion:
await adapters.set_notion_config(
user=user,
token=config.content_type.notion.token,
)
def _initialize_config():
if state.config is None:
state.config = FullConfig()
state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"])

View File

@@ -218,7 +218,6 @@ def update_telemetry_state(
telemetry_type=telemetry_type,
api=api,
client=client,
app_config=state.config.app,
disable_telemetry_env=state.telemetry_disabled,
properties=user_state,
)
@@ -2726,7 +2725,8 @@ def configure_content(
search_type = t.value if t else None
no_documents = all([not files.get(file_type) for file_type in files])
# Check if client sent any documents of the supported types
no_client_sent_documents = all([not files.get(file_type) for file_type in files])
if files is None:
logger.warning(f"🚨 No files to process for {search_type} search.")
@@ -2800,7 +2800,8 @@ def configure_content(
success = False
try:
if no_documents:
# Run server side indexing of user Github docs if no client sent documents
if no_client_sent_documents:
github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
if (
search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value
@@ -2820,7 +2821,8 @@ def configure_content(
success = False
try:
if no_documents:
# Run server side indexing of user Notion docs if no client sent documents
if no_client_sent_documents:
# Initialize Notion Search
notion_config = NotionConfig.objects.filter(user=user).first()
if (

View File

@@ -1,26 +1,19 @@
import argparse
import logging
import os
import pathlib
from importlib.metadata import version
logger = logging.getLogger(__name__)
from khoj.utils.helpers import is_env_var_true, resolve_absolute_path
from khoj.utils.yaml import parse_config_from_file
def cli(args=None):
# Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain")
parser.add_argument(
"--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj"
)
parser.add_argument(
"--regenerate",
action="store_true",
default=False,
help="Regenerate model embeddings from source files. Default: false",
"--log-file",
default="~/.khoj/khoj.log",
type=pathlib.Path,
help="File path for server logs. Default: ~/.khoj/khoj.log",
)
parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
@@ -37,7 +30,7 @@ def cli(args=None):
"--anonymous-mode",
action="store_true",
default=False,
help="Run Khoj in anonymous mode. This does not require any login for connecting users.",
help="Run Khoj in single user mode with no login required. Useful for personal use or testing.",
)
parser.add_argument(
"--non-interactive",
@@ -57,15 +50,4 @@ def cli(args=None):
print(args.version_no)
exit(0)
# Normalize config_file path to absolute path
args.config_file = resolve_absolute_path(args.config_file)
if not args.config_file.exists():
args.config = None
else:
args = run_migrations(args)
args.config = parse_config_from_file(args.config_file)
if is_env_var_true("KHOJ_TELEMETRY_DISABLE"):
args.config.app.should_log_telemetry = False
return args

View File

@@ -1,20 +1,7 @@
# System Packages
from __future__ import annotations # to avoid quoting type hints
import logging
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Any, List, Optional, Union
import torch
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sentence_transformers import CrossEncoder
from khoj.utils.models import BaseEncoder
class SearchType(str, Enum):
@@ -27,36 +14,3 @@ class SearchType(str, Enum):
Notion = "notion"
Plaintext = "plaintext"
Docx = "docx"
class ProcessorType(str, Enum):
Conversation = "conversation"
@dataclass
class TextContent:
enabled: bool
@dataclass
class ImageContent:
image_names: List[str]
image_embeddings: torch.Tensor
image_metadata_embeddings: torch.Tensor
@dataclass
class TextSearchModel:
bi_encoder: BaseEncoder
cross_encoder: Optional[CrossEncoder] = None
top_k: Optional[int] = 15
@dataclass
class ImageSearchModel:
image_encoder: BaseEncoder
@dataclass
class SearchModels:
text_search: Optional[TextSearchModel] = None

View File

@@ -1,252 +0,0 @@
import glob
import logging
import os
from pathlib import Path
from typing import Optional
from bs4 import BeautifulSoup
from magika import Magika
from khoj.database.models import (
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.utils.config import SearchType
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
from khoj.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
magika = Magika()
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Collect files configured for indexing on the server filesystem for the given user.

    Returns a dict mapping content type ("org", "markdown", "plaintext",
    "pdf", "image", "docx") to a filename -> content map. A content type is
    only populated when it matches search_type and the user has a local
    config row for it; otherwise it maps to an empty dict.
    """
    files: dict[str, dict] = {"docx": {}, "image": {}}

    if search_type == SearchType.All or search_type == SearchType.Org:
        org_config = LocalOrgConfig.objects.filter(user=user).first()
        files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {}
    if search_type == SearchType.All or search_type == SearchType.Markdown:
        markdown_config = LocalMarkdownConfig.objects.filter(user=user).first()
        files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {}
    if search_type == SearchType.All or search_type == SearchType.Plaintext:
        plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first()
        files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {}
    if search_type == SearchType.All or search_type == SearchType.Pdf:
        pdf_config = LocalPdfConfig.objects.filter(user=user).first()
        files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {}

    # NOTE(review): "image" and "docx" were already initialized above; these
    # reassignments are redundant no-ops kept from the original code.
    files["image"] = {}
    files["docx"] = {}

    return files
def construct_config_from_db(db_config) -> TextContentConfig:
    "Convert a local content config DB row into a TextContentConfig."
    content_config = TextContentConfig(
        input_files=db_config.input_files,
        input_filter=db_config.input_filter,
        index_heading_entries=db_config.index_heading_entries,
    )
    return content_config
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Map configured plaintext files to their content.

    Files come from config.input_files and from glob patterns in
    config.input_filter. Files identified as neither text/code content nor
    carrying a known plaintext extension are skipped with a warning. HTML
    files are reduced to their visible text before being returned.
    """

    def is_plaintextfile(file: str) -> bool:
        "Check if file is a plaintext file by content or extension"
        # Identify the file's content group from its bytes via magika
        content_group = magika.identify_path(Path(file)).output.group
        # Use file extension to decide plaintext if file content is not identifiable
        # BUGFIX: a missing comma previously fused "org" "mbox" into "orgmbox",
        # so .org and .mbox files failed this extension check.
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]

    def extract_html_content(html_content: str) -> str:
        "Extract visible text content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    # Drop files that fail the plaintext check so they don't poison the index
    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if any(files_with_no_plaintext_extensions):
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions)

    logger.debug(f"Processing files: {all_target_files}")

    filename_to_content_map = {}
    for file in all_target_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                plaintext_content = f.read()
                if file.endswith(("html", "htm", "xml")):
                    plaintext_content = extract_html_content(plaintext_content)
                filename_to_content_map[file] = plaintext_content
            except Exception as e:
                # Best-effort: skip unreadable files rather than failing the whole sync
                logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_org_files(config: TextContentConfig):
    "Collect configured org-mode files into a filename -> content map."
    org_files, org_file_filters = config.input_files, config.input_filter

    # Require at least one source of org files to be configured
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(listed_file) for listed_file in org_files}
    if org_file_filters:
        for file_filter in org_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_org_files.add(matched_file)

    all_org_files = sorted(absolute_org_files | filtered_org_files)

    # Warn (but do not skip) when files lack the .org extension
    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(files_with_non_org_extensions):
        logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")

    logger.debug(f"Processing files: {all_org_files}")

    filename_to_content_map = {}
    for file in all_org_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as org. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map
def get_markdown_files(config: TextContentConfig):
    "Collect configured markdown files into a filename -> content map."
    markdown_files, markdown_file_filters = config.input_files, config.input_filter

    # Require at least one source of markdown files to be configured
    if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_markdown_files, filtered_markdown_files = set(), set()
    if markdown_files:
        absolute_markdown_files = {get_absolute_path(listed_file) for listed_file in markdown_files}
    if markdown_file_filters:
        for file_filter in markdown_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_markdown_files.add(matched_file)

    all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

    # Warn (but do not skip) when files lack a markdown extension
    files_with_non_markdown_extensions = {
        md_file for md_file in all_markdown_files if not md_file.endswith(".md") and not md_file.endswith(".markdown")
    }
    if any(files_with_non_markdown_extensions):
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    filename_to_content_map = {}
    for file in all_markdown_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as markdown. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map
def get_pdf_files(config: TextContentConfig):
    "Collect configured PDF files into a filename -> bytes content map."
    pdf_files, pdf_file_filters = config.input_files, config.input_filter

    # Require at least one source of pdf files to be configured
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(listed_file) for listed_file in pdf_files}
    if pdf_file_filters:
        for file_filter in pdf_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_pdf_files.add(matched_file)

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    # Warn (but do not skip) when files lack the .pdf extension
    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
    if any(files_with_non_pdf_extensions):
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # PDFs are binary; read raw bytes
        with open(file, "rb") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map

View File

@@ -47,7 +47,6 @@ if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer
from khoj.utils.models import BaseEncoder
from khoj.utils.rawconfig import AppConfig
logger = logging.getLogger(__name__)
@@ -267,23 +266,16 @@ def get_server_id():
return server_id
def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool:
if telemetry_disable_env is True:
return True
return not app_config or not app_config.should_log_telemetry
def log_telemetry(
telemetry_type: str,
api: str = None,
client: Optional[str] = None,
app_config: Optional[AppConfig] = None,
disable_telemetry_env: bool = False,
properties: dict = None,
):
"""Log basic app usage telemetry like client, os, api called"""
# Do not log usage telemetry, if telemetry is disabled via app config
if telemetry_disabled(app_config, disable_telemetry_env):
if disable_telemetry_env:
return []
if properties.get("server_id") is None:

View File

@@ -147,24 +147,6 @@ def initialization(interactive: bool = True):
logger.info("🗣️ Chat model configuration complete")
# Set up offline speech to text model
use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ")
if use_offline_speech2text_model == "y":
logger.info("🗣️ Setting up offline speech to text model")
# Delete any existing speech to text model options. There can only be one.
SpeechToTextModelOptions.objects.all().delete()
default_offline_speech2text_model = "base"
offline_speech2text_model = input(
f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): "
)
offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
SpeechToTextModelOptions.objects.create(
model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
)
logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}")
def _setup_chat_model_provider(
model_type: ChatModel.ModelType,
default_chat_models: list,

View File

@@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel):
conversation_id: str
class TextConfigBase(ConfigBase):
compressed_jsonl: Path
embeddings_file: Path
class TextContentConfig(ConfigBase):
input_files: Optional[List[Path]] = None
input_filter: Optional[List[str]] = None
index_heading_entries: Optional[bool] = False
class GithubRepoConfig(ConfigBase):
name: str
owner: str
@@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase):
token: str
class ContentConfig(ConfigBase):
org: Optional[TextContentConfig] = None
markdown: Optional[TextContentConfig] = None
pdf: Optional[TextContentConfig] = None
plaintext: Optional[TextContentConfig] = None
github: Optional[GithubContentConfig] = None
notion: Optional[NotionContentConfig] = None
image: Optional[TextContentConfig] = None
docx: Optional[TextContentConfig] = None
class ImageSearchConfig(ConfigBase):
encoder: str
encoder_type: Optional[str] = None
model_directory: Optional[Path] = None
class Config:
protected_namespaces = ()
class SearchConfig(ConfigBase):
image: Optional[ImageSearchConfig] = None
class OpenAIProcessorConfig(ConfigBase):
api_key: str
chat_model: Optional[str] = "gpt-4o-mini"
class ConversationProcessorConfig(ConfigBase):
openai: Optional[OpenAIProcessorConfig] = None
max_prompt_size: Optional[int] = None
tokenizer: Optional[str] = None
class ProcessorConfig(ConfigBase):
conversation: Optional[ConversationProcessorConfig] = None
class AppConfig(ConfigBase):
should_log_telemetry: bool = True
class FullConfig(ConfigBase):
content_type: Optional[ContentConfig] = None
search_type: Optional[SearchConfig] = None
processor: Optional[ProcessorConfig] = None
app: Optional[AppConfig] = AppConfig()
version: Optional[str] = None
class SearchResponse(ConfigBase):
entry: str
score: float

View File

@@ -12,18 +12,14 @@ from whisper import Whisper
from khoj.database.models import ProcessLock
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.utils import config as utils_config
from khoj.utils.config import SearchModels
from khoj.utils.helpers import LRU, get_device, is_env_var_true
from khoj.utils.rawconfig import FullConfig
# Application Global State
config = FullConfig()
search_models = SearchModels()
embeddings_model: Dict[str, EmbeddingsModel] = None
cross_encoder_model: Dict[str, CrossEncoderModel] = None
openai_client: OpenAI = None
whisper_model: Whisper = None
config_file: Path = None
log_file: Path = None
verbose: int = 0
host: str = None
port: int = None

View File

@@ -1,47 +1,8 @@
from pathlib import Path
import yaml
from khoj.utils import state
from khoj.utils.rawconfig import FullConfig
# Do not emit tags when dumping to YAML
yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment]
def save_config_to_file_updated_state():
    """Persist the in-memory config (state.config) to the configured YAML file.

    Returns the saved config for caller convenience.
    """
    with open(state.config_file, "w") as outfile:
        # Round-trip through JSON to serialize the pydantic model into plain
        # dicts before YAML-dumping it
        yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
    # Removed redundant outfile.close(): the with-block already closes the file
    return state.config
def save_config_to_file(yaml_config: dict, yaml_config_file: Path):
    "Write config to YML file"
    # Ensure the destination directory exists before writing
    yaml_config_file.parent.mkdir(parents=True, exist_ok=True)
    with yaml_config_file.open("w", encoding="utf-8") as config_file:
        yaml.safe_dump(yaml_config, config_file, allow_unicode=True)
def load_config_from_file(yaml_config_file: Path) -> dict:
    "Read config from YML file"
    with open(yaml_config_file, "r", encoding="utf-8") as config_file:
        # safe_load parses the YAML document into plain Python containers
        return yaml.safe_load(config_file)
def parse_config_from_string(yaml_config: dict) -> FullConfig:
    "Validate an already-loaded YAML config dict into a FullConfig model"
    return FullConfig.model_validate(yaml_config)
def parse_config_from_file(yaml_config_file) -> FullConfig:
    "Load the YAML config file at the given path and validate it into a FullConfig"
    return parse_config_from_string(load_config_from_file(yaml_config_file))
def yaml_dump(data):
    "Serialize data to a YAML string, preserving key order, unicode and block style"
    return yaml.dump(
        data,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    )