Drop old code to sync files on server filesystem. Clean cli, init paths

This stale code was originally used by the server to directly index files on the
server filesystem. Files to sync are now pushed to the server via the API.

Server-side syncing of remote content like Github and Notion is still
supported, but the old, unused code for server-side sync of files on the
server filesystem is being cleaned out.

The new --log-file cli arg allows specifying where the khoj server should
store its log file on the filesystem. It replaces the --config-file cli arg,
which was only being used as a proxy for deciding where to store the log file.

- TODO
  - Tests are broken. They were relying on the server side content
    syncing for test setup
This commit is contained in:
Debanjum
2025-07-03 15:27:06 -07:00
parent b1f2737c9a
commit d9d24dd638
21 changed files with 82 additions and 688 deletions

View File

@@ -50,13 +50,11 @@ from khoj.database.adapters import (
) )
from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content, configure_search from khoj.routers.api_content import configure_content
from khoj.routers.twilio import is_twilio_enabled from khoj.routers.twilio import is_twilio_enabled
from khoj.utils import constants, state from khoj.utils import constants, state
from khoj.utils.config import SearchType from khoj.utils.config import SearchType
from khoj.utils.fs_syncer import collect_files from khoj.utils.helpers import is_none_or_empty
from khoj.utils.helpers import is_none_or_empty, telemetry_disabled
from khoj.utils.rawconfig import FullConfig
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend):
return AuthCredentials(), UnauthenticatedUser() return AuthCredentials(), UnauthenticatedUser()
def initialize_server(config: Optional[FullConfig]):
    """Configure the khoj server at app startup.

    Delegates to configure_server with init=True; logs and re-raises any
    failure so that application startup aborts instead of running
    half-configured.
    """
    try:
        configure_server(config, init=True)
    except Exception as e:
        logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)
        raise e
def clean_connections(func): def clean_connections(func):
""" """
A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed
@@ -260,19 +250,7 @@ def clean_connections(func):
return func_wrapper return func_wrapper
def configure_server( def initialize_server():
config: FullConfig,
regenerate: bool = False,
search_type: Optional[SearchType] = None,
init=False,
user: KhojUser = None,
):
# Update Config
if config == None:
logger.info(f"Initializing with default config.")
config = FullConfig()
state.config = config
if ConversationAdapters.has_valid_ai_model_api(): if ConversationAdapters.has_valid_ai_model_api():
ai_model_api = ConversationAdapters.get_ai_model_api() ai_model_api = ConversationAdapters.get_ai_model_api()
state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url) state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url)
@@ -309,36 +287,26 @@ def configure_server(
) )
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
state.search_models = configure_search(state.search_models, state.config.search_type) setup_default_agent()
setup_default_agent(user)
message = ( message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled"
"📡 Telemetry disabled"
if telemetry_disabled(state.config.app, state.telemetry_disabled)
else "📡 Telemetry enabled"
)
logger.info(message) logger.info(message)
if not init:
initialize_content(user, regenerate, search_type)
except Exception as e: except Exception as e:
logger.error(f"Failed to load some search models: {e}", exc_info=True) logger.error(f"Failed to load some search models: {e}", exc_info=True)
def setup_default_agent(user: KhojUser): def setup_default_agent():
AgentAdapters.create_default_agent(user) AgentAdapters.create_default_agent()
def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None): def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None):
# Initialize Content from Config # Initialize Content from Config
if state.search_models:
try: try:
logger.info("📬 Updating content index...") logger.info("📬 Updating content index...")
all_files = collect_files(user=user)
status = configure_content( status = configure_content(
user, user,
all_files, {},
regenerate, regenerate,
search_type, search_type,
) )
@@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False):
def update_content_index(): def update_content_index():
for user in get_all_users(): for user in get_all_users():
all_files = collect_files(user=user) success = configure_content(user, {})
success = configure_content(user, all_files)
if not success: if not success:
raise RuntimeError("Failed to update content index") raise RuntimeError("Failed to update content index")
logger.info("📪 Content index updated via Scheduler") logger.info("📪 Content index updated via Scheduler")
@@ -464,7 +431,7 @@ def configure_search_types():
@schedule.repeat(schedule.every(2).minutes) @schedule.repeat(schedule.every(2).minutes)
@clean_connections @clean_connections
def upload_telemetry(): def upload_telemetry():
if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry: if state.telemetry_disabled or not state.telemetry:
return return
try: try:

View File

@@ -788,8 +788,8 @@ class AgentAdapters:
return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first() return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first()
@staticmethod @staticmethod
def create_default_agent(user: KhojUser): def create_default_agent():
default_chat_model = ConversationAdapters.get_default_chat_model(user) default_chat_model = ConversationAdapters.get_default_chat_model(user=None)
if default_chat_model is None: if default_chat_model is None:
logger.info("No default conversation config found, skipping default agent creation") logger.info("No default conversation config found, skipping default agent creation")
return None return None

View File

@@ -0,0 +1,36 @@
# Generated by Django 5.1.10 on 2025-07-25 23:30
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the Local*Config models used by the removed server-side file sync."""

    dependencies = [
        ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"),
    ]

    operations = [
        # Drop the user foreign keys first so the models can be deleted cleanly.
        migrations.RemoveField(
            model_name="localorgconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localpdfconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localplaintextconfig",
            name="user",
        ),
        # NOTE(review): localmarkdownconfig has no RemoveField above —
        # presumably its fields are dropped by DeleteModel; confirm autogen output.
        migrations.DeleteModel(
            name="LocalMarkdownConfig",
        ),
        migrations.DeleteModel(
            name="LocalOrgConfig",
        ),
        migrations.DeleteModel(
            name="LocalPdfConfig",
        ),
        migrations.DeleteModel(
            name="LocalPlaintextConfig",
        ),
    ]

View File

@@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel):
super().save(*args, **kwargs) super().save(*args, **kwargs)
class LocalOrgConfig(DbBaseModel):
    """Per-user selection of org-mode files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalMarkdownConfig(DbBaseModel):
    """Per-user selection of markdown files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPdfConfig(DbBaseModel):
    """Per-user selection of pdf files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPlaintextConfig(DbBaseModel):
    """Per-user selection of plaintext files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class SearchModelConfig(DbBaseModel): class SearchModelConfig(DbBaseModel):
class ModelType(models.TextChoices): class ModelType(models.TextChoices):
TEXT = "text" TEXT = "text"

View File

@@ -138,10 +138,10 @@ def run(should_start_server=True):
initialization(not args.non_interactive) initialization(not args.non_interactive)
# Create app directory, if it doesn't exist # Create app directory, if it doesn't exist
state.config_file.parent.mkdir(parents=True, exist_ok=True) state.log_file.parent.mkdir(parents=True, exist_ok=True)
# Set Log File # Set Log File
fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8") fh = logging.FileHandler(state.log_file, encoding="utf-8")
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)
logger.addHandler(fh) logger.addHandler(fh)
@@ -194,7 +194,7 @@ def run(should_start_server=True):
# Configure Middleware # Configure Middleware
configure_middleware(app, state.ssl_config) configure_middleware(app, state.ssl_config)
initialize_server(args.config) initialize_server()
# If the server is started through gunicorn (external to the script), don't start the server # If the server is started through gunicorn (external to the script), don't start the server
if should_start_server: if should_start_server:
@@ -204,8 +204,7 @@ def run(should_start_server=True):
def set_state(args): def set_state(args):
state.config_file = args.config_file state.log_file = args.log_file
state.config = args.config
state.verbose = args.verbose state.verbose = args.verbose
state.host = args.host state.host = args.host
state.port = args.port state.port = args.port

View File

@@ -20,7 +20,6 @@ magika = Magika()
class GithubToEntries(TextToEntries): class GithubToEntries(TextToEntries):
def __init__(self, config: GithubConfig): def __init__(self, config: GithubConfig):
super().__init__(config)
raw_repos = config.githubrepoconfig.all() raw_repos = config.githubrepoconfig.all()
repos = [] repos = []
for repo in raw_repos: for repo in raw_repos:

View File

@@ -47,7 +47,6 @@ class NotionBlockType(Enum):
class NotionToEntries(TextToEntries): class NotionToEntries(TextToEntries):
def __init__(self, config: NotionConfig): def __init__(self, config: NotionConfig):
super().__init__(config)
self.config = NotionContentConfig( self.config = NotionContentConfig(
token=config.token, token=config.token,
) )

View File

@@ -27,7 +27,6 @@ logger = logging.getLogger(__name__)
class TextToEntries(ABC): class TextToEntries(ABC):
def __init__(self, config: Any = None): def __init__(self, config: Any = None):
self.embeddings_model = state.embeddings_model self.embeddings_model = state.embeddings_model
self.config = config
self.date_filter = DateFilter() self.date_filter = DateFilter()
@abstractmethod @abstractmethod

View File

@@ -87,22 +87,14 @@ def update(
force: Optional[bool] = False, force: Optional[bool] = False,
): ):
user = request.user.object user = request.user.object
if not state.config:
error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}."
logger.warning(error_msg)
raise HTTPException(status_code=500, detail=error_msg)
try: try:
initialize_content(user=user, regenerate=force, search_type=t) initialize_content(user=user, regenerate=force, search_type=t)
except Exception as e: except Exception as e:
error_msg = f"🚨 Failed to update server via API: {e}" error_msg = f"🚨 Failed to update server indexed content via API: {e}"
logger.error(error_msg, exc_info=True) logger.error(error_msg, exc_info=True)
raise HTTPException(status_code=500, detail=error_msg) raise HTTPException(status_code=500, detail=error_msg)
else: else:
components = [] logger.info(f"📪 Server indexed content updated via API")
if state.search_models:
components.append("Search models")
components_msg = ", ".join(components)
logger.info(f"📪 {components_msg} updated via API")
update_telemetry_state( update_telemetry_state(
request=request, request=request,

View File

@@ -27,16 +27,7 @@ from khoj.database.adapters import (
get_user_notion_config, get_user_notion_config,
) )
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import ( from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig
GithubConfig,
GithubRepoConfig,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
NotionConfig,
)
from khoj.processor.content.docx.docx_to_entries import DocxToEntries from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.routers.helpers import ( from khoj.routers.helpers import (
@@ -47,17 +38,9 @@ from khoj.routers.helpers import (
get_user_config, get_user_config,
update_telemetry_state, update_telemetry_state,
) )
from khoj.utils import constants, state from khoj.utils import state
from khoj.utils.config import SearchModels from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig
from khoj.utils.rawconfig import (
ContentConfig,
FullConfig,
GithubContentConfig,
NotionContentConfig,
SearchConfig,
)
from khoj.utils.state import SearchType from khoj.utils.state import SearchType
from khoj.utils.yaml import save_config_to_file_updated_state
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -192,8 +175,6 @@ async def set_content_github(
updated_config: Union[GithubContentConfig, None], updated_config: Union[GithubContentConfig, None],
client: Optional[str] = None, client: Optional[str] = None,
): ):
_initialize_config()
user = request.user.object user = request.user.object
try: try:
@@ -225,8 +206,6 @@ async def set_content_notion(
updated_config: Union[NotionContentConfig, None], updated_config: Union[NotionContentConfig, None],
client: Optional[str] = None, client: Optional[str] = None,
): ):
_initialize_config()
user = request.user.object user = request.user.object
try: try:
@@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None):
configured_content_types = set(EntryAdapters.get_unique_file_types(user)) configured_content_types = set(EntryAdapters.get_unique_file_types(user))
configured_content_types |= {"all"} configured_content_types |= {"all"}
if state.config and state.config.content_type:
for ctype in state.config.content_type.model_dump(exclude_none=True):
configured_content_types.add(ctype)
return list(configured_content_types & all_content_types) return list(configured_content_types & all_content_types)
@@ -606,28 +581,6 @@ async def indexer(
docx=index_files["docx"], docx=index_files["docx"],
) )
if state.config == None:
logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
processor=None,
)
state.config = default_full_config
default_content_config = ContentConfig(
org=None,
markdown=None,
pdf=None,
docx=None,
image=None,
github=None,
notion=None,
plaintext=None,
)
state.config.content_type = default_content_config
save_config_to_file_updated_state()
configure_search(state.search_models, state.config.search_type)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
success = await loop.run_in_executor( success = await loop.run_in_executor(
None, None,
@@ -674,14 +627,6 @@ async def indexer(
return Response(content=indexed_filenames, status_code=200) return Response(content=indexed_filenames, status_code=200)
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_models is None:
search_models = SearchModels()
return search_models
def map_config_to_object(content_source: str): def map_config_to_object(content_source: str):
if content_source == DbEntry.EntrySource.GITHUB: if content_source == DbEntry.EntrySource.GITHUB:
return GithubConfig return GithubConfig
@@ -689,56 +634,3 @@ def map_config_to_object(content_source: str):
return NotionConfig return NotionConfig
if content_source == DbEntry.EntrySource.COMPUTER: if content_source == DbEntry.EntrySource.COMPUTER:
return "Computer" return "Computer"
async def map_config_to_db(config: FullConfig, user: KhojUser):
    """Persist the content-type sections of a FullConfig to the user's DB rows.

    Local file configs (org, markdown, pdf, plaintext) are replaced wholesale:
    existing rows for the user are deleted, then recreated from the config.
    Github and Notion settings are stored via their adapter helpers instead.
    """
    if config.content_type:
        if config.content_type.org:
            # Replace the user's org file sync config
            await LocalOrgConfig.objects.filter(user=user).adelete()
            await LocalOrgConfig.objects.acreate(
                input_files=config.content_type.org.input_files,
                input_filter=config.content_type.org.input_filter,
                index_heading_entries=config.content_type.org.index_heading_entries,
                user=user,
            )
        if config.content_type.markdown:
            # Replace the user's markdown file sync config
            await LocalMarkdownConfig.objects.filter(user=user).adelete()
            await LocalMarkdownConfig.objects.acreate(
                input_files=config.content_type.markdown.input_files,
                input_filter=config.content_type.markdown.input_filter,
                index_heading_entries=config.content_type.markdown.index_heading_entries,
                user=user,
            )
        if config.content_type.pdf:
            # Replace the user's pdf file sync config
            await LocalPdfConfig.objects.filter(user=user).adelete()
            await LocalPdfConfig.objects.acreate(
                input_files=config.content_type.pdf.input_files,
                input_filter=config.content_type.pdf.input_filter,
                index_heading_entries=config.content_type.pdf.index_heading_entries,
                user=user,
            )
        if config.content_type.plaintext:
            # Replace the user's plaintext file sync config
            await LocalPlaintextConfig.objects.filter(user=user).adelete()
            await LocalPlaintextConfig.objects.acreate(
                input_files=config.content_type.plaintext.input_files,
                input_filter=config.content_type.plaintext.input_filter,
                index_heading_entries=config.content_type.plaintext.index_heading_entries,
                user=user,
            )
        if config.content_type.github:
            # Github settings are stored via the adapters helper, not a Local* model
            await adapters.set_user_github_config(
                user=user,
                pat_token=config.content_type.github.pat_token,
                repos=config.content_type.github.repos,
            )
        if config.content_type.notion:
            # Notion settings likewise go through the adapters helper
            await adapters.set_notion_config(
                user=user,
                token=config.content_type.notion.token,
            )
def _initialize_config():
    """Seed the global state.config with a default search config on first use."""
    if state.config is None:
        state.config = FullConfig()
        state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"])

View File

@@ -218,7 +218,6 @@ def update_telemetry_state(
telemetry_type=telemetry_type, telemetry_type=telemetry_type,
api=api, api=api,
client=client, client=client,
app_config=state.config.app,
disable_telemetry_env=state.telemetry_disabled, disable_telemetry_env=state.telemetry_disabled,
properties=user_state, properties=user_state,
) )
@@ -2726,7 +2725,8 @@ def configure_content(
search_type = t.value if t else None search_type = t.value if t else None
no_documents = all([not files.get(file_type) for file_type in files]) # Check if client sent any documents of the supported types
no_client_sent_documents = all([not files.get(file_type) for file_type in files])
if files is None: if files is None:
logger.warning(f"🚨 No files to process for {search_type} search.") logger.warning(f"🚨 No files to process for {search_type} search.")
@@ -2800,7 +2800,8 @@ def configure_content(
success = False success = False
try: try:
if no_documents: # Run server side indexing of user Github docs if no client sent documents
if no_client_sent_documents:
github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
if ( if (
search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value
@@ -2820,7 +2821,8 @@ def configure_content(
success = False success = False
try: try:
if no_documents: # Run server side indexing of user Notion docs if no client sent documents
if no_client_sent_documents:
# Initialize Notion Search # Initialize Notion Search
notion_config = NotionConfig.objects.filter(user=user).first() notion_config = NotionConfig.objects.filter(user=user).first()
if ( if (

View File

@@ -1,26 +1,19 @@
import argparse import argparse
import logging import logging
import os
import pathlib import pathlib
from importlib.metadata import version from importlib.metadata import version
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from khoj.utils.helpers import is_env_var_true, resolve_absolute_path
from khoj.utils.yaml import parse_config_from_file
def cli(args=None): def cli(args=None):
# Setup Argument Parser for the Commandline Interface # Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain") parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain")
parser.add_argument( parser.add_argument(
"--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" "--log-file",
) default="~/.khoj/khoj.log",
parser.add_argument( type=pathlib.Path,
"--regenerate", help="File path for server logs. Default: ~/.khoj/khoj.log",
action="store_true",
default=False,
help="Regenerate model embeddings from source files. Default: false",
) )
parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0") parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1") parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
@@ -37,7 +30,7 @@ def cli(args=None):
"--anonymous-mode", "--anonymous-mode",
action="store_true", action="store_true",
default=False, default=False,
help="Run Khoj in anonymous mode. This does not require any login for connecting users.", help="Run Khoj in single user mode with no login required. Useful for personal use or testing.",
) )
parser.add_argument( parser.add_argument(
"--non-interactive", "--non-interactive",
@@ -57,15 +50,4 @@ def cli(args=None):
print(args.version_no) print(args.version_no)
exit(0) exit(0)
# Normalize config_file path to absolute path
args.config_file = resolve_absolute_path(args.config_file)
if not args.config_file.exists():
args.config = None
else:
args = run_migrations(args)
args.config = parse_config_from_file(args.config_file)
if is_env_var_true("KHOJ_TELEMETRY_DISABLE"):
args.config.app.should_log_telemetry = False
return args return args

View File

@@ -1,20 +1,7 @@
# System Packages # System Packages
from __future__ import annotations # to avoid quoting type hints from __future__ import annotations # to avoid quoting type hints
import logging
from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import TYPE_CHECKING, Any, List, Optional, Union
import torch
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sentence_transformers import CrossEncoder
from khoj.utils.models import BaseEncoder
class SearchType(str, Enum): class SearchType(str, Enum):
@@ -27,36 +14,3 @@ class SearchType(str, Enum):
Notion = "notion" Notion = "notion"
Plaintext = "plaintext" Plaintext = "plaintext"
Docx = "docx" Docx = "docx"
class ProcessorType(str, Enum):
    """Closed set of processor kinds; only conversation is defined."""

    Conversation = "conversation"
@dataclass
class TextContent:
    """Flag record for a text content type."""

    # Whether this text content type is enabled.
    enabled: bool
@dataclass
class ImageContent:
    """Indexed image data: names plus their embedding tensors."""

    # Names of the indexed images.
    image_names: List[str]
    # Embeddings of the image contents.
    image_embeddings: torch.Tensor
    # Embeddings of the image metadata.
    image_metadata_embeddings: torch.Tensor
@dataclass
class TextSearchModel:
    """Models used for text search."""

    # Encoder that embeds queries and documents.
    bi_encoder: BaseEncoder
    # Optional cross-encoder — presumably used for reranking; confirm at call sites.
    cross_encoder: Optional[CrossEncoder] = None
    # Number of top results to consider.
    top_k: Optional[int] = 15
@dataclass
class ImageSearchModel:
    """Model used for image search."""

    # Encoder that embeds images.
    image_encoder: BaseEncoder
@dataclass
class SearchModels:
    """Container for the configured search models."""

    # Text search model; None until search is configured.
    text_search: Optional[TextSearchModel] = None

View File

@@ -1,252 +0,0 @@
import glob
import logging
import os
from pathlib import Path
from typing import Optional
from bs4 import BeautifulSoup
from magika import Magika
from khoj.database.models import (
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.utils.config import SearchType
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
from khoj.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
magika = Magika()
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Collect the user's files of the requested search type from the server fs.

    Returns a dict mapping content type ("org", "markdown", ...) to a
    {file path: file content} map. The "image" and "docx" entries are
    always empty — those types are not synced from the server filesystem.
    """
    files: dict[str, dict] = {"docx": {}, "image": {}}

    # (search type, output key, per-user config model, file loader) dispatch table
    loaders = [
        (SearchType.Org, "org", LocalOrgConfig, get_org_files),
        (SearchType.Markdown, "markdown", LocalMarkdownConfig, get_markdown_files),
        (SearchType.Plaintext, "plaintext", LocalPlaintextConfig, get_plaintext_files),
        (SearchType.Pdf, "pdf", LocalPdfConfig, get_pdf_files),
    ]
    for content_search_type, key, config_model, loader in loaders:
        if search_type == SearchType.All or search_type == content_search_type:
            db_config = config_model.objects.filter(user=user).first()
            files[key] = loader(construct_config_from_db(db_config)) if db_config else {}

    # Images and docx are never collected from the server filesystem
    files["image"] = {}
    files["docx"] = {}

    return files
def construct_config_from_db(db_config) -> TextContentConfig:
    """Build a TextContentConfig from a Local*Config database row."""
    field_names = ("input_files", "input_filter", "index_heading_entries")
    return TextContentConfig(**{name: getattr(db_config, name) for name in field_names})
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Collect plaintext files selected by the config.

    Files are taken from the explicit `input_files` list and the
    `input_filter` glob patterns. Non-plaintext files are skipped with a
    warning. HTML-like files have their text extracted via BeautifulSoup.

    Returns a map of absolute file path to file content.
    """

    def is_plaintextfile(file: str):
        "Check if file is a plaintext file"
        # Identify the file type from its content
        content_group = magika.identify_path(Path(file)).output.group
        # Use the file extension to decide plaintext if content is not identifiable.
        # BUGFIX: a missing comma between "org" and "mbox" previously concatenated
        # them into the single element "orgmbox", so .org and .mbox files that
        # relied on the extension fallback were silently skipped.
        # NOTE(review): extensions are matched without a leading dot, so e.g.
        # "footxt" also matches — preserved as-is; confirm if intended.
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]

    def extract_html_content(html_content: str):
        "Extract text content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    # Drop files that do not look like plaintext
    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if any(files_with_no_plaintext_extensions):
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions)

    logger.debug(f"Processing files: {all_target_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for file in all_target_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                plaintext_content = f.read()
                if file.endswith(("html", "htm", "xml")):
                    plaintext_content = extract_html_content(plaintext_content)
                filename_to_content_map[file] = plaintext_content
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_org_files(config: TextContentConfig):
    """Gather org-mode files selected by the config; return {path: content}."""
    # Pull the file selection settings out of the config
    explicit_files, glob_filters = config.input_files, config.input_filter

    # Nothing to do unless at least one selection mechanism is configured
    if is_none_or_empty(explicit_files) and is_none_or_empty(glob_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Resolve explicit paths and expand glob filters into one candidate set
    candidates: set = set()
    if explicit_files:
        candidates |= {get_absolute_path(org_file) for org_file in explicit_files}
    if glob_filters:
        for org_file_filter in glob_filters:
            candidates |= {
                matched
                for matched in glob.glob(get_absolute_path(org_file_filter), recursive=True)
                if os.path.isfile(matched)
            }

    all_org_files = sorted(candidates)

    # Warn about files that do not look like org-mode, but still index them
    suspect_files = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(suspect_files):
        logger.warning(f"There maybe non org-mode files in the input set: {suspect_files}")

    logger.debug(f"Processing files: {all_org_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for org_file in all_org_files:
        with open(org_file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[org_file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {org_file} as org. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_markdown_files(config: TextContentConfig):
    """Gather markdown files selected by the config; return {path: content}."""
    # Pull the file selection settings out of the config
    explicit_files, glob_filters = config.input_files, config.input_filter

    # Nothing to do unless at least one selection mechanism is configured
    if is_none_or_empty(explicit_files) and is_none_or_empty(glob_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Resolve explicit paths and expand glob filters into one candidate set
    candidates: set = set()
    if explicit_files:
        candidates |= {get_absolute_path(md_file) for md_file in explicit_files}
    if glob_filters:
        for md_file_filter in glob_filters:
            candidates |= {
                matched
                for matched in glob.glob(get_absolute_path(md_file_filter), recursive=True)
                if os.path.isfile(matched)
            }

    all_markdown_files = sorted(candidates)

    # Warn about files that do not look like markdown, but still index them
    suspect_files = {
        md_file for md_file in all_markdown_files if not md_file.endswith((".md", ".markdown"))
    }
    if any(suspect_files):
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {suspect_files}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for md_file in all_markdown_files:
        with open(md_file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[md_file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {md_file} as markdown. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_pdf_files(config: TextContentConfig):
    """Map each configured PDF file path to its raw byte content.

    Resolves ``config.input_files`` to absolute paths, expands the glob
    patterns in ``config.input_filter`` (recursively), and reads every
    resulting file in binary mode (PDFs are not text).

    Returns a dict of {absolute file path: file bytes}. Files that cannot
    be read are skipped with a warning. Returns {} when neither
    input_files nor input_filter is configured.
    """
    # Extract required fields from config
    pdf_files, pdf_file_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Get PDF files to process
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
    if pdf_file_filters:
        filtered_pdf_files = {
            filtered_file
            for pdf_file_filter in pdf_file_filters
            for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    # Warn about (but still process) files without a .pdf extension
    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
    if files_with_non_pdf_extensions:
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # Wrap open() itself in the try: a file deleted or made unreadable
        # after globbing should be skipped, not abort the whole sync
        try:
            with open(file, "rb") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map

View File

@@ -47,7 +47,6 @@ if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer from sentence_transformers import CrossEncoder, SentenceTransformer
from khoj.utils.models import BaseEncoder from khoj.utils.models import BaseEncoder
from khoj.utils.rawconfig import AppConfig
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -267,23 +266,16 @@ def get_server_id():
return server_id return server_id
def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool:
if telemetry_disable_env is True:
return True
return not app_config or not app_config.should_log_telemetry
def log_telemetry( def log_telemetry(
telemetry_type: str, telemetry_type: str,
api: str = None, api: str = None,
client: Optional[str] = None, client: Optional[str] = None,
app_config: Optional[AppConfig] = None,
disable_telemetry_env: bool = False, disable_telemetry_env: bool = False,
properties: dict = None, properties: dict = None,
): ):
"""Log basic app usage telemetry like client, os, api called""" """Log basic app usage telemetry like client, os, api called"""
# Do not log usage telemetry, if telemetry is disabled via app config # Do not log usage telemetry, if telemetry is disabled via app config
if telemetry_disabled(app_config, disable_telemetry_env): if disable_telemetry_env:
return [] return []
if properties.get("server_id") is None: if properties.get("server_id") is None:

View File

@@ -147,24 +147,6 @@ def initialization(interactive: bool = True):
logger.info("🗣️ Chat model configuration complete") logger.info("🗣️ Chat model configuration complete")
# Set up offline speech to text model
use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ")
if use_offline_speech2text_model == "y":
logger.info("🗣️ Setting up offline speech to text model")
# Delete any existing speech to text model options. There can only be one.
SpeechToTextModelOptions.objects.all().delete()
default_offline_speech2text_model = "base"
offline_speech2text_model = input(
f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): "
)
offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
SpeechToTextModelOptions.objects.create(
model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
)
logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}")
def _setup_chat_model_provider( def _setup_chat_model_provider(
model_type: ChatModel.ModelType, model_type: ChatModel.ModelType,
default_chat_models: list, default_chat_models: list,

View File

@@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel):
conversation_id: str conversation_id: str
class TextConfigBase(ConfigBase):
compressed_jsonl: Path
embeddings_file: Path
class TextContentConfig(ConfigBase):
input_files: Optional[List[Path]] = None
input_filter: Optional[List[str]] = None
index_heading_entries: Optional[bool] = False
class GithubRepoConfig(ConfigBase): class GithubRepoConfig(ConfigBase):
name: str name: str
owner: str owner: str
@@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase):
token: str token: str
class ContentConfig(ConfigBase):
org: Optional[TextContentConfig] = None
markdown: Optional[TextContentConfig] = None
pdf: Optional[TextContentConfig] = None
plaintext: Optional[TextContentConfig] = None
github: Optional[GithubContentConfig] = None
notion: Optional[NotionContentConfig] = None
image: Optional[TextContentConfig] = None
docx: Optional[TextContentConfig] = None
class ImageSearchConfig(ConfigBase):
encoder: str
encoder_type: Optional[str] = None
model_directory: Optional[Path] = None
class Config:
protected_namespaces = ()
class SearchConfig(ConfigBase):
image: Optional[ImageSearchConfig] = None
class OpenAIProcessorConfig(ConfigBase):
api_key: str
chat_model: Optional[str] = "gpt-4o-mini"
class ConversationProcessorConfig(ConfigBase):
openai: Optional[OpenAIProcessorConfig] = None
max_prompt_size: Optional[int] = None
tokenizer: Optional[str] = None
class ProcessorConfig(ConfigBase):
conversation: Optional[ConversationProcessorConfig] = None
class AppConfig(ConfigBase):
should_log_telemetry: bool = True
class FullConfig(ConfigBase):
content_type: Optional[ContentConfig] = None
search_type: Optional[SearchConfig] = None
processor: Optional[ProcessorConfig] = None
app: Optional[AppConfig] = AppConfig()
version: Optional[str] = None
class SearchResponse(ConfigBase): class SearchResponse(ConfigBase):
entry: str entry: str
score: float score: float

View File

@@ -12,18 +12,14 @@ from whisper import Whisper
from khoj.database.models import ProcessLock from khoj.database.models import ProcessLock
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.utils import config as utils_config from khoj.utils import config as utils_config
from khoj.utils.config import SearchModels
from khoj.utils.helpers import LRU, get_device, is_env_var_true from khoj.utils.helpers import LRU, get_device, is_env_var_true
from khoj.utils.rawconfig import FullConfig
# Application Global State # Application Global State
config = FullConfig()
search_models = SearchModels()
embeddings_model: Dict[str, EmbeddingsModel] = None embeddings_model: Dict[str, EmbeddingsModel] = None
cross_encoder_model: Dict[str, CrossEncoderModel] = None cross_encoder_model: Dict[str, CrossEncoderModel] = None
openai_client: OpenAI = None openai_client: OpenAI = None
whisper_model: Whisper = None whisper_model: Whisper = None
config_file: Path = None log_file: Path = None
verbose: int = 0 verbose: int = 0
host: str = None host: str = None
port: int = None port: int = None

View File

@@ -1,47 +1,8 @@
from pathlib import Path
import yaml import yaml
from khoj.utils import state
from khoj.utils.rawconfig import FullConfig
# Do not emit tags when dumping to YAML # Do not emit tags when dumping to YAML
yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment] yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment]
def save_config_to_file_updated_state():
with open(state.config_file, "w") as outfile:
yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
outfile.close()
return state.config
def save_config_to_file(yaml_config: dict, yaml_config_file: Path):
"Write config to YML file"
# Create output directory, if it doesn't exist
yaml_config_file.parent.mkdir(parents=True, exist_ok=True)
with open(yaml_config_file, "w", encoding="utf-8") as config_file:
yaml.safe_dump(yaml_config, config_file, allow_unicode=True)
def load_config_from_file(yaml_config_file: Path) -> dict:
"Read config from YML file"
config_from_file = None
with open(yaml_config_file, "r", encoding="utf-8") as config_file:
config_from_file = yaml.safe_load(config_file)
return config_from_file
def parse_config_from_string(yaml_config: dict) -> FullConfig:
"Parse and validate config in YML string"
return FullConfig.model_validate(yaml_config)
def parse_config_from_file(yaml_config_file):
"Parse and validate config in YML file"
return parse_config_from_string(load_config_from_file(yaml_config_file))
def yaml_dump(data): def yaml_dump(data):
return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False) return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False)

View File

@@ -33,7 +33,7 @@ from khoj.utils import fs_syncer, state
from khoj.utils.config import SearchModels from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory from khoj.utils.constants import web_directory
from khoj.utils.helpers import resolve_absolute_path from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, ImageSearchConfig, SearchConfig from khoj.utils.rawconfig import ContentConfig, SearchConfig
from tests.helpers import ( from tests.helpers import (
AiModelApiFactory, AiModelApiFactory,
ChatModelFactory, ChatModelFactory,
@@ -69,12 +69,6 @@ def search_config() -> SearchConfig:
model_dir.mkdir(parents=True, exist_ok=True) model_dir.mkdir(parents=True, exist_ok=True)
search_config = SearchConfig() search_config = SearchConfig()
search_config.image = ImageSearchConfig(
encoder="sentence-transformers/clip-ViT-B-32",
model_directory=model_dir / "image/",
encoder_type=None,
)
return search_config return search_config
@@ -301,7 +295,6 @@ def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUs
@pytest.mark.django_db @pytest.mark.django_db
def chat_client_builder(search_config, user, index_content=True, require_auth=False): def chat_client_builder(search_config, user, index_content=True, require_auth=False):
# Initialize app state # Initialize app state
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
if index_content: if index_content:
@@ -349,7 +342,6 @@ def large_kb_chat_client_builder(search_config, user):
import tempfile import tempfile
# Initialize app state # Initialize app state
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
# Create temporary directory for large number of test files # Create temporary directory for large number of test files
@@ -470,12 +462,8 @@ def fastapi_app():
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def client( def client(
content_config: ContentConfig,
search_config: SearchConfig,
api_user: KhojApiUser, api_user: KhojApiUser,
): ):
state.config.content_type = content_config
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
state.embeddings_model = dict() state.embeddings_model = dict()
state.embeddings_model["default"] = EmbeddingsModel() state.embeddings_model["default"] = EmbeddingsModel()

View File

@@ -283,10 +283,6 @@ def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
# Arrange # Arrange
state.anonymous_mode = True state.anonymous_mode = True
if state.config and state.config.content_type:
state.config.content_type = None
state.search_models = configure_search_types()
configure_routes(fastapi_app) configure_routes(fastapi_app)
client = TestClient(fastapi_app) client = TestClient(fastapi_app)