Drop old code to sync files on server filesystem. Clean cli, init paths

This stale code was originally used by the server to directly index files on the
server filesystem. Files to sync are now pushed to the server via the API.

Server-side syncing of remote content like Github and Notion is still
supported, but the old, unused code for server-side sync of files on the
server filesystem is being cleaned out.

The new --log-file cli arg allows specifying where the khoj server should
store its log file on the filesystem. It replaces the --config-file cli arg,
which was only being used as a proxy for deciding where to store the log file.

- TODO
  - Tests are broken. They were relying on the server side content
    syncing for test setup
This commit is contained in:
Debanjum
2025-07-03 15:27:06 -07:00
parent b1f2737c9a
commit d9d24dd638
21 changed files with 82 additions and 688 deletions

View File

@@ -50,13 +50,11 @@ from khoj.database.adapters import (
) )
from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content, configure_search from khoj.routers.api_content import configure_content
from khoj.routers.twilio import is_twilio_enabled from khoj.routers.twilio import is_twilio_enabled
from khoj.utils import constants, state from khoj.utils import constants, state
from khoj.utils.config import SearchType from khoj.utils.config import SearchType
from khoj.utils.fs_syncer import collect_files from khoj.utils.helpers import is_none_or_empty
from khoj.utils.helpers import is_none_or_empty, telemetry_disabled
from khoj.utils.rawconfig import FullConfig
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend):
return AuthCredentials(), UnauthenticatedUser() return AuthCredentials(), UnauthenticatedUser()
def initialize_server(config: Optional[FullConfig]):
    """Configure the khoj server at app startup.

    Delegates to configure_server with init=True; logs and re-raises any
    failure so that application startup aborts instead of running
    half-configured.
    """
    try:
        configure_server(config, init=True)
    except Exception as e:
        logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)
        raise e
def clean_connections(func): def clean_connections(func):
""" """
A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed
@@ -260,19 +250,7 @@ def clean_connections(func):
return func_wrapper return func_wrapper
def configure_server( def initialize_server():
config: FullConfig,
regenerate: bool = False,
search_type: Optional[SearchType] = None,
init=False,
user: KhojUser = None,
):
# Update Config
if config == None:
logger.info(f"Initializing with default config.")
config = FullConfig()
state.config = config
if ConversationAdapters.has_valid_ai_model_api(): if ConversationAdapters.has_valid_ai_model_api():
ai_model_api = ConversationAdapters.get_ai_model_api() ai_model_api = ConversationAdapters.get_ai_model_api()
state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url) state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url)
@@ -309,36 +287,26 @@ def configure_server(
) )
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
state.search_models = configure_search(state.search_models, state.config.search_type) setup_default_agent()
setup_default_agent(user)
message = ( message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled"
"📡 Telemetry disabled"
if telemetry_disabled(state.config.app, state.telemetry_disabled)
else "📡 Telemetry enabled"
)
logger.info(message) logger.info(message)
if not init:
initialize_content(user, regenerate, search_type)
except Exception as e: except Exception as e:
logger.error(f"Failed to load some search models: {e}", exc_info=True) logger.error(f"Failed to load some search models: {e}", exc_info=True)
def setup_default_agent(user: KhojUser): def setup_default_agent():
AgentAdapters.create_default_agent(user) AgentAdapters.create_default_agent()
def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None): def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None):
# Initialize Content from Config # Initialize Content from Config
if state.search_models:
try: try:
logger.info("📬 Updating content index...") logger.info("📬 Updating content index...")
all_files = collect_files(user=user)
status = configure_content( status = configure_content(
user, user,
all_files, {},
regenerate, regenerate,
search_type, search_type,
) )
@@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False):
def update_content_index(): def update_content_index():
for user in get_all_users(): for user in get_all_users():
all_files = collect_files(user=user) success = configure_content(user, {})
success = configure_content(user, all_files)
if not success: if not success:
raise RuntimeError("Failed to update content index") raise RuntimeError("Failed to update content index")
logger.info("📪 Content index updated via Scheduler") logger.info("📪 Content index updated via Scheduler")
@@ -464,7 +431,7 @@ def configure_search_types():
@schedule.repeat(schedule.every(2).minutes) @schedule.repeat(schedule.every(2).minutes)
@clean_connections @clean_connections
def upload_telemetry(): def upload_telemetry():
if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry: if state.telemetry_disabled or not state.telemetry:
return return
try: try:

View File

@@ -788,8 +788,8 @@ class AgentAdapters:
return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first() return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first()
@staticmethod @staticmethod
def create_default_agent(user: KhojUser): def create_default_agent():
default_chat_model = ConversationAdapters.get_default_chat_model(user) default_chat_model = ConversationAdapters.get_default_chat_model(user=None)
if default_chat_model is None: if default_chat_model is None:
logger.info("No default conversation config found, skipping default agent creation") logger.info("No default conversation config found, skipping default agent creation")
return None return None

View File

@@ -0,0 +1,36 @@
# Generated by Django 5.1.10 on 2025-07-25 23:30
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the Local*Config models used by the removed server-side file sync."""

    dependencies = [
        ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"),
    ]

    operations = [
        # Drop the user foreign keys first so the models can be deleted cleanly.
        migrations.RemoveField(
            model_name="localorgconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localpdfconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localplaintextconfig",
            name="user",
        ),
        # NOTE(review): localmarkdownconfig has no RemoveField above —
        # presumably its fields are dropped by DeleteModel; confirm autogen output.
        migrations.DeleteModel(
            name="LocalMarkdownConfig",
        ),
        migrations.DeleteModel(
            name="LocalOrgConfig",
        ),
        migrations.DeleteModel(
            name="LocalPdfConfig",
        ),
        migrations.DeleteModel(
            name="LocalPlaintextConfig",
        ),
    ]

View File

@@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel):
super().save(*args, **kwargs) super().save(*args, **kwargs)
class LocalOrgConfig(DbBaseModel):
    """Per-user selection of org-mode files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalMarkdownConfig(DbBaseModel):
    """Per-user selection of markdown files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPdfConfig(DbBaseModel):
    """Per-user selection of pdf files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPlaintextConfig(DbBaseModel):
    """Per-user selection of plaintext files on the server fs to index."""

    # Explicit file paths to index.
    input_files = models.JSONField(default=list, null=True)
    # Glob patterns selecting files to index (expanded with glob.glob).
    input_filter = models.JSONField(default=list, null=True)
    # Presumably controls indexing of standalone headings — confirm at call sites.
    index_heading_entries = models.BooleanField(default=False)
    # Owning user; rows are removed when the user is deleted.
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class SearchModelConfig(DbBaseModel): class SearchModelConfig(DbBaseModel):
class ModelType(models.TextChoices): class ModelType(models.TextChoices):
TEXT = "text" TEXT = "text"

View File

@@ -138,10 +138,10 @@ def run(should_start_server=True):
initialization(not args.non_interactive) initialization(not args.non_interactive)
# Create app directory, if it doesn't exist # Create app directory, if it doesn't exist
state.config_file.parent.mkdir(parents=True, exist_ok=True) state.log_file.parent.mkdir(parents=True, exist_ok=True)
# Set Log File # Set Log File
fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8") fh = logging.FileHandler(state.log_file, encoding="utf-8")
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)
logger.addHandler(fh) logger.addHandler(fh)
@@ -194,7 +194,7 @@ def run(should_start_server=True):
# Configure Middleware # Configure Middleware
configure_middleware(app, state.ssl_config) configure_middleware(app, state.ssl_config)
initialize_server(args.config) initialize_server()
# If the server is started through gunicorn (external to the script), don't start the server # If the server is started through gunicorn (external to the script), don't start the server
if should_start_server: if should_start_server:
@@ -204,8 +204,7 @@ def run(should_start_server=True):
def set_state(args): def set_state(args):
state.config_file = args.config_file state.log_file = args.log_file
state.config = args.config
state.verbose = args.verbose state.verbose = args.verbose
state.host = args.host state.host = args.host
state.port = args.port state.port = args.port

View File

@@ -20,7 +20,6 @@ magika = Magika()
class GithubToEntries(TextToEntries): class GithubToEntries(TextToEntries):
def __init__(self, config: GithubConfig): def __init__(self, config: GithubConfig):
super().__init__(config)
raw_repos = config.githubrepoconfig.all() raw_repos = config.githubrepoconfig.all()
repos = [] repos = []
for repo in raw_repos: for repo in raw_repos:

View File

@@ -47,7 +47,6 @@ class NotionBlockType(Enum):
class NotionToEntries(TextToEntries): class NotionToEntries(TextToEntries):
def __init__(self, config: NotionConfig): def __init__(self, config: NotionConfig):
super().__init__(config)
self.config = NotionContentConfig( self.config = NotionContentConfig(
token=config.token, token=config.token,
) )

View File

@@ -27,7 +27,6 @@ logger = logging.getLogger(__name__)
class TextToEntries(ABC): class TextToEntries(ABC):
def __init__(self, config: Any = None): def __init__(self, config: Any = None):
self.embeddings_model = state.embeddings_model self.embeddings_model = state.embeddings_model
self.config = config
self.date_filter = DateFilter() self.date_filter = DateFilter()
@abstractmethod @abstractmethod

View File

@@ -87,22 +87,14 @@ def update(
force: Optional[bool] = False, force: Optional[bool] = False,
): ):
user = request.user.object user = request.user.object
if not state.config:
error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}."
logger.warning(error_msg)
raise HTTPException(status_code=500, detail=error_msg)
try: try:
initialize_content(user=user, regenerate=force, search_type=t) initialize_content(user=user, regenerate=force, search_type=t)
except Exception as e: except Exception as e:
error_msg = f"🚨 Failed to update server via API: {e}" error_msg = f"🚨 Failed to update server indexed content via API: {e}"
logger.error(error_msg, exc_info=True) logger.error(error_msg, exc_info=True)
raise HTTPException(status_code=500, detail=error_msg) raise HTTPException(status_code=500, detail=error_msg)
else: else:
components = [] logger.info(f"📪 Server indexed content updated via API")
if state.search_models:
components.append("Search models")
components_msg = ", ".join(components)
logger.info(f"📪 {components_msg} updated via API")
update_telemetry_state( update_telemetry_state(
request=request, request=request,

View File

@@ -27,16 +27,7 @@ from khoj.database.adapters import (
get_user_notion_config, get_user_notion_config,
) )
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import ( from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig
GithubConfig,
GithubRepoConfig,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
NotionConfig,
)
from khoj.processor.content.docx.docx_to_entries import DocxToEntries from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.routers.helpers import ( from khoj.routers.helpers import (
@@ -47,17 +38,9 @@ from khoj.routers.helpers import (
get_user_config, get_user_config,
update_telemetry_state, update_telemetry_state,
) )
from khoj.utils import constants, state from khoj.utils import state
from khoj.utils.config import SearchModels from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig
from khoj.utils.rawconfig import (
ContentConfig,
FullConfig,
GithubContentConfig,
NotionContentConfig,
SearchConfig,
)
from khoj.utils.state import SearchType from khoj.utils.state import SearchType
from khoj.utils.yaml import save_config_to_file_updated_state
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -192,8 +175,6 @@ async def set_content_github(
updated_config: Union[GithubContentConfig, None], updated_config: Union[GithubContentConfig, None],
client: Optional[str] = None, client: Optional[str] = None,
): ):
_initialize_config()
user = request.user.object user = request.user.object
try: try:
@@ -225,8 +206,6 @@ async def set_content_notion(
updated_config: Union[NotionContentConfig, None], updated_config: Union[NotionContentConfig, None],
client: Optional[str] = None, client: Optional[str] = None,
): ):
_initialize_config()
user = request.user.object user = request.user.object
try: try:
@@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None):
configured_content_types = set(EntryAdapters.get_unique_file_types(user)) configured_content_types = set(EntryAdapters.get_unique_file_types(user))
configured_content_types |= {"all"} configured_content_types |= {"all"}
if state.config and state.config.content_type:
for ctype in state.config.content_type.model_dump(exclude_none=True):
configured_content_types.add(ctype)
return list(configured_content_types & all_content_types) return list(configured_content_types & all_content_types)
@@ -606,28 +581,6 @@ async def indexer(
docx=index_files["docx"], docx=index_files["docx"],
) )
if state.config == None:
logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
processor=None,
)
state.config = default_full_config
default_content_config = ContentConfig(
org=None,
markdown=None,
pdf=None,
docx=None,
image=None,
github=None,
notion=None,
plaintext=None,
)
state.config.content_type = default_content_config
save_config_to_file_updated_state()
configure_search(state.search_models, state.config.search_type)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
success = await loop.run_in_executor( success = await loop.run_in_executor(
None, None,
@@ -674,14 +627,6 @@ async def indexer(
return Response(content=indexed_filenames, status_code=200) return Response(content=indexed_filenames, status_code=200)
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_models is None:
search_models = SearchModels()
return search_models
def map_config_to_object(content_source: str): def map_config_to_object(content_source: str):
if content_source == DbEntry.EntrySource.GITHUB: if content_source == DbEntry.EntrySource.GITHUB:
return GithubConfig return GithubConfig
@@ -689,56 +634,3 @@ def map_config_to_object(content_source: str):
return NotionConfig return NotionConfig
if content_source == DbEntry.EntrySource.COMPUTER: if content_source == DbEntry.EntrySource.COMPUTER:
return "Computer" return "Computer"
async def map_config_to_db(config: FullConfig, user: KhojUser):
    """Persist the content-type sections of a FullConfig to the user's DB rows.

    Local file configs (org, markdown, pdf, plaintext) are replaced wholesale:
    existing rows for the user are deleted, then recreated from the config.
    Github and Notion settings are stored via their adapter helpers instead.
    """
    if config.content_type:
        if config.content_type.org:
            # Replace the user's org file sync config
            await LocalOrgConfig.objects.filter(user=user).adelete()
            await LocalOrgConfig.objects.acreate(
                input_files=config.content_type.org.input_files,
                input_filter=config.content_type.org.input_filter,
                index_heading_entries=config.content_type.org.index_heading_entries,
                user=user,
            )
        if config.content_type.markdown:
            # Replace the user's markdown file sync config
            await LocalMarkdownConfig.objects.filter(user=user).adelete()
            await LocalMarkdownConfig.objects.acreate(
                input_files=config.content_type.markdown.input_files,
                input_filter=config.content_type.markdown.input_filter,
                index_heading_entries=config.content_type.markdown.index_heading_entries,
                user=user,
            )
        if config.content_type.pdf:
            # Replace the user's pdf file sync config
            await LocalPdfConfig.objects.filter(user=user).adelete()
            await LocalPdfConfig.objects.acreate(
                input_files=config.content_type.pdf.input_files,
                input_filter=config.content_type.pdf.input_filter,
                index_heading_entries=config.content_type.pdf.index_heading_entries,
                user=user,
            )
        if config.content_type.plaintext:
            # Replace the user's plaintext file sync config
            await LocalPlaintextConfig.objects.filter(user=user).adelete()
            await LocalPlaintextConfig.objects.acreate(
                input_files=config.content_type.plaintext.input_files,
                input_filter=config.content_type.plaintext.input_filter,
                index_heading_entries=config.content_type.plaintext.index_heading_entries,
                user=user,
            )
        if config.content_type.github:
            # Github settings are stored via the adapters helper, not a Local* model
            await adapters.set_user_github_config(
                user=user,
                pat_token=config.content_type.github.pat_token,
                repos=config.content_type.github.repos,
            )
        if config.content_type.notion:
            # Notion settings likewise go through the adapters helper
            await adapters.set_notion_config(
                user=user,
                token=config.content_type.notion.token,
            )
def _initialize_config():
    """Seed the global state.config with a default search config on first use."""
    if state.config is None:
        state.config = FullConfig()
        state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"])

View File

@@ -218,7 +218,6 @@ def update_telemetry_state(
telemetry_type=telemetry_type, telemetry_type=telemetry_type,
api=api, api=api,
client=client, client=client,
app_config=state.config.app,
disable_telemetry_env=state.telemetry_disabled, disable_telemetry_env=state.telemetry_disabled,
properties=user_state, properties=user_state,
) )
@@ -2726,7 +2725,8 @@ def configure_content(
search_type = t.value if t else None search_type = t.value if t else None
no_documents = all([not files.get(file_type) for file_type in files]) # Check if client sent any documents of the supported types
no_client_sent_documents = all([not files.get(file_type) for file_type in files])
if files is None: if files is None:
logger.warning(f"🚨 No files to process for {search_type} search.") logger.warning(f"🚨 No files to process for {search_type} search.")
@@ -2800,7 +2800,8 @@ def configure_content(
success = False success = False
try: try:
if no_documents: # Run server side indexing of user Github docs if no client sent documents
if no_client_sent_documents:
github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
if ( if (
search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value
@@ -2820,7 +2821,8 @@ def configure_content(
success = False success = False
try: try:
if no_documents: # Run server side indexing of user Notion docs if no client sent documents
if no_client_sent_documents:
# Initialize Notion Search # Initialize Notion Search
notion_config = NotionConfig.objects.filter(user=user).first() notion_config = NotionConfig.objects.filter(user=user).first()
if ( if (

View File

@@ -1,26 +1,19 @@
import argparse import argparse
import logging import logging
import os
import pathlib import pathlib
from importlib.metadata import version from importlib.metadata import version
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from khoj.utils.helpers import is_env_var_true, resolve_absolute_path
from khoj.utils.yaml import parse_config_from_file
def cli(args=None): def cli(args=None):
# Setup Argument Parser for the Commandline Interface # Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain") parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain")
parser.add_argument( parser.add_argument(
"--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" "--log-file",
) default="~/.khoj/khoj.log",
parser.add_argument( type=pathlib.Path,
"--regenerate", help="File path for server logs. Default: ~/.khoj/khoj.log",
action="store_true",
default=False,
help="Regenerate model embeddings from source files. Default: false",
) )
parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0") parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1") parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
@@ -37,7 +30,7 @@ def cli(args=None):
"--anonymous-mode", "--anonymous-mode",
action="store_true", action="store_true",
default=False, default=False,
help="Run Khoj in anonymous mode. This does not require any login for connecting users.", help="Run Khoj in single user mode with no login required. Useful for personal use or testing.",
) )
parser.add_argument( parser.add_argument(
"--non-interactive", "--non-interactive",
@@ -57,15 +50,4 @@ def cli(args=None):
print(args.version_no) print(args.version_no)
exit(0) exit(0)
# Normalize config_file path to absolute path
args.config_file = resolve_absolute_path(args.config_file)
if not args.config_file.exists():
args.config = None
else:
args = run_migrations(args)
args.config = parse_config_from_file(args.config_file)
if is_env_var_true("KHOJ_TELEMETRY_DISABLE"):
args.config.app.should_log_telemetry = False
return args return args

View File

@@ -1,20 +1,7 @@
# System Packages # System Packages
from __future__ import annotations # to avoid quoting type hints from __future__ import annotations # to avoid quoting type hints
import logging
from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import TYPE_CHECKING, Any, List, Optional, Union
import torch
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sentence_transformers import CrossEncoder
from khoj.utils.models import BaseEncoder
class SearchType(str, Enum): class SearchType(str, Enum):
@@ -27,36 +14,3 @@ class SearchType(str, Enum):
Notion = "notion" Notion = "notion"
Plaintext = "plaintext" Plaintext = "plaintext"
Docx = "docx" Docx = "docx"
class ProcessorType(str, Enum):
    """Closed set of processor kinds; only conversation is defined."""

    Conversation = "conversation"
@dataclass
class TextContent:
    """Flag record for a text content type."""

    # Whether this text content type is enabled.
    enabled: bool
@dataclass
class ImageContent:
    """Indexed image data: names plus their embedding tensors."""

    # Names of the indexed images.
    image_names: List[str]
    # Embeddings of the image contents.
    image_embeddings: torch.Tensor
    # Embeddings of the image metadata.
    image_metadata_embeddings: torch.Tensor
@dataclass
class TextSearchModel:
    """Models used for text search."""

    # Encoder that embeds queries and documents.
    bi_encoder: BaseEncoder
    # Optional cross-encoder — presumably used for reranking; confirm at call sites.
    cross_encoder: Optional[CrossEncoder] = None
    # Number of top results to consider.
    top_k: Optional[int] = 15
@dataclass
class ImageSearchModel:
    """Model used for image search."""

    # Encoder that embeds images.
    image_encoder: BaseEncoder
@dataclass
class SearchModels:
    """Container for the configured search models."""

    # Text search model; None until search is configured.
    text_search: Optional[TextSearchModel] = None

View File

@@ -1,252 +0,0 @@
import glob
import logging
import os
from pathlib import Path
from typing import Optional
from bs4 import BeautifulSoup
from magika import Magika
from khoj.database.models import (
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.utils.config import SearchType
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
from khoj.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
magika = Magika()
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Collect the user's files of the requested search type from the server fs.

    Returns a dict mapping content type ("org", "markdown", ...) to a
    {file path: file content} map. The "image" and "docx" entries are
    always empty — those types are not synced from the server filesystem.
    """
    files: dict[str, dict] = {"docx": {}, "image": {}}

    # (search type, output key, per-user config model, file loader) dispatch table
    loaders = [
        (SearchType.Org, "org", LocalOrgConfig, get_org_files),
        (SearchType.Markdown, "markdown", LocalMarkdownConfig, get_markdown_files),
        (SearchType.Plaintext, "plaintext", LocalPlaintextConfig, get_plaintext_files),
        (SearchType.Pdf, "pdf", LocalPdfConfig, get_pdf_files),
    ]
    for content_search_type, key, config_model, loader in loaders:
        if search_type == SearchType.All or search_type == content_search_type:
            db_config = config_model.objects.filter(user=user).first()
            files[key] = loader(construct_config_from_db(db_config)) if db_config else {}

    # Images and docx are never collected from the server filesystem
    files["image"] = {}
    files["docx"] = {}

    return files
def construct_config_from_db(db_config) -> TextContentConfig:
    """Build a TextContentConfig from a Local*Config database row."""
    field_names = ("input_files", "input_filter", "index_heading_entries")
    return TextContentConfig(**{name: getattr(db_config, name) for name in field_names})
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Collect plaintext files selected by the config.

    Files are taken from the explicit `input_files` list and the
    `input_filter` glob patterns. Non-plaintext files are skipped with a
    warning. HTML-like files have their text extracted via BeautifulSoup.

    Returns a map of absolute file path to file content.
    """

    def is_plaintextfile(file: str):
        "Check if file is a plaintext file"
        # Identify the file type from its content
        content_group = magika.identify_path(Path(file)).output.group
        # Use the file extension to decide plaintext if content is not identifiable.
        # BUGFIX: a missing comma between "org" and "mbox" previously concatenated
        # them into the single element "orgmbox", so .org and .mbox files that
        # relied on the extension fallback were silently skipped.
        # NOTE(review): extensions are matched without a leading dot, so e.g.
        # "footxt" also matches — preserved as-is; confirm if intended.
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]

    def extract_html_content(html_content: str):
        "Extract text content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    # Drop files that do not look like plaintext
    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if any(files_with_no_plaintext_extensions):
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions)

    logger.debug(f"Processing files: {all_target_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for file in all_target_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                plaintext_content = f.read()
                if file.endswith(("html", "htm", "xml")):
                    plaintext_content = extract_html_content(plaintext_content)
                filename_to_content_map[file] = plaintext_content
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_org_files(config: TextContentConfig):
    """Gather org-mode files selected by the config; return {path: content}."""
    # Pull the file selection settings out of the config
    explicit_files, glob_filters = config.input_files, config.input_filter

    # Nothing to do unless at least one selection mechanism is configured
    if is_none_or_empty(explicit_files) and is_none_or_empty(glob_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Resolve explicit paths and expand glob filters into one candidate set
    candidates: set = set()
    if explicit_files:
        candidates |= {get_absolute_path(org_file) for org_file in explicit_files}
    if glob_filters:
        for org_file_filter in glob_filters:
            candidates |= {
                matched
                for matched in glob.glob(get_absolute_path(org_file_filter), recursive=True)
                if os.path.isfile(matched)
            }

    all_org_files = sorted(candidates)

    # Warn about files that do not look like org-mode, but still index them
    suspect_files = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(suspect_files):
        logger.warning(f"There maybe non org-mode files in the input set: {suspect_files}")

    logger.debug(f"Processing files: {all_org_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for org_file in all_org_files:
        with open(org_file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[org_file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {org_file} as org. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_markdown_files(config: TextContentConfig):
    """Gather markdown files selected by the config; return {path: content}."""
    # Pull the file selection settings out of the config
    explicit_files, glob_filters = config.input_files, config.input_filter

    # Nothing to do unless at least one selection mechanism is configured
    if is_none_or_empty(explicit_files) and is_none_or_empty(glob_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Resolve explicit paths and expand glob filters into one candidate set
    candidates: set = set()
    if explicit_files:
        candidates |= {get_absolute_path(md_file) for md_file in explicit_files}
    if glob_filters:
        for md_file_filter in glob_filters:
            candidates |= {
                matched
                for matched in glob.glob(get_absolute_path(md_file_filter), recursive=True)
                if os.path.isfile(matched)
            }

    all_markdown_files = sorted(candidates)

    # Warn about files that do not look like markdown, but still index them
    suspect_files = {
        md_file for md_file in all_markdown_files if not md_file.endswith((".md", ".markdown"))
    }
    if any(suspect_files):
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {suspect_files}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    # Read each file; skip unreadable files instead of failing the whole sync
    filename_to_content_map = {}
    for md_file in all_markdown_files:
        with open(md_file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[md_file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {md_file} as markdown. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_pdf_files(config: TextContentConfig):
    """Map each configured PDF file path to its raw byte content.

    Resolves ``config.input_files`` to absolute paths, expands the glob
    patterns in ``config.input_filter`` (recursively), and reads every
    resulting file in binary mode (PDFs are not text).

    Returns a dict of {absolute file path: file bytes}. Files that cannot
    be read are skipped with a warning. Returns {} when neither
    input_files nor input_filter is configured.
    """
    # Extract required fields from config
    pdf_files, pdf_file_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Get PDF files to process
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
    if pdf_file_filters:
        filtered_pdf_files = {
            filtered_file
            for pdf_file_filter in pdf_file_filters
            for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    # Warn about (but still process) files without a .pdf extension
    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
    if files_with_non_pdf_extensions:
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # Wrap open() itself in the try: a file deleted or made unreadable
        # after globbing should be skipped, not abort the whole sync
        try:
            with open(file, "rb") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map

View File

@@ -47,7 +47,6 @@ if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer from sentence_transformers import CrossEncoder, SentenceTransformer
from khoj.utils.models import BaseEncoder from khoj.utils.models import BaseEncoder
from khoj.utils.rawconfig import AppConfig
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -267,23 +266,16 @@ def get_server_id():
return server_id return server_id
def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool:
if telemetry_disable_env is True:
return True
return not app_config or not app_config.should_log_telemetry
def log_telemetry( def log_telemetry(
telemetry_type: str, telemetry_type: str,
api: str = None, api: str = None,
client: Optional[str] = None, client: Optional[str] = None,
app_config: Optional[AppConfig] = None,
disable_telemetry_env: bool = False, disable_telemetry_env: bool = False,
properties: dict = None, properties: dict = None,
): ):
"""Log basic app usage telemetry like client, os, api called""" """Log basic app usage telemetry like client, os, api called"""
# Do not log usage telemetry, if telemetry is disabled via app config # Do not log usage telemetry, if telemetry is disabled via app config
if telemetry_disabled(app_config, disable_telemetry_env): if disable_telemetry_env:
return [] return []
if properties.get("server_id") is None: if properties.get("server_id") is None:

View File

@@ -147,24 +147,6 @@ def initialization(interactive: bool = True):
logger.info("🗣️ Chat model configuration complete") logger.info("🗣️ Chat model configuration complete")
# Set up offline speech to text model
use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ")
if use_offline_speech2text_model == "y":
logger.info("🗣️ Setting up offline speech to text model")
# Delete any existing speech to text model options. There can only be one.
SpeechToTextModelOptions.objects.all().delete()
default_offline_speech2text_model = "base"
offline_speech2text_model = input(
f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): "
)
offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
SpeechToTextModelOptions.objects.create(
model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
)
logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}")
def _setup_chat_model_provider( def _setup_chat_model_provider(
model_type: ChatModel.ModelType, model_type: ChatModel.ModelType,
default_chat_models: list, default_chat_models: list,

View File

@@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel):
conversation_id: str conversation_id: str
class TextConfigBase(ConfigBase):
compressed_jsonl: Path
embeddings_file: Path
class TextContentConfig(ConfigBase):
input_files: Optional[List[Path]] = None
input_filter: Optional[List[str]] = None
index_heading_entries: Optional[bool] = False
class GithubRepoConfig(ConfigBase): class GithubRepoConfig(ConfigBase):
name: str name: str
owner: str owner: str
@@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase):
token: str token: str
class ContentConfig(ConfigBase):
org: Optional[TextContentConfig] = None
markdown: Optional[TextContentConfig] = None
pdf: Optional[TextContentConfig] = None
plaintext: Optional[TextContentConfig] = None
github: Optional[GithubContentConfig] = None
notion: Optional[NotionContentConfig] = None
image: Optional[TextContentConfig] = None
docx: Optional[TextContentConfig] = None
class ImageSearchConfig(ConfigBase):
encoder: str
encoder_type: Optional[str] = None
model_directory: Optional[Path] = None
class Config:
protected_namespaces = ()
class SearchConfig(ConfigBase):
image: Optional[ImageSearchConfig] = None
class OpenAIProcessorConfig(ConfigBase):
api_key: str
chat_model: Optional[str] = "gpt-4o-mini"
class ConversationProcessorConfig(ConfigBase):
openai: Optional[OpenAIProcessorConfig] = None
max_prompt_size: Optional[int] = None
tokenizer: Optional[str] = None
class ProcessorConfig(ConfigBase):
conversation: Optional[ConversationProcessorConfig] = None
class AppConfig(ConfigBase):
should_log_telemetry: bool = True
class FullConfig(ConfigBase):
content_type: Optional[ContentConfig] = None
search_type: Optional[SearchConfig] = None
processor: Optional[ProcessorConfig] = None
app: Optional[AppConfig] = AppConfig()
version: Optional[str] = None
class SearchResponse(ConfigBase): class SearchResponse(ConfigBase):
entry: str entry: str
score: float score: float

View File

@@ -12,18 +12,14 @@ from whisper import Whisper
from khoj.database.models import ProcessLock from khoj.database.models import ProcessLock
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.utils import config as utils_config from khoj.utils import config as utils_config
from khoj.utils.config import SearchModels
from khoj.utils.helpers import LRU, get_device, is_env_var_true from khoj.utils.helpers import LRU, get_device, is_env_var_true
from khoj.utils.rawconfig import FullConfig
# Application Global State # Application Global State
config = FullConfig()
search_models = SearchModels()
embeddings_model: Dict[str, EmbeddingsModel] = None embeddings_model: Dict[str, EmbeddingsModel] = None
cross_encoder_model: Dict[str, CrossEncoderModel] = None cross_encoder_model: Dict[str, CrossEncoderModel] = None
openai_client: OpenAI = None openai_client: OpenAI = None
whisper_model: Whisper = None whisper_model: Whisper = None
config_file: Path = None log_file: Path = None
verbose: int = 0 verbose: int = 0
host: str = None host: str = None
port: int = None port: int = None

View File

@@ -1,47 +1,8 @@
from pathlib import Path
import yaml import yaml
from khoj.utils import state
from khoj.utils.rawconfig import FullConfig
# Do not emit tags when dumping to YAML # Do not emit tags when dumping to YAML
yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment] yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment]
def save_config_to_file_updated_state():
with open(state.config_file, "w") as outfile:
yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
outfile.close()
return state.config
def save_config_to_file(yaml_config: dict, yaml_config_file: Path):
"Write config to YML file"
# Create output directory, if it doesn't exist
yaml_config_file.parent.mkdir(parents=True, exist_ok=True)
with open(yaml_config_file, "w", encoding="utf-8") as config_file:
yaml.safe_dump(yaml_config, config_file, allow_unicode=True)
def load_config_from_file(yaml_config_file: Path) -> dict:
"Read config from YML file"
config_from_file = None
with open(yaml_config_file, "r", encoding="utf-8") as config_file:
config_from_file = yaml.safe_load(config_file)
return config_from_file
def parse_config_from_string(yaml_config: dict) -> FullConfig:
"Parse and validate config in YML string"
return FullConfig.model_validate(yaml_config)
def parse_config_from_file(yaml_config_file):
"Parse and validate config in YML file"
return parse_config_from_string(load_config_from_file(yaml_config_file))
def yaml_dump(data): def yaml_dump(data):
return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False) return yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False)

View File

@@ -33,7 +33,7 @@ from khoj.utils import fs_syncer, state
from khoj.utils.config import SearchModels from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory from khoj.utils.constants import web_directory
from khoj.utils.helpers import resolve_absolute_path from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, ImageSearchConfig, SearchConfig from khoj.utils.rawconfig import ContentConfig, SearchConfig
from tests.helpers import ( from tests.helpers import (
AiModelApiFactory, AiModelApiFactory,
ChatModelFactory, ChatModelFactory,
@@ -69,12 +69,6 @@ def search_config() -> SearchConfig:
model_dir.mkdir(parents=True, exist_ok=True) model_dir.mkdir(parents=True, exist_ok=True)
search_config = SearchConfig() search_config = SearchConfig()
search_config.image = ImageSearchConfig(
encoder="sentence-transformers/clip-ViT-B-32",
model_directory=model_dir / "image/",
encoder_type=None,
)
return search_config return search_config
@@ -301,7 +295,6 @@ def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUs
@pytest.mark.django_db @pytest.mark.django_db
def chat_client_builder(search_config, user, index_content=True, require_auth=False): def chat_client_builder(search_config, user, index_content=True, require_auth=False):
# Initialize app state # Initialize app state
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
if index_content: if index_content:
@@ -349,7 +342,6 @@ def large_kb_chat_client_builder(search_config, user):
import tempfile import tempfile
# Initialize app state # Initialize app state
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
# Create temporary directory for large number of test files # Create temporary directory for large number of test files
@@ -470,12 +462,8 @@ def fastapi_app():
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def client( def client(
content_config: ContentConfig,
search_config: SearchConfig,
api_user: KhojApiUser, api_user: KhojApiUser,
): ):
state.config.content_type = content_config
state.config.search_type = search_config
state.SearchType = configure_search_types() state.SearchType = configure_search_types()
state.embeddings_model = dict() state.embeddings_model = dict()
state.embeddings_model["default"] = EmbeddingsModel() state.embeddings_model["default"] = EmbeddingsModel()

View File

@@ -283,10 +283,6 @@ def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
# Arrange # Arrange
state.anonymous_mode = True state.anonymous_mode = True
if state.config and state.config.content_type:
state.config.content_type = None
state.search_models = configure_search_types()
configure_routes(fastapi_app) configure_routes(fastapi_app)
client = TestClient(fastapi_app) client = TestClient(fastapi_app)