Drop old code to sync files on server filesystem. Clean cli, init paths

This stale code was originally used to index files on server file
system directly by server. We currently push files to sync via API.

Server-side syncing of remote content like Github and Notion is still
supported, but the old, unused code for server-side sync of files on the
server filesystem is being removed.

The new --log-file cli arg allows specifying where the khoj server should
store logs on the filesystem. It replaces the --config-file cli arg, which
was only being used as a proxy for deciding where to store the log file.

- TODO
  - Tests are broken. They were relying on the server side content
    syncing for test setup
This commit is contained in:
Debanjum
2025-07-03 15:27:06 -07:00
parent b1f2737c9a
commit d9d24dd638
21 changed files with 82 additions and 688 deletions

View File

@@ -50,13 +50,11 @@ from khoj.database.adapters import (
)
from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content, configure_search
from khoj.routers.api_content import configure_content
from khoj.routers.twilio import is_twilio_enabled
from khoj.utils import constants, state
from khoj.utils.config import SearchType
from khoj.utils.fs_syncer import collect_files
from khoj.utils.helpers import is_none_or_empty, telemetry_disabled
from khoj.utils.rawconfig import FullConfig
from khoj.utils.helpers import is_none_or_empty
logger = logging.getLogger(__name__)
@@ -232,14 +230,6 @@ class UserAuthenticationBackend(AuthenticationBackend):
return AuthCredentials(), UnauthenticatedUser()
def initialize_server(config: Optional[FullConfig]):
try:
configure_server(config, init=True)
except Exception as e:
logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)
raise e
def clean_connections(func):
"""
A decorator that ensures that Django database connections that have become unusable, or are obsolete, are closed
@@ -260,19 +250,7 @@ def clean_connections(func):
return func_wrapper
def configure_server(
config: FullConfig,
regenerate: bool = False,
search_type: Optional[SearchType] = None,
init=False,
user: KhojUser = None,
):
# Update Config
if config == None:
logger.info(f"Initializing with default config.")
config = FullConfig()
state.config = config
def initialize_server():
if ConversationAdapters.has_valid_ai_model_api():
ai_model_api = ConversationAdapters.get_ai_model_api()
state.openai_client = openai.OpenAI(api_key=ai_model_api.api_key, base_url=ai_model_api.api_base_url)
@@ -309,43 +287,33 @@ def configure_server(
)
state.SearchType = configure_search_types()
state.search_models = configure_search(state.search_models, state.config.search_type)
setup_default_agent(user)
setup_default_agent()
message = (
"📡 Telemetry disabled"
if telemetry_disabled(state.config.app, state.telemetry_disabled)
else "📡 Telemetry enabled"
)
message = "📡 Telemetry disabled" if state.telemetry_disabled else "📡 Telemetry enabled"
logger.info(message)
if not init:
initialize_content(user, regenerate, search_type)
except Exception as e:
logger.error(f"Failed to load some search models: {e}", exc_info=True)
def setup_default_agent(user: KhojUser):
AgentAdapters.create_default_agent(user)
def setup_default_agent():
AgentAdapters.create_default_agent()
def initialize_content(user: KhojUser, regenerate: bool, search_type: Optional[SearchType] = None):
# Initialize Content from Config
if state.search_models:
try:
logger.info("📬 Updating content index...")
all_files = collect_files(user=user)
status = configure_content(
user,
all_files,
regenerate,
search_type,
)
if not status:
raise RuntimeError("Failed to update content index")
except Exception as e:
raise e
try:
logger.info("📬 Updating content index...")
status = configure_content(
user,
{},
regenerate,
search_type,
)
if not status:
raise RuntimeError("Failed to update content index")
except Exception as e:
raise e
def configure_routes(app):
@@ -438,8 +406,7 @@ def configure_middleware(app, ssl_enabled: bool = False):
def update_content_index():
for user in get_all_users():
all_files = collect_files(user=user)
success = configure_content(user, all_files)
success = configure_content(user, {})
if not success:
raise RuntimeError("Failed to update content index")
logger.info("📪 Content index updated via Scheduler")
@@ -464,7 +431,7 @@ def configure_search_types():
@schedule.repeat(schedule.every(2).minutes)
@clean_connections
def upload_telemetry():
if telemetry_disabled(state.config.app, state.telemetry_disabled) or not state.telemetry:
if state.telemetry_disabled or not state.telemetry:
return
try:

View File

@@ -788,8 +788,8 @@ class AgentAdapters:
return Agent.objects.filter(name=AgentAdapters.DEFAULT_AGENT_NAME).first()
@staticmethod
def create_default_agent(user: KhojUser):
default_chat_model = ConversationAdapters.get_default_chat_model(user)
def create_default_agent():
default_chat_model = ConversationAdapters.get_default_chat_model(user=None)
if default_chat_model is None:
logger.info("No default conversation config found, skipping default agent creation")
return None

View File

@@ -0,0 +1,36 @@
# Generated by Django 5.1.10 on 2025-07-25 23:30
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the Local*Config models used for server-side filesystem sync.

    The commit removes server-side indexing of files on the server
    filesystem, so the per-user local content config tables (org, markdown,
    pdf, plaintext) are no longer needed. The user foreign keys are removed
    first, then the models themselves are deleted.
    """

    dependencies = [
        ("database", "0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more"),
    ]

    operations = [
        # Drop user FKs before deleting the models that carry them
        migrations.RemoveField(
            model_name="localorgconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localpdfconfig",
            name="user",
        ),
        migrations.RemoveField(
            model_name="localplaintextconfig",
            name="user",
        ),
        migrations.DeleteModel(
            name="LocalMarkdownConfig",
        ),
        migrations.DeleteModel(
            name="LocalOrgConfig",
        ),
        migrations.DeleteModel(
            name="LocalPdfConfig",
        ),
        migrations.DeleteModel(
            name="LocalPlaintextConfig",
        ),
    ]

View File

@@ -488,34 +488,6 @@ class ServerChatSettings(DbBaseModel):
super().save(*args, **kwargs)
class LocalOrgConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalMarkdownConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPdfConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class LocalPlaintextConfig(DbBaseModel):
input_files = models.JSONField(default=list, null=True)
input_filter = models.JSONField(default=list, null=True)
index_heading_entries = models.BooleanField(default=False)
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE)
class SearchModelConfig(DbBaseModel):
class ModelType(models.TextChoices):
TEXT = "text"

View File

@@ -138,10 +138,10 @@ def run(should_start_server=True):
initialization(not args.non_interactive)
# Create app directory, if it doesn't exist
state.config_file.parent.mkdir(parents=True, exist_ok=True)
state.log_file.parent.mkdir(parents=True, exist_ok=True)
# Set Log File
fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8")
fh = logging.FileHandler(state.log_file, encoding="utf-8")
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
@@ -194,7 +194,7 @@ def run(should_start_server=True):
# Configure Middleware
configure_middleware(app, state.ssl_config)
initialize_server(args.config)
initialize_server()
# If the server is started through gunicorn (external to the script), don't start the server
if should_start_server:
@@ -204,8 +204,7 @@ def run(should_start_server=True):
def set_state(args):
state.config_file = args.config_file
state.config = args.config
state.log_file = args.log_file
state.verbose = args.verbose
state.host = args.host
state.port = args.port

View File

@@ -20,7 +20,6 @@ magika = Magika()
class GithubToEntries(TextToEntries):
def __init__(self, config: GithubConfig):
super().__init__(config)
raw_repos = config.githubrepoconfig.all()
repos = []
for repo in raw_repos:

View File

@@ -47,7 +47,6 @@ class NotionBlockType(Enum):
class NotionToEntries(TextToEntries):
def __init__(self, config: NotionConfig):
super().__init__(config)
self.config = NotionContentConfig(
token=config.token,
)

View File

@@ -27,7 +27,6 @@ logger = logging.getLogger(__name__)
class TextToEntries(ABC):
def __init__(self, config: Any = None):
self.embeddings_model = state.embeddings_model
self.config = config
self.date_filter = DateFilter()
@abstractmethod

View File

@@ -87,22 +87,14 @@ def update(
force: Optional[bool] = False,
):
user = request.user.object
if not state.config:
error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/settings, plugins or by editing {state.config_file}."
logger.warning(error_msg)
raise HTTPException(status_code=500, detail=error_msg)
try:
initialize_content(user=user, regenerate=force, search_type=t)
except Exception as e:
error_msg = f"🚨 Failed to update server via API: {e}"
error_msg = f"🚨 Failed to update server indexed content via API: {e}"
logger.error(error_msg, exc_info=True)
raise HTTPException(status_code=500, detail=error_msg)
else:
components = []
if state.search_models:
components.append("Search models")
components_msg = ", ".join(components)
logger.info(f"📪 {components_msg} updated via API")
logger.info(f"📪 Server indexed content updated via API")
update_telemetry_state(
request=request,

View File

@@ -27,16 +27,7 @@ from khoj.database.adapters import (
get_user_notion_config,
)
from khoj.database.models import Entry as DbEntry
from khoj.database.models import (
GithubConfig,
GithubRepoConfig,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
NotionConfig,
)
from khoj.database.models import GithubConfig, GithubRepoConfig, NotionConfig
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.routers.helpers import (
@@ -47,17 +38,9 @@ from khoj.routers.helpers import (
get_user_config,
update_telemetry_state,
)
from khoj.utils import constants, state
from khoj.utils.config import SearchModels
from khoj.utils.rawconfig import (
ContentConfig,
FullConfig,
GithubContentConfig,
NotionContentConfig,
SearchConfig,
)
from khoj.utils import state
from khoj.utils.rawconfig import GithubContentConfig, NotionContentConfig
from khoj.utils.state import SearchType
from khoj.utils.yaml import save_config_to_file_updated_state
logger = logging.getLogger(__name__)
@@ -192,8 +175,6 @@ async def set_content_github(
updated_config: Union[GithubContentConfig, None],
client: Optional[str] = None,
):
_initialize_config()
user = request.user.object
try:
@@ -225,8 +206,6 @@ async def set_content_notion(
updated_config: Union[NotionContentConfig, None],
client: Optional[str] = None,
):
_initialize_config()
user = request.user.object
try:
@@ -323,10 +302,6 @@ def get_content_types(request: Request, client: Optional[str] = None):
configured_content_types = set(EntryAdapters.get_unique_file_types(user))
configured_content_types |= {"all"}
if state.config and state.config.content_type:
for ctype in state.config.content_type.model_dump(exclude_none=True):
configured_content_types.add(ctype)
return list(configured_content_types & all_content_types)
@@ -606,28 +581,6 @@ async def indexer(
docx=index_files["docx"],
)
if state.config == None:
logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
processor=None,
)
state.config = default_full_config
default_content_config = ContentConfig(
org=None,
markdown=None,
pdf=None,
docx=None,
image=None,
github=None,
notion=None,
plaintext=None,
)
state.config.content_type = default_content_config
save_config_to_file_updated_state()
configure_search(state.search_models, state.config.search_type)
loop = asyncio.get_event_loop()
success = await loop.run_in_executor(
None,
@@ -674,14 +627,6 @@ async def indexer(
return Response(content=indexed_filenames, status_code=200)
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_models is None:
search_models = SearchModels()
return search_models
def map_config_to_object(content_source: str):
if content_source == DbEntry.EntrySource.GITHUB:
return GithubConfig
@@ -689,56 +634,3 @@ def map_config_to_object(content_source: str):
return NotionConfig
if content_source == DbEntry.EntrySource.COMPUTER:
return "Computer"
async def map_config_to_db(config: FullConfig, user: KhojUser):
if config.content_type:
if config.content_type.org:
await LocalOrgConfig.objects.filter(user=user).adelete()
await LocalOrgConfig.objects.acreate(
input_files=config.content_type.org.input_files,
input_filter=config.content_type.org.input_filter,
index_heading_entries=config.content_type.org.index_heading_entries,
user=user,
)
if config.content_type.markdown:
await LocalMarkdownConfig.objects.filter(user=user).adelete()
await LocalMarkdownConfig.objects.acreate(
input_files=config.content_type.markdown.input_files,
input_filter=config.content_type.markdown.input_filter,
index_heading_entries=config.content_type.markdown.index_heading_entries,
user=user,
)
if config.content_type.pdf:
await LocalPdfConfig.objects.filter(user=user).adelete()
await LocalPdfConfig.objects.acreate(
input_files=config.content_type.pdf.input_files,
input_filter=config.content_type.pdf.input_filter,
index_heading_entries=config.content_type.pdf.index_heading_entries,
user=user,
)
if config.content_type.plaintext:
await LocalPlaintextConfig.objects.filter(user=user).adelete()
await LocalPlaintextConfig.objects.acreate(
input_files=config.content_type.plaintext.input_files,
input_filter=config.content_type.plaintext.input_filter,
index_heading_entries=config.content_type.plaintext.index_heading_entries,
user=user,
)
if config.content_type.github:
await adapters.set_user_github_config(
user=user,
pat_token=config.content_type.github.pat_token,
repos=config.content_type.github.repos,
)
if config.content_type.notion:
await adapters.set_notion_config(
user=user,
token=config.content_type.notion.token,
)
def _initialize_config():
if state.config is None:
state.config = FullConfig()
state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"])

View File

@@ -218,7 +218,6 @@ def update_telemetry_state(
telemetry_type=telemetry_type,
api=api,
client=client,
app_config=state.config.app,
disable_telemetry_env=state.telemetry_disabled,
properties=user_state,
)
@@ -2726,7 +2725,8 @@ def configure_content(
search_type = t.value if t else None
no_documents = all([not files.get(file_type) for file_type in files])
# Check if client sent any documents of the supported types
no_client_sent_documents = all([not files.get(file_type) for file_type in files])
if files is None:
logger.warning(f"🚨 No files to process for {search_type} search.")
@@ -2800,7 +2800,8 @@ def configure_content(
success = False
try:
if no_documents:
# Run server side indexing of user Github docs if no client sent documents
if no_client_sent_documents:
github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
if (
search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value
@@ -2820,7 +2821,8 @@ def configure_content(
success = False
try:
if no_documents:
# Run server side indexing of user Notion docs if no client sent documents
if no_client_sent_documents:
# Initialize Notion Search
notion_config = NotionConfig.objects.filter(user=user).first()
if (

View File

@@ -1,26 +1,19 @@
import argparse
import logging
import os
import pathlib
from importlib.metadata import version
logger = logging.getLogger(__name__)
from khoj.utils.helpers import is_env_var_true, resolve_absolute_path
from khoj.utils.yaml import parse_config_from_file
def cli(args=None):
# Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain")
parser.add_argument(
"--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj"
)
parser.add_argument(
"--regenerate",
action="store_true",
default=False,
help="Regenerate model embeddings from source files. Default: false",
"--log-file",
default="~/.khoj/khoj.log",
type=pathlib.Path,
help="File path for server logs. Default: ~/.khoj/khoj.log",
)
parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
@@ -37,7 +30,7 @@ def cli(args=None):
"--anonymous-mode",
action="store_true",
default=False,
help="Run Khoj in anonymous mode. This does not require any login for connecting users.",
help="Run Khoj in single user mode with no login required. Useful for personal use or testing.",
)
parser.add_argument(
"--non-interactive",
@@ -57,15 +50,4 @@ def cli(args=None):
print(args.version_no)
exit(0)
# Normalize config_file path to absolute path
args.config_file = resolve_absolute_path(args.config_file)
if not args.config_file.exists():
args.config = None
else:
args = run_migrations(args)
args.config = parse_config_from_file(args.config_file)
if is_env_var_true("KHOJ_TELEMETRY_DISABLE"):
args.config.app.should_log_telemetry = False
return args

View File

@@ -1,20 +1,7 @@
# System Packages
from __future__ import annotations # to avoid quoting type hints
import logging
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Any, List, Optional, Union
import torch
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sentence_transformers import CrossEncoder
from khoj.utils.models import BaseEncoder
class SearchType(str, Enum):
@@ -27,36 +14,3 @@ class SearchType(str, Enum):
Notion = "notion"
Plaintext = "plaintext"
Docx = "docx"
class ProcessorType(str, Enum):
Conversation = "conversation"
@dataclass
class TextContent:
enabled: bool
@dataclass
class ImageContent:
image_names: List[str]
image_embeddings: torch.Tensor
image_metadata_embeddings: torch.Tensor
@dataclass
class TextSearchModel:
bi_encoder: BaseEncoder
cross_encoder: Optional[CrossEncoder] = None
top_k: Optional[int] = 15
@dataclass
class ImageSearchModel:
image_encoder: BaseEncoder
@dataclass
class SearchModels:
text_search: Optional[TextSearchModel] = None

View File

@@ -1,252 +0,0 @@
import glob
import logging
import os
from pathlib import Path
from typing import Optional
from bs4 import BeautifulSoup
from magika import Magika
from khoj.database.models import (
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.utils.config import SearchType
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
from khoj.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
magika = Magika()
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Collect files configured for indexing on the server filesystem for the given user.

    Returns a dict mapping content type ("org", "markdown", "plaintext",
    "pdf", "image", "docx") to a filename -> content map. A content type is
    only populated when it matches search_type and the user has a local
    config row for it; otherwise it maps to an empty dict.
    """
    files: dict[str, dict] = {"docx": {}, "image": {}}

    if search_type == SearchType.All or search_type == SearchType.Org:
        org_config = LocalOrgConfig.objects.filter(user=user).first()
        files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {}
    if search_type == SearchType.All or search_type == SearchType.Markdown:
        markdown_config = LocalMarkdownConfig.objects.filter(user=user).first()
        files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {}
    if search_type == SearchType.All or search_type == SearchType.Plaintext:
        plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first()
        files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {}
    if search_type == SearchType.All or search_type == SearchType.Pdf:
        pdf_config = LocalPdfConfig.objects.filter(user=user).first()
        files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {}

    # NOTE(review): "image" and "docx" were already initialized above; these
    # reassignments are redundant no-ops kept from the original code.
    files["image"] = {}
    files["docx"] = {}

    return files
def construct_config_from_db(db_config) -> TextContentConfig:
    "Convert a local content config DB row into a TextContentConfig."
    content_config = TextContentConfig(
        input_files=db_config.input_files,
        input_filter=db_config.input_filter,
        index_heading_entries=db_config.index_heading_entries,
    )
    return content_config
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Map configured plaintext files to their content.

    Files come from config.input_files and from glob patterns in
    config.input_filter. Files identified as neither text/code content nor
    carrying a known plaintext extension are skipped with a warning. HTML
    files are reduced to their visible text before being returned.
    """

    def is_plaintextfile(file: str) -> bool:
        "Check if file is a plaintext file by content or extension"
        # Identify the file's content group from its bytes via magika
        content_group = magika.identify_path(Path(file)).output.group
        # Use file extension to decide plaintext if file content is not identifiable
        # BUGFIX: a missing comma previously fused "org" "mbox" into "orgmbox",
        # so .org and .mbox files failed this extension check.
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]

    def extract_html_content(html_content: str) -> str:
        "Extract visible text content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    # Drop files that fail the plaintext check so they don't poison the index
    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if any(files_with_no_plaintext_extensions):
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions)

    logger.debug(f"Processing files: {all_target_files}")

    filename_to_content_map = {}
    for file in all_target_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                plaintext_content = f.read()
                if file.endswith(("html", "htm", "xml")):
                    plaintext_content = extract_html_content(plaintext_content)
                filename_to_content_map[file] = plaintext_content
            except Exception as e:
                # Best-effort: skip unreadable files rather than failing the whole sync
                logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
                logger.warning(e, exc_info=True)

    return filename_to_content_map
def get_org_files(config: TextContentConfig):
    "Collect configured org-mode files into a filename -> content map."
    org_files, org_file_filters = config.input_files, config.input_filter

    # Require at least one source of org files to be configured
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(listed_file) for listed_file in org_files}
    if org_file_filters:
        for file_filter in org_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_org_files.add(matched_file)

    all_org_files = sorted(absolute_org_files | filtered_org_files)

    # Warn (but do not skip) when files lack the .org extension
    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(files_with_non_org_extensions):
        logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")

    logger.debug(f"Processing files: {all_org_files}")

    filename_to_content_map = {}
    for file in all_org_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as org. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map
def get_markdown_files(config: TextContentConfig):
    "Collect configured markdown files into a filename -> content map."
    markdown_files, markdown_file_filters = config.input_files, config.input_filter

    # Require at least one source of markdown files to be configured
    if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_markdown_files, filtered_markdown_files = set(), set()
    if markdown_files:
        absolute_markdown_files = {get_absolute_path(listed_file) for listed_file in markdown_files}
    if markdown_file_filters:
        for file_filter in markdown_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_markdown_files.add(matched_file)

    all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

    # Warn (but do not skip) when files lack a markdown extension
    files_with_non_markdown_extensions = {
        md_file for md_file in all_markdown_files if not md_file.endswith(".md") and not md_file.endswith(".markdown")
    }
    if any(files_with_non_markdown_extensions):
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    filename_to_content_map = {}
    for file in all_markdown_files:
        with open(file, "r", encoding="utf8") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as markdown. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map
def get_pdf_files(config: TextContentConfig):
    "Collect configured PDF files into a filename -> bytes content map."
    pdf_files, pdf_file_filters = config.input_files, config.input_filter

    # Require at least one source of pdf files to be configured
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Resolve explicitly listed files and expand glob filters
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(listed_file) for listed_file in pdf_files}
    if pdf_file_filters:
        for file_filter in pdf_file_filters:
            for matched_file in glob.glob(get_absolute_path(file_filter), recursive=True):
                if os.path.isfile(matched_file):
                    filtered_pdf_files.add(matched_file)

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    # Warn (but do not skip) when files lack the .pdf extension
    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
    if any(files_with_non_pdf_extensions):
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # PDFs are binary; read raw bytes
        with open(file, "rb") as f:
            try:
                filename_to_content_map[file] = f.read()
            except Exception as e:
                logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
                logger.warning(e, exc_info=True)
    return filename_to_content_map

View File

@@ -47,7 +47,6 @@ if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer
from khoj.utils.models import BaseEncoder
from khoj.utils.rawconfig import AppConfig
logger = logging.getLogger(__name__)
@@ -267,23 +266,16 @@ def get_server_id():
return server_id
def telemetry_disabled(app_config: AppConfig, telemetry_disable_env) -> bool:
if telemetry_disable_env is True:
return True
return not app_config or not app_config.should_log_telemetry
def log_telemetry(
telemetry_type: str,
api: str = None,
client: Optional[str] = None,
app_config: Optional[AppConfig] = None,
disable_telemetry_env: bool = False,
properties: dict = None,
):
"""Log basic app usage telemetry like client, os, api called"""
# Do not log usage telemetry, if telemetry is disabled via app config
if telemetry_disabled(app_config, disable_telemetry_env):
if disable_telemetry_env:
return []
if properties.get("server_id") is None:

View File

@@ -147,24 +147,6 @@ def initialization(interactive: bool = True):
logger.info("🗣️ Chat model configuration complete")
# Set up offline speech to text model
use_offline_speech2text_model = "n" if not interactive else input("Use offline speech to text model? (y/n): ")
if use_offline_speech2text_model == "y":
logger.info("🗣️ Setting up offline speech to text model")
# Delete any existing speech to text model options. There can only be one.
SpeechToTextModelOptions.objects.all().delete()
default_offline_speech2text_model = "base"
offline_speech2text_model = input(
f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): "
)
offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
SpeechToTextModelOptions.objects.create(
model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
)
logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}")
def _setup_chat_model_provider(
model_type: ChatModel.ModelType,
default_chat_models: list,

View File

@@ -48,17 +48,6 @@ class FilesFilterRequest(BaseModel):
conversation_id: str
class TextConfigBase(ConfigBase):
compressed_jsonl: Path
embeddings_file: Path
class TextContentConfig(ConfigBase):
input_files: Optional[List[Path]] = None
input_filter: Optional[List[str]] = None
index_heading_entries: Optional[bool] = False
class GithubRepoConfig(ConfigBase):
name: str
owner: str
@@ -74,57 +63,6 @@ class NotionContentConfig(ConfigBase):
token: str
class ContentConfig(ConfigBase):
org: Optional[TextContentConfig] = None
markdown: Optional[TextContentConfig] = None
pdf: Optional[TextContentConfig] = None
plaintext: Optional[TextContentConfig] = None
github: Optional[GithubContentConfig] = None
notion: Optional[NotionContentConfig] = None
image: Optional[TextContentConfig] = None
docx: Optional[TextContentConfig] = None
class ImageSearchConfig(ConfigBase):
encoder: str
encoder_type: Optional[str] = None
model_directory: Optional[Path] = None
class Config:
protected_namespaces = ()
class SearchConfig(ConfigBase):
image: Optional[ImageSearchConfig] = None
class OpenAIProcessorConfig(ConfigBase):
api_key: str
chat_model: Optional[str] = "gpt-4o-mini"
class ConversationProcessorConfig(ConfigBase):
openai: Optional[OpenAIProcessorConfig] = None
max_prompt_size: Optional[int] = None
tokenizer: Optional[str] = None
class ProcessorConfig(ConfigBase):
conversation: Optional[ConversationProcessorConfig] = None
class AppConfig(ConfigBase):
should_log_telemetry: bool = True
class FullConfig(ConfigBase):
content_type: Optional[ContentConfig] = None
search_type: Optional[SearchConfig] = None
processor: Optional[ProcessorConfig] = None
app: Optional[AppConfig] = AppConfig()
version: Optional[str] = None
class SearchResponse(ConfigBase):
entry: str
score: float

View File

@@ -12,18 +12,14 @@ from whisper import Whisper
from khoj.database.models import ProcessLock
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.utils import config as utils_config
from khoj.utils.config import SearchModels
from khoj.utils.helpers import LRU, get_device, is_env_var_true
from khoj.utils.rawconfig import FullConfig
# Application Global State
config = FullConfig()
search_models = SearchModels()
embeddings_model: Dict[str, EmbeddingsModel] = None
cross_encoder_model: Dict[str, CrossEncoderModel] = None
openai_client: OpenAI = None
whisper_model: Whisper = None
config_file: Path = None
log_file: Path = None
verbose: int = 0
host: str = None
port: int = None

View File

@@ -1,47 +1,8 @@
from pathlib import Path
import yaml
from khoj.utils import state
from khoj.utils.rawconfig import FullConfig
# Do not emit tags when dumping to YAML
yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None # type: ignore[assignment]
def save_config_to_file_updated_state():
    """Persist the in-memory config (state.config) to the configured YAML file.

    Returns the saved config for caller convenience.
    """
    with open(state.config_file, "w") as outfile:
        # Round-trip through JSON to serialize the pydantic model into plain
        # dicts before YAML-dumping it
        yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
    # Removed redundant outfile.close(): the with-block already closes the file
    return state.config
def save_config_to_file(yaml_config: dict, yaml_config_file: Path):
    "Write config to YML file"
    # Ensure the destination directory exists before writing
    yaml_config_file.parent.mkdir(parents=True, exist_ok=True)
    with yaml_config_file.open("w", encoding="utf-8") as config_file:
        yaml.safe_dump(yaml_config, config_file, allow_unicode=True)
def load_config_from_file(yaml_config_file: Path) -> dict:
    "Read config from YML file"
    with open(yaml_config_file, "r", encoding="utf-8") as config_file:
        # safe_load parses the YAML document into plain Python containers
        return yaml.safe_load(config_file)
def parse_config_from_string(yaml_config: dict) -> FullConfig:
    "Validate an already-loaded YAML config dict into a FullConfig model"
    return FullConfig.model_validate(yaml_config)
def parse_config_from_file(yaml_config_file) -> FullConfig:
    "Load the YAML config file at the given path and validate it into a FullConfig"
    return parse_config_from_string(load_config_from_file(yaml_config_file))
def yaml_dump(data):
    "Serialize data to a YAML string, preserving key order, unicode and block style"
    return yaml.dump(
        data,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    )