# Reconstructed from a whitespace-collapsed unified diff that deletes
# src/khoj/routers/indexer.py. The same diff also raises
# subscribed_incoming_entries_size_limit from 25 to 75 for put_content and
# patch_content in src/khoj/routers/api_content.py; those hunks show only a
# fragment of each function and are not reproduced as code here.
import asyncio
import logging
from typing import Dict, Optional, Union

from fastapi import APIRouter, Depends, Header, Request, Response, UploadFile
from pydantic import BaseModel
from starlette.authentication import requires

from khoj.database.models import GithubConfig, KhojUser, NotionConfig
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.processor.content.github.github_to_entries import GithubToEntries
from khoj.processor.content.images.image_to_entries import ImageToEntries
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
from khoj.processor.content.notion.notion_to_entries import NotionToEntries
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.routers.helpers import ApiIndexedDataLimiter, update_telemetry_state
from khoj.search_type import text_search
from khoj.utils import constants, state
from khoj.utils.config import SearchModels
from khoj.utils.helpers import LRU, get_file_type
from khoj.utils.rawconfig import ContentConfig, FullConfig, SearchConfig
from khoj.utils.yaml import save_config_to_file_updated_state

logger = logging.getLogger(__name__)

indexer = APIRouter()


class File(BaseModel):
    """A single file submitted for indexing: its path and raw content."""

    path: str
    content: Union[str, bytes]


class IndexBatchRequest(BaseModel):
    """A batch of files submitted together for indexing."""

    files: list[File]


class IndexerInput(BaseModel):
    """Files to index, grouped by content type.

    Each field maps a file name to that file's content. Text formats carry
    `str` content; pdf, image and docx carry `bytes`.
    """

    org: Optional[dict[str, str]] = None
    markdown: Optional[dict[str, str]] = None
    pdf: Optional[dict[str, bytes]] = None
    plaintext: Optional[dict[str, str]] = None
    image: Optional[dict[str, bytes]] = None
    docx: Optional[dict[str, bytes]] = None


@indexer.post("/update")
@requires(["authenticated"])
async def update(
    request: Request,
    files: list[UploadFile],
    force: bool = False,
    t: Optional[Union[state.SearchType, str]] = state.SearchType.All,
    client: Optional[str] = None,
    user_agent: Optional[str] = Header(None),
    referer: Optional[str] = Header(None),
    host: Optional[str] = Header(None),
    indexed_data_limiter: ApiIndexedDataLimiter = Depends(
        ApiIndexedDataLimiter(
            incoming_entries_size_limit=10,
            subscribed_incoming_entries_size_limit=75,
            total_entries_size_limit=10,
            subscribed_total_entries_size_limit=100,
        )
    ),
):
    """Index the uploaded files for the authenticated user.

    Args:
        request: Incoming request; `request.user.object` is the user to index for.
        files: Uploaded files. Each is bucketed by the type that
            `get_file_type` reports; unsupported types are skipped with a warning.
        force: Forwarded to `configure_content` as `regenerate`.
        t: Search type to (re)index; defaults to all types.
        client: Name of the calling client, used in logs and telemetry.
        user_agent, referer, host: Request headers forwarded to telemetry.
        indexed_data_limiter: FastAPI dependency; it is not referenced in the
            body, so any limiting happens when the dependency is resolved.

    Returns:
        A 200 response whose body is the comma-separated names of the accepted
        files, or a 500 response with body "Failed" if anything raised.
    """
    user = request.user.object
    index_files: Dict[str, Dict[str, Union[str, bytes]]] = {
        "org": {},
        "markdown": {},
        "pdf": {},
        "plaintext": {},
        "image": {},
        "docx": {},
    }
    try:
        logger.info(f"📬 Updating content index via API call by {client} client")
        for file in files:
            file_content = file.file.read()
            file_type, encoding = get_file_type(file.content_type, file_content)
            if file_type in index_files:
                # Decode only when an encoding was reported; otherwise keep raw bytes.
                index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
            else:
                logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")

        # The bucket names match the IndexerInput field names one-to-one.
        indexer_input = IndexerInput(**index_files)

        if state.config is None:
            logger.info("📬 Initializing content index on first run.")
            state.config = FullConfig(
                content_type=None,
                search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
                processor=None,
            )
            state.config.content_type = ContentConfig(
                org=None,
                markdown=None,
                pdf=None,
                docx=None,
                image=None,
                github=None,
                notion=None,
                plaintext=None,
            )
            save_config_to_file_updated_state()
            # NOTE(review): the return value is not stored, exactly as in the original.
            configure_search(state.search_models, state.config.search_type)

        # Indexing is blocking work, so run it on the default executor.
        # get_running_loop() is the documented way to get the loop inside a coroutine.
        loop = asyncio.get_running_loop()
        success = await loop.run_in_executor(
            None,
            configure_content,
            indexer_input.model_dump(),
            force,
            t,
            False,
            user,
        )
        if not success:
            raise RuntimeError("Failed to update content index")
        logger.info("Finished processing batch indexing request")
    except Exception as e:
        # One log record with the traceback; the original emitted two for the same failure.
        logger.error(
            f'🚨 Failed to {"force " if force else ""}update {t} content index triggered via API call by {client} client: {e}',
            exc_info=True,
        )
        return Response(content="Failed", status_code=500)

    indexing_metadata = {f"num_{content_type}": len(entries) for content_type, entries in index_files.items()}

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="index/update",
        client=client,
        user_agent=user_agent,
        referer=referer,
        host=host,
        metadata=indexing_metadata,
    )

    logger.info(f"📪 Content index updated via API call by {client} client")

    indexed_filenames = ",".join(filename for entries in index_files.values() for filename in entries)
    return Response(content=indexed_filenames, status_code=200)


def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Return `search_models`, creating an empty `SearchModels` when it is None.

    `search_config` is accepted for interface compatibility but is not used.
    """
    if search_models is None:
        search_models = SearchModels()

    return search_models


def configure_content(
    files: Optional[dict[str, dict[str, str]]],
    regenerate: bool = False,
    t: Optional[state.SearchType] = state.SearchType.All,
    full_corpus: bool = True,
    user: Optional[KhojUser] = None,
) -> bool:
    """Set up search for each requested content type.

    Args:
        files: Mapping of content type ("org", "markdown", "pdf", "plaintext",
            "image", "docx") to a mapping of file name to content. None means
            there is nothing to process.
        regenerate: Forwarded to `text_search.setup`.
        t: Search type to set up, as a `state.SearchType` member or its value.
            None is treated as `state.SearchType.All`.
        full_corpus: Forwarded to `text_search.setup`.
        user: User whose content is indexed. When set, the user's query cache
            entry is replaced with a fresh `LRU` afterwards.

    Returns:
        False if `t` is not a valid search type or any content type failed to
        set up. True otherwise, including when `files` is None.
    """
    if t is None:
        t = state.SearchType.All

    # Enum lookup accepts both members and their values, and raises ValueError
    # for anything else. The original dereferenced `.value` on the raw input,
    # so an unknown string raised AttributeError before this warning.
    try:
        t = state.SearchType(t)
    except ValueError:
        logger.warning(f"🚨 Invalid search type: {t}")
        return False

    search_type = t.value

    # Guard before touching `files`; the original called files.get() first and
    # therefore crashed on None instead of reaching this branch.
    if files is None:
        logger.warning(f"🚨 No files to process for {search_type} search.")
        return True

    # Remote sources (GitHub, Notion) are only synced when no files were sent.
    no_documents = not any(files.values())
    all_types = state.SearchType.All.value

    def index_file_type(content_type, file_key, entries_processor, start_message, failure_label) -> bool:
        """Set up search for one file-backed content type; return False on failure."""
        try:
            # files[file_key] stays inside the try so a missing key is reported
            # as a failure of this content type, as in the original.
            if search_type in (all_types, content_type.value) and files[file_key]:
                logger.info(start_message)
                text_search.setup(
                    entries_processor,
                    files.get(file_key),
                    regenerate=regenerate,
                    full_corpus=full_corpus,
                    user=user,
                )
        except Exception as e:
            logger.error(f"🚨 Failed to setup {failure_label}: {e}", exc_info=True)
            return False
        return True

    success = True

    # Every step runs regardless of earlier failures; the call sits on the left
    # of `and` so it is always evaluated.
    success = (
        index_file_type(state.SearchType.Org, "org", OrgToEntries, "🦄 Setting up search for orgmode notes", "org")
        and success
    )
    success = (
        index_file_type(
            state.SearchType.Markdown,
            "markdown",
            MarkdownToEntries,
            "💎 Setting up search for markdown notes",
            "markdown",
        )
        and success
    )
    success = (
        index_file_type(state.SearchType.Pdf, "pdf", PdfToEntries, "🖨️ Setting up search for pdf", "PDF") and success
    )
    success = (
        index_file_type(
            state.SearchType.Plaintext,
            "plaintext",
            PlaintextToEntries,
            "📄 Setting up search for plaintext",
            "plaintext",
        )
        and success
    )

    try:
        if no_documents:
            github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
            if search_type in (all_types, state.SearchType.Github.value) and github_config is not None:
                logger.info("🐙 Setting up search for github")
                text_search.setup(
                    GithubToEntries,
                    None,
                    regenerate=regenerate,
                    full_corpus=full_corpus,
                    user=user,
                    config=github_config,
                )
    except Exception as e:
        logger.error(f"🚨 Failed to setup GitHub: {e}", exc_info=True)
        success = False

    try:
        if no_documents:
            notion_config = NotionConfig.objects.filter(user=user).first()
            if search_type in (all_types, state.SearchType.Notion.value) and notion_config:
                logger.info("🔌 Setting up search for notion")
                text_search.setup(
                    NotionToEntries,
                    None,
                    regenerate=regenerate,
                    full_corpus=full_corpus,
                    user=user,
                    config=notion_config,
                )
    except Exception as e:
        logger.error(f"🚨 Failed to setup Notion: {e}", exc_info=True)
        success = False

    success = (
        index_file_type(state.SearchType.Image, "image", ImageToEntries, "🖼️ Setting up search for images", "images")
        and success
    )
    success = (
        index_file_type(state.SearchType.Docx, "docx", DocxToEntries, "📄 Setting up search for docx", "docx")
        and success
    )

    # Invalidate Query Cache
    if user:
        state.query_cache[user.uuid] = LRU()

    return success