Do not wrap filepath in Path to fix indexing markdown files on Windows (#993)

### Issue - Paths containing / are converted to use \\ on Windows when wrapped in `Path`. - The `markdown_to_entries` module was trying to normalize file paths with `Path` for some reason. This would store the file paths in the DB Entry differently than in the file to entries map if Khoj ran on Windows. That'd result in a KeyError when trying to look up the entry file path from `file_to_text_map` in the `text_to_entries:update_embeddings()` function. ### Fix - Removing the unnecessary OS-dependent Path normalization in `markdown_to_entries` should keep the file path storage consistent across the `file_to_text_map` var, `FileObjectAdaptor`, and `Entry` DB tables on Windows for Markdown files as well. This issue will affect users hosting Khoj server on Windows and attempting to index markdown files. Resolves #984
2026-03-05 05:39:11 +00:00 · 2024-12-02 01:02:58 -08:00
parent 9e0a2c7a98 47c926b0ff
commit db29894038
2 changed files with 9 additions and 9 deletions
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -3,7 +3,7 @@ import re
 from pathlib import Path
 from typing import Dict, List, Tuple

-import urllib3
+import urllib3.util

 from khoj.database.models import Entry as DbEntry
 from khoj.database.models import KhojUser
@@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries):
        return num_new_embeddings, num_deleted_embeddings

    @staticmethod
-    def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
+    def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
        "Extract entries by heading from specified Markdown files"
        entries: List[str] = []
        entry_to_file_map: List[Tuple[str, str]] = []
-        file_to_text_map = dict()
+        file_to_text_map: Dict[str, str] = dict()
        for markdown_file in markdown_files:
            try:
                markdown_content = markdown_files[markdown_file]
@@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries):
        return entries, entry_to_file_map

    @staticmethod
-    def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+    def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
        "Convert each Markdown entries into a dictionary"
        entries: List[Entry] = []
        for parsed_entry in parsed_entries:
@@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries):
                # Escape the URL to avoid issues with special characters
                entry_filename = urllib3.util.parse_url(raw_filename).url
            else:
-                entry_filename = str(Path(raw_filename))
+                entry_filename = raw_filename

            heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
            # Append base filename to compiled entry for context to model
@@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries):
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=f"{prefix}{heading}",
-                    file=f"{entry_filename}",
+                    file=entry_filename,
                )
            )

--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries):
                    compiled += f"\n {parsed_entry.body}"

                # Add the sub-entry contents to the entry
-                entry_compiled += f"{compiled}"
+                entry_compiled += compiled
                entry_raw += f"{parsed_entry}"
                if not entry_heading:
                    entry_heading = heading
@@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries):
                    Entry(
                        compiled=entry_compiled,
                        raw=entry_raw,
-                        heading=f"{entry_heading}",
-                        file=f"{entry_to_file_map[parsed_entry]}",
+                        heading=entry_heading,
+                        file=entry_to_file_map[parsed_entry],
                    )
                )