Do not wrap filepath in Path to fix indexing markdown files on Windows (#993)

### Issue
- Paths with / are converted to \\ on Windows when using the `Path` operator.
- The `markdown_to_entries` module was trying to normalize file paths with `Path` for some reason.
  This would store the file paths in the DB Entry differently than in the file to entries map if Khoj ran on Windows.
  That'd result in a KeyError when trying to look up the entry file path from `file_to_text_map` in the `text_to_entries:update_embeddings()` function.

### Fix
- Removing the unnecessary OS-dependent Path normalization in `markdown_to_entries` should keep the file path storage consistent across the `file_to_text_map` var, `FileObjectAdaptor`, and `Entry` DB tables on Windows for Markdown files as well.

This issue will affect users hosting the Khoj server on Windows and attempting to index Markdown files.

Resolves #984
This commit is contained in:
Debanjum
2024-12-02 01:02:58 -08:00
committed by GitHub
2 changed files with 9 additions and 9 deletions

View File

@@ -3,7 +3,7 @@ import re
from pathlib import Path from pathlib import Path
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
import urllib3 import urllib3.util
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser from khoj.database.models import KhojUser
@@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries):
return num_new_embeddings, num_deleted_embeddings return num_new_embeddings, num_deleted_embeddings
@staticmethod @staticmethod
def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]: def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
"Extract entries by heading from specified Markdown files" "Extract entries by heading from specified Markdown files"
entries: List[str] = [] entries: List[str] = []
entry_to_file_map: List[Tuple[str, str]] = [] entry_to_file_map: List[Tuple[str, str]] = []
file_to_text_map = dict() file_to_text_map: Dict[str, str] = dict()
for markdown_file in markdown_files: for markdown_file in markdown_files:
try: try:
markdown_content = markdown_files[markdown_file] markdown_content = markdown_files[markdown_file]
@@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries):
return entries, entry_to_file_map return entries, entry_to_file_map
@staticmethod @staticmethod
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
"Convert each Markdown entries into a dictionary" "Convert each Markdown entries into a dictionary"
entries: List[Entry] = [] entries: List[Entry] = []
for parsed_entry in parsed_entries: for parsed_entry in parsed_entries:
@@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries):
# Escape the URL to avoid issues with special characters # Escape the URL to avoid issues with special characters
entry_filename = urllib3.util.parse_url(raw_filename).url entry_filename = urllib3.util.parse_url(raw_filename).url
else: else:
entry_filename = str(Path(raw_filename)) entry_filename = raw_filename
heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else "" heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
# Append base filename to compiled entry for context to model # Append base filename to compiled entry for context to model
@@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries):
compiled=compiled_entry, compiled=compiled_entry,
raw=parsed_entry, raw=parsed_entry,
heading=f"{prefix}{heading}", heading=f"{prefix}{heading}",
file=f"{entry_filename}", file=entry_filename,
) )
) )

View File

@@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries):
compiled += f"\n {parsed_entry.body}" compiled += f"\n {parsed_entry.body}"
# Add the sub-entry contents to the entry # Add the sub-entry contents to the entry
entry_compiled += f"{compiled}" entry_compiled += compiled
entry_raw += f"{parsed_entry}" entry_raw += f"{parsed_entry}"
if not entry_heading: if not entry_heading:
entry_heading = heading entry_heading = heading
@@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries):
Entry( Entry(
compiled=entry_compiled, compiled=entry_compiled,
raw=entry_raw, raw=entry_raw,
heading=f"{entry_heading}", heading=entry_heading,
file=f"{entry_to_file_map[parsed_entry]}", file=entry_to_file_map[parsed_entry],
) )
) )