Do not wrap filepath in Path to fix indexing markdown files on Windows (#993)

### Issue
- Paths with / are converted to \\ on Windows when wrapped in `Path'.
- The `markdown_to_entries' module was trying to normalize file paths with `Path' for some reason.
  This would store the file paths in DB Entry differently than the file to entries map if Khoj ran on Windows.
  That'd result in a KeyError when trying to look up the entry file path from `file_to_text_map' in the `text_to_entries:update_embeddings()' function.

### Fix
- Removing the unnecessary OS-dependent Path normalization in `markdown_to_entries' should keep the file path storage consistent across the `file_to_text_map' var, `FileObjectAdaptor' and `Entry' DB tables on Windows for Markdown files as well.

This issue will affect users hosting the Khoj server on Windows and attempting to index markdown files.

Resolves #984
This commit is contained in:
Debanjum
2024-12-02 01:02:58 -08:00
committed by GitHub
2 changed files with 9 additions and 9 deletions

View File

@@ -3,7 +3,7 @@ import re
from pathlib import Path
from typing import Dict, List, Tuple
import urllib3
import urllib3.util
from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser
@@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries):
return num_new_embeddings, num_deleted_embeddings
@staticmethod
def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
"Extract entries by heading from specified Markdown files"
entries: List[str] = []
entry_to_file_map: List[Tuple[str, str]] = []
file_to_text_map = dict()
file_to_text_map: Dict[str, str] = dict()
for markdown_file in markdown_files:
try:
markdown_content = markdown_files[markdown_file]
@@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries):
return entries, entry_to_file_map
@staticmethod
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
"Convert each Markdown entries into a dictionary"
entries: List[Entry] = []
for parsed_entry in parsed_entries:
@@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries):
# Escape the URL to avoid issues with special characters
entry_filename = urllib3.util.parse_url(raw_filename).url
else:
entry_filename = str(Path(raw_filename))
entry_filename = raw_filename
heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
# Append base filename to compiled entry for context to model
@@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries):
compiled=compiled_entry,
raw=parsed_entry,
heading=f"{prefix}{heading}",
file=f"{entry_filename}",
file=entry_filename,
)
)

View File

@@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries):
compiled += f"\n {parsed_entry.body}"
# Add the sub-entry contents to the entry
entry_compiled += f"{compiled}"
entry_compiled += compiled
entry_raw += f"{parsed_entry}"
if not entry_heading:
entry_heading = heading
@@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries):
Entry(
compiled=entry_compiled,
raw=entry_raw,
heading=f"{entry_heading}",
file=f"{entry_to_file_map[parsed_entry]}",
heading=entry_heading,
file=entry_to_file_map[parsed_entry],
)
)