mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 05:39:11 +00:00
Do not wrap filepath in Path to fix indexing markdown files on Windows (#993)
### Issue - Paths with / are converted to \\ on Windows using the `Path` operator. - The `markdown_to_entries` module was trying to normalize file paths with `Path` for some reason. This would store the file paths in the DB Entry differently than in the file to entries map if Khoj ran on Windows. That would result in a KeyError when trying to look up the entry file path from `file_to_text_map` in the `text_to_entries:update_embeddings()` function. ### Fix - Removing the unnecessary OS-dependent Path normalization in `markdown_to_entries` should keep the file path storage consistent across the `file_to_text_map` var, `FileObjectAdaptor`, and `Entry` DB tables on Windows for Markdown files as well. This issue will affect users hosting the Khoj server on Windows and attempting to index markdown files. Resolves #984
This commit is contained in:
@@ -3,7 +3,7 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import urllib3
|
||||
import urllib3.util
|
||||
|
||||
from khoj.database.models import Entry as DbEntry
|
||||
from khoj.database.models import KhojUser
|
||||
@@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries):
|
||||
return num_new_embeddings, num_deleted_embeddings
|
||||
|
||||
@staticmethod
|
||||
def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
|
||||
def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
|
||||
"Extract entries by heading from specified Markdown files"
|
||||
entries: List[str] = []
|
||||
entry_to_file_map: List[Tuple[str, str]] = []
|
||||
file_to_text_map = dict()
|
||||
file_to_text_map: Dict[str, str] = dict()
|
||||
for markdown_file in markdown_files:
|
||||
try:
|
||||
markdown_content = markdown_files[markdown_file]
|
||||
@@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries):
|
||||
return entries, entry_to_file_map
|
||||
|
||||
@staticmethod
|
||||
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
|
||||
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
|
||||
"Convert each Markdown entries into a dictionary"
|
||||
entries: List[Entry] = []
|
||||
for parsed_entry in parsed_entries:
|
||||
@@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries):
|
||||
# Escape the URL to avoid issues with special characters
|
||||
entry_filename = urllib3.util.parse_url(raw_filename).url
|
||||
else:
|
||||
entry_filename = str(Path(raw_filename))
|
||||
entry_filename = raw_filename
|
||||
|
||||
heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
|
||||
# Append base filename to compiled entry for context to model
|
||||
@@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries):
|
||||
compiled=compiled_entry,
|
||||
raw=parsed_entry,
|
||||
heading=f"{prefix}{heading}",
|
||||
file=f"{entry_filename}",
|
||||
file=entry_filename,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries):
|
||||
compiled += f"\n {parsed_entry.body}"
|
||||
|
||||
# Add the sub-entry contents to the entry
|
||||
entry_compiled += f"{compiled}"
|
||||
entry_compiled += compiled
|
||||
entry_raw += f"{parsed_entry}"
|
||||
if not entry_heading:
|
||||
entry_heading = heading
|
||||
@@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries):
|
||||
Entry(
|
||||
compiled=entry_compiled,
|
||||
raw=entry_raw,
|
||||
heading=f"{entry_heading}",
|
||||
file=f"{entry_to_file_map[parsed_entry]}",
|
||||
heading=entry_heading,
|
||||
file=entry_to_file_map[parsed_entry],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user