mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Do not wrap filepath in Path to fix indexing markdown files on Windows (#993)
### Issue - Paths with `/` are converted to `\\` on Windows by the `Path` operator. - The `markdown_to_entries` module was trying to normalize file paths with `Path` for some reason. This would store the file paths in DB Entry differently than the file to entries map if Khoj ran on Windows. That'd result in a KeyError when trying to look up the entry file path from `file_to_text_map` in the `text_to_entries:update_embeddings()` function. ### Fix - Removing the unnecessary OS dependent Path normalization in `markdown_to_entries` should keep the file path storage consistent across `file_to_text_map` var, `FileObjectAdaptor`, `Entry` DB tables on Windows for Markdown files as well. This issue will affect users hosting Khoj server on Windows and attempting to index markdown files. Resolves #984
This commit is contained in:
@@ -3,7 +3,7 @@ import re
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
import urllib3
|
import urllib3.util
|
||||||
|
|
||||||
from khoj.database.models import Entry as DbEntry
|
from khoj.database.models import Entry as DbEntry
|
||||||
from khoj.database.models import KhojUser
|
from khoj.database.models import KhojUser
|
||||||
@@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
return num_new_embeddings, num_deleted_embeddings
|
return num_new_embeddings, num_deleted_embeddings
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
|
def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
|
||||||
"Extract entries by heading from specified Markdown files"
|
"Extract entries by heading from specified Markdown files"
|
||||||
entries: List[str] = []
|
entries: List[str] = []
|
||||||
entry_to_file_map: List[Tuple[str, str]] = []
|
entry_to_file_map: List[Tuple[str, str]] = []
|
||||||
file_to_text_map = dict()
|
file_to_text_map: Dict[str, str] = dict()
|
||||||
for markdown_file in markdown_files:
|
for markdown_file in markdown_files:
|
||||||
try:
|
try:
|
||||||
markdown_content = markdown_files[markdown_file]
|
markdown_content = markdown_files[markdown_file]
|
||||||
@@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
return entries, entry_to_file_map
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
|
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
|
||||||
"Convert each Markdown entries into a dictionary"
|
"Convert each Markdown entries into a dictionary"
|
||||||
entries: List[Entry] = []
|
entries: List[Entry] = []
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
@@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
# Escape the URL to avoid issues with special characters
|
# Escape the URL to avoid issues with special characters
|
||||||
entry_filename = urllib3.util.parse_url(raw_filename).url
|
entry_filename = urllib3.util.parse_url(raw_filename).url
|
||||||
else:
|
else:
|
||||||
entry_filename = str(Path(raw_filename))
|
entry_filename = raw_filename
|
||||||
|
|
||||||
heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
|
heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
|
||||||
# Append base filename to compiled entry for context to model
|
# Append base filename to compiled entry for context to model
|
||||||
@@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
compiled=compiled_entry,
|
compiled=compiled_entry,
|
||||||
raw=parsed_entry,
|
raw=parsed_entry,
|
||||||
heading=f"{prefix}{heading}",
|
heading=f"{prefix}{heading}",
|
||||||
file=f"{entry_filename}",
|
file=entry_filename,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries):
|
|||||||
compiled += f"\n {parsed_entry.body}"
|
compiled += f"\n {parsed_entry.body}"
|
||||||
|
|
||||||
# Add the sub-entry contents to the entry
|
# Add the sub-entry contents to the entry
|
||||||
entry_compiled += f"{compiled}"
|
entry_compiled += compiled
|
||||||
entry_raw += f"{parsed_entry}"
|
entry_raw += f"{parsed_entry}"
|
||||||
if not entry_heading:
|
if not entry_heading:
|
||||||
entry_heading = heading
|
entry_heading = heading
|
||||||
@@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries):
|
|||||||
Entry(
|
Entry(
|
||||||
compiled=entry_compiled,
|
compiled=entry_compiled,
|
||||||
raw=entry_raw,
|
raw=entry_raw,
|
||||||
heading=f"{entry_heading}",
|
heading=entry_heading,
|
||||||
file=f"{entry_to_file_map[parsed_entry]}",
|
file=entry_to_file_map[parsed_entry],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user