From dffdd8134514d417121aa589e87f66d8d3236277 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 1 Dec 2024 22:37:57 -0800 Subject: [PATCH 1/2] Do not wrap filepath in Path to fix indexing markdown files on Windows Issue - Paths with / are converted to \\ on Windows using the Path operator. - The markdown to entries method for some reason was doing this. This would store the file paths in DB entry differently than the file to entries map. Resulting in a KeyError when trying to look up the entry file path from file_to_text_map in the text_to_entries:update_embeddings() function. Fix - Removing the unnecessary OS dependent Path normalization in markdown_to_entries should keep the file path storage consistent across file_to_text_map var, FileObjectAdaptor, Entry DB tables on Windows for Markdown files as well This issue would only affect users hosting Khoj server on Windows and attempting to index markdown files. Resolves #984 --- src/khoj/processor/content/markdown/markdown_to_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index c4ee03ef..c7ed690c 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -139,7 +139,7 @@ class MarkdownToEntries(TextToEntries): # Escape the URL to avoid issues with special characters entry_filename = urllib3.util.parse_url(raw_filename).url else: - entry_filename = str(Path(raw_filename)) + entry_filename = raw_filename heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model From 47c926b0ff103cf570307c513c7292a3107e989a Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 1 Dec 2024 23:02:52 -0800 Subject: [PATCH 2/2] Add more typing to org|md_to_entries. 
Remove redundant f-string wraps - Add type hints to improve maintainability of stabilized indexing code - It shouldn't be necessary to wrap string variables in an f-string This change aims to improve code quality. It should not affect functionality. --- .../processor/content/markdown/markdown_to_entries.py | 10 +++++----- src/khoj/processor/content/org_mode/org_to_entries.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index c7ed690c..8d1fbbf4 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -3,7 +3,7 @@ import re from pathlib import Path from typing import Dict, List, Tuple -import urllib3 +import urllib3.util from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser @@ -51,11 +51,11 @@ class MarkdownToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]: + def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]: "Extract entries by heading from specified Markdown files" entries: List[str] = [] entry_to_file_map: List[Tuple[str, str]] = [] - file_to_text_map = dict() + file_to_text_map: Dict[str, str] = dict() for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] @@ -128,7 +128,7 @@ class MarkdownToEntries(TextToEntries): return entries, entry_to_file_map @staticmethod - def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: + def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]: "Convert each Markdown entries into a dictionary" entries: List[Entry] = [] for parsed_entry in 
parsed_entries: @@ -151,7 +151,7 @@ class MarkdownToEntries(TextToEntries): compiled=compiled_entry, raw=parsed_entry, heading=f"{prefix}{heading}", - file=f"{entry_filename}", + file=entry_filename, ) ) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index cfc17cc0..4b9fe3ae 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -208,7 +208,7 @@ class OrgToEntries(TextToEntries): compiled += f"\n {parsed_entry.body}" # Add the sub-entry contents to the entry - entry_compiled += f"{compiled}" + entry_compiled += compiled entry_raw += f"{parsed_entry}" if not entry_heading: entry_heading = heading @@ -218,8 +218,8 @@ class OrgToEntries(TextToEntries): Entry( compiled=entry_compiled, raw=entry_raw, - heading=f"{entry_heading}", - file=f"{entry_to_file_map[parsed_entry]}", + heading=entry_heading, + file=entry_to_file_map[parsed_entry], ) )