diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index 8d1fbbf4..43b10431 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -54,13 +54,13 @@ class MarkdownToEntries(TextToEntries): def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]: "Extract entries by heading from specified Markdown files" entries: List[str] = [] - entry_to_file_map: List[Tuple[str, str]] = [] + entry_to_file_map: List[Tuple[str, str, int]] = [] file_to_text_map: Dict[str, str] = dict() for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( - markdown_content, markdown_file, entries, entry_to_file_map, max_tokens + markdown_content, markdown_file, entries, entry_to_file_map, max_tokens, start_line=1 ) file_to_text_map[markdown_file] = markdown_content except Exception as e: @@ -68,17 +68,18 @@ class MarkdownToEntries(TextToEntries): f"Unable to process file: {markdown_file}. 
This file will not be indexed.\n{e}", exc_info=True ) - return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map)) + return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map) @staticmethod def process_single_markdown_file( markdown_content: str, markdown_file: str, entries: List[str], - entry_to_file_map: List[Tuple[str, str]], + entry_to_file_map: List[Tuple[str, str, int]], max_tokens=256, ancestry: Dict[int, str] = {}, - ) -> Tuple[List[str], List[Tuple[str, str]]]: + start_line: int = 1, + ) -> Tuple[List[str], List[Tuple[str, str, int]]]: # Prepend the markdown section's heading ancestry ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())]) markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}" @@ -87,7 +88,9 @@ class MarkdownToEntries(TextToEntries): if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search( rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE ): - entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)] + # Create entry with line number information + entry_with_line_info = (markdown_content_with_ancestry, markdown_file, start_line) + entry_to_file_map += [entry_with_line_info] entries.extend([markdown_content_with_ancestry]) return entries, entry_to_file_map @@ -98,22 +101,32 @@ class MarkdownToEntries(TextToEntries): next_heading_level += 1 sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE) + # Recurse down each non-empty section after parsing its body, heading and ancestry + current_line_offset = 0 for section in sections: + num_lines_in_section = section.count("\n") # Skip empty sections if section.strip() == "": + current_line_offset += num_lines_in_section continue + section_start_line_in_file = start_line + current_line_offset + # Extract the section body and 
(when present) the heading current_ancestry = ancestry.copy() first_line = [line for line in section.split("\n") if line.strip() != ""][0] if re.search(rf"^#{{{next_heading_level}}} ", first_line): # Extract the section body without the heading - current_section_body = "\n".join(section.split(first_line)[1:]) + current_section_heading, current_section_body = section.split(first_line, 1) + current_section_body_offset = current_section_heading.count("\n") # Parse the section heading into current section ancestry current_section_title = first_line[next_heading_level:].strip() current_ancestry[next_heading_level] = current_section_title + # Line number should point to the heading itself + recursive_start_line = section_start_line_in_file + current_section_body_offset else: current_section_body = section + recursive_start_line = section_start_line_in_file # Recurse down children of the current entry MarkdownToEntries.process_single_markdown_file( @@ -123,23 +136,38 @@ class MarkdownToEntries(TextToEntries): entry_to_file_map, max_tokens, current_ancestry, + start_line=recursive_start_line, ) + current_line_offset += num_lines_in_section return entries, entry_to_file_map @staticmethod - def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]: + def convert_markdown_entries_to_maps( + parsed_entries: List[str], entry_to_file_map: List[Tuple[str, str, int]] + ) -> List[Entry]: "Convert each Markdown entries into a dictionary" entries: List[Entry] = [] + + # Create a mapping from parsed entry to file info + entry_map: Dict[str, Tuple[str, int]] = {} + for entry_info in entry_to_file_map: + entry_content, raw_filename, start_line = entry_info + entry_map[entry_content] = (raw_filename, start_line) + for parsed_entry in parsed_entries: - raw_filename = entry_to_file_map[parsed_entry] + raw_filename, start_line = entry_map[parsed_entry] + calculated_line = start_line if start_line > 0 else 1 # Check if raw_filename is a 
URL. If so, save it as is. If not, convert it to a Path. if type(raw_filename) == str and re.search(r"^https?://", raw_filename): # Escape the URL to avoid issues with special characters entry_filename = urllib3.util.parse_url(raw_filename).url + uri = entry_filename else: entry_filename = raw_filename + # Create URI with line number + uri = f"file://{entry_filename}#line={calculated_line}" heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model @@ -152,6 +180,7 @@ class MarkdownToEntries(TextToEntries): raw=parsed_entry, heading=f"{prefix}{heading}", file=entry_filename, + uri=uri, ) ) diff --git a/tests/data/markdown/main_readme.md b/tests/data/markdown/main_readme.md new file mode 100644 index 00000000..5eb7c7c9 --- /dev/null +++ b/tests/data/markdown/main_readme.md @@ -0,0 +1,39 @@ +# Main Readme +> Allow natural language search, chat with your documents using transformer based models + +This is a test markdown file with multiple, nested child entries. 
+ +## Dependencies + +- Python3 +- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) + +## Installation + +```bash +pip install khoj +``` + +## Run + Load ML model, generate embeddings and expose API to query specified org-mode files + + ```shell + python3 main.py --input-files ~/Notes/Schedule.org ~/Notes/Incoming.org --verbose + ``` + +## Use + +### **Khoj via API** +- Query: `GET` [http://localhost:42110/api/search?q="What is the meaning of life"](http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22) +- Update Index: `GET` [http://localhost:42110/api/update](http://localhost:42110/api/update) +- [Khoj API Docs](http://localhost:42110/docs) + +### *Khoj via Web* + +- Open browser to http://localhost:42110 +- Enter query in search box + +## Acknowledgments + +- [MiniLM Model](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) for Asymmetric Text Search. See (SBert Documentation)[https://www.sbert.net/examples/applications/retrieve_rerank/README.html] +- [OpenAI CLIP Model](https://github.com/openai/CLIP) for Image Search. See [SBert Documentation](https://www.sbert.net/examples/applications/image-search/README.html) diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 22f94ef5..30813555 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -1,4 +1,5 @@ import os +import re from pathlib import Path from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries @@ -248,6 +249,58 @@ def test_get_markdown_files(tmp_path): assert set(extracted_org_files.keys()) == expected_files +def test_line_number_tracking_in_recursive_split(): + "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file." 
+    # Arrange
+    markdown_file_path = os.path.abspath("tests/data/markdown/main_readme.md")
+
+    with open(markdown_file_path, "r") as f:
+        markdown_content = f.read()
+    lines = markdown_content.splitlines()
+    data = {markdown_file_path: markdown_content}
+
+    # Act
+    # Using a small max_tokens to force recursive splitting
+    _, entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=10)
+
+    # Assert
+    assert len(entries) > 0, "No entries were extracted."
+
+    for entry in entries:
+        # Extract file path and line number from the entry URI
+        # for files uri is expected in format: file:///path/to/file.md#line=5
+        assert entry.uri is not None, f"Entry '{entry}' has a None URI."
+        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
+        assert match is not None, f"URI format is incorrect: {entry.uri}"
+        filepath_from_uri = match.group(1)
+        line_number_from_uri = int(match.group(2))
+
+        # line_number is 1-based, list index is 0-based
+        line_in_file = clean(lines[line_number_from_uri - 1])
+        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""
+
+        # Remove ancestor heading lines inserted during post-processing
+        first_entry_line = ""
+        for line in entry.raw.splitlines():
+            if line.startswith("#"):
+                first_entry_line = line
+            else:
+                break  # Stop at the first non-heading line
+        # Remove heading prefix from entry.compiled as level changed during post-processing
+        # Remove multiple consecutive spaces
+        cleaned_first_entry_line = clean(first_entry_line.strip())
+
+ assert match is not None, f"URI format is incorrect: {entry.uri}" + assert ( + filepath_from_uri == markdown_file_path + ), f"File path in URI '{filepath_from_uri}' does not match expected '{markdown_file_path}'" + + # Ensure the first non-heading line in the compiled entry matches the line in the file + assert ( + cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip() + ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'" + + # Helper Functions def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename @@ -255,3 +308,8 @@ def create_file(tmp_path: Path, entry=None, filename="test.md"): if entry: markdown_file.write_text(entry) return markdown_file + + +def clean(text): + "Normalize spaces in text for easier comparison." + return re.sub(r"\s+", " ", text)