From 5010623a0acd11d2c45825184dcdabf3c28c2149 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 3 Jul 2025 18:34:34 -0700 Subject: [PATCH] Deep link to markdown entries by line number in uri Use url fragment schema for deep link URIs, borrowing from URL/PDF schemas. E.g file:///path/to/file.txt#line=&#page= Compute line number during (recursive) markdown entry chunking. Test line number in URI maps to line number of chunk in actual md file. This deeplink URI with line number is passed to llm as context to better combine with line range based view file tool. Grep tool already passed matching line number. This change passes line number in URIs of markdown entries matched by the semantic search tool. --- .../content/markdown/markdown_to_entries.py | 47 ++++++++++++--- tests/data/markdown/main_readme.md | 39 +++++++++++++ tests/test_markdown_to_entries.py | 58 +++++++++++++++++++ 3 files changed, 135 insertions(+), 9 deletions(-) create mode 100644 tests/data/markdown/main_readme.md diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index 8d1fbbf4..43b10431 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -54,13 +54,13 @@ class MarkdownToEntries(TextToEntries): def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]: "Extract entries by heading from specified Markdown files" entries: List[str] = [] - entry_to_file_map: List[Tuple[str, str]] = [] + entry_to_file_map: List[Tuple[str, str, int]] = [] file_to_text_map: Dict[str, str] = dict() for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( - markdown_content, markdown_file, entries, entry_to_file_map, max_tokens + markdown_content, markdown_file, entries, entry_to_file_map, 
max_tokens, start_line=1 ) file_to_text_map[markdown_file] = markdown_content except Exception as e: @@ -68,17 +68,18 @@ class MarkdownToEntries(TextToEntries): f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True ) - return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map)) + return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map) @staticmethod def process_single_markdown_file( markdown_content: str, markdown_file: str, entries: List[str], - entry_to_file_map: List[Tuple[str, str]], + entry_to_file_map: List[Tuple[str, str, int]], max_tokens=256, ancestry: Dict[int, str] = {}, - ) -> Tuple[List[str], List[Tuple[str, str]]]: + start_line: int = 1, + ) -> Tuple[List[str], List[Tuple[str, str, int]]]: # Prepend the markdown section's heading ancestry ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())]) markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}" @@ -87,7 +88,9 @@ class MarkdownToEntries(TextToEntries): if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search( rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE ): - entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)] + # Create entry with line number information + entry_with_line_info = (markdown_content_with_ancestry, markdown_file, start_line) + entry_to_file_map += [entry_with_line_info] entries.extend([markdown_content_with_ancestry]) return entries, entry_to_file_map @@ -98,22 +101,32 @@ class MarkdownToEntries(TextToEntries): next_heading_level += 1 sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE) + # Recurse down each non-empty section after parsing its body, heading and ancestry + current_line_offset = 0 for section in sections: + num_lines_in_section = 
section.count("\n") # Skip empty sections if section.strip() == "": + current_line_offset += num_lines_in_section continue + section_start_line_in_file = start_line + current_line_offset + # Extract the section body and (when present) the heading current_ancestry = ancestry.copy() first_line = [line for line in section.split("\n") if line.strip() != ""][0] if re.search(rf"^#{{{next_heading_level}}} ", first_line): # Extract the section body without the heading - current_section_body = "\n".join(section.split(first_line)[1:]) + current_section_heading, current_section_body = section.split(first_line, 1) + current_section_body_offset = current_section_heading.count("\n") # Parse the section heading into current section ancestry current_section_title = first_line[next_heading_level:].strip() current_ancestry[next_heading_level] = current_section_title + # Line number should point to the heading itself + recursive_start_line = section_start_line_in_file + current_section_body_offset else: current_section_body = section + recursive_start_line = section_start_line_in_file # Recurse down children of the current entry MarkdownToEntries.process_single_markdown_file( @@ -123,23 +136,38 @@ class MarkdownToEntries(TextToEntries): entry_to_file_map, max_tokens, current_ancestry, + start_line=recursive_start_line, ) + current_line_offset += num_lines_in_section return entries, entry_to_file_map @staticmethod - def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]: + def convert_markdown_entries_to_maps( + parsed_entries: List[str], entry_to_file_map: List[Tuple[str, str, int]] + ) -> List[Entry]: "Convert each Markdown entries into a dictionary" entries: List[Entry] = [] + + # Create a mapping from parsed entry to file info + entry_map: Dict[str, Tuple[str, int]] = {} + for entry_info in entry_to_file_map: + entry_content, raw_filename, start_line = entry_info + entry_map[entry_content] = (raw_filename, start_line) + 
for parsed_entry in parsed_entries: - raw_filename = entry_to_file_map[parsed_entry] + raw_filename, start_line = entry_map[parsed_entry] + calculated_line = start_line if start_line > 0 else 1 # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path. if type(raw_filename) == str and re.search(r"^https?://", raw_filename): # Escape the URL to avoid issues with special characters entry_filename = urllib3.util.parse_url(raw_filename).url + uri = entry_filename else: entry_filename = raw_filename + # Create URI with line number + uri = f"file://{entry_filename}#line={calculated_line}" heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model @@ -152,6 +180,7 @@ class MarkdownToEntries(TextToEntries): raw=parsed_entry, heading=f"{prefix}{heading}", file=entry_filename, + uri=uri, ) ) diff --git a/tests/data/markdown/main_readme.md b/tests/data/markdown/main_readme.md new file mode 100644 index 00000000..5eb7c7c9 --- /dev/null +++ b/tests/data/markdown/main_readme.md @@ -0,0 +1,39 @@ +# Main Readme +> Allow natural language search, chat with your documents using transformer based models + +This is a test markdown file with multiple, nested child entries. 
+ +## Dependencies + +- Python3 +- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) + +## Installation + +```bash +pip install khoj +``` + +## Run + Load ML model, generate embeddings and expose API to query specified org-mode files + + ```shell + python3 main.py --input-files ~/Notes/Schedule.org ~/Notes/Incoming.org --verbose + ``` + +## Use + +### **Khoj via API** +- Query: `GET` [http://localhost:42110/api/search?q="What is the meaning of life"](http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22) +- Update Index: `GET` [http://localhost:42110/api/update](http://localhost:42110/api/update) +- [Khoj API Docs](http://localhost:42110/docs) + +### *Khoj via Web* + +- Open browser to http://localhost:42110 +- Enter query in search box + +## Acknowledgments + +- [MiniLM Model](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) for Asymmetric Text Search. See [SBert Documentation](https://www.sbert.net/examples/applications/retrieve_rerank/README.html) +- [OpenAI CLIP Model](https://github.com/openai/CLIP) for Image Search. See [SBert Documentation](https://www.sbert.net/examples/applications/image-search/README.html) diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 22f94ef5..30813555 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -1,4 +1,5 @@ import os +import re from pathlib import Path from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries @@ -248,6 +249,58 @@ def test_get_markdown_files(tmp_path): assert set(extracted_org_files.keys()) == expected_files +def test_line_number_tracking_in_recursive_split(): + "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
+ # Arrange + markdown_file_path = os.path.abspath("tests/data/markdown/main_readme.md") + + with open(markdown_file_path, "r") as f: + markdown_content = f.read() + lines = markdown_content.splitlines() + data = {markdown_file_path: markdown_content} + + # Act + # Using a small max_tokens to force recursive splitting + _, entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=10) + + # Assert + assert len(entries) > 0, "No entries were extracted." + + for entry in entries: + # Extract file path and line number from the entry URI + # for files uri is expected in format: file:///path/to/file.md#line=5 + match = re.search(r"file://(.*?)#line=(\d+)", entry.uri) + filepath_from_uri = match.group(1) + line_number_from_uri = int(match.group(2)) + + # line_number is 1-based, list index is 0-based + line_in_file = clean(lines[line_number_from_uri - 1]) + next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else "" + + # Remove ancestor heading lines inserted during post-processing + first_entry_line = "" + for line in entry.raw.splitlines(): + if line.startswith("#"): + first_entry_line = line + else: + break # Stop at the first non-heading line + # Remove heading prefix from entry.compiled as level changed during post-processing + cleaned_first_entry_line = first_entry_line.strip() + # Remove multiple consecutive spaces + cleaned_first_entry_line = clean(cleaned_first_entry_line) + + assert entry.uri is not None, f"Entry '{entry}' has a None URI." 
+ assert match is not None, f"URI format is incorrect: {entry.uri}" + assert ( + filepath_from_uri == markdown_file_path + ), f"File path in URI '{filepath_from_uri}' does not match expected '{markdown_file_path}'" + + # Ensure the first non-heading line in the compiled entry matches the line in the file + assert ( + cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip() + ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'" + + # Helper Functions def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename @@ -255,3 +308,8 @@ def create_file(tmp_path: Path, entry=None, filename="test.md"): if entry: markdown_file.write_text(entry) return markdown_file + + +def clean(text): + "Normalize spaces in text for easier comparison." + return re.sub(r"\s+", " ", text)