From 5c4d41d30080a6c4506a86354881b0f57dc32292 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 23 Jun 2025 15:42:58 -0700
Subject: [PATCH 1/5] Reduce structural changes to indexed raw org mode entries

Reducing structural changes to raw entries allows better deep-linking
and re-annotation. Deep-linking is currently done via the line number
in the new uri field.

Only add a properties drawer to a raw entry if the entry has properties.
Previously, LINE and SOURCE properties were inserted into raw entries.
This isn't done anymore. LINE and SOURCE are deprecated for use in khoj.el.
---
 .../processor/content/org_mode/orgnode.py     | 21 +++++++------------
 tests/test_org_to_entries.py                  |  6 +-----
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/src/khoj/processor/content/org_mode/orgnode.py b/src/khoj/processor/content/org_mode/orgnode.py
index f81c6e1b..e190e17a 100644
--- a/src/khoj/processor/content/org_mode/orgnode.py
+++ b/src/khoj/processor/content/org_mode/orgnode.py
@@ -66,7 +66,7 @@ def makelist(file, filename) -> List["Orgnode"]:
     ctr = 0
 
     if type(file) == str:
-        f = file.split("\n")
+        f = file.splitlines()
     else:
         f = file
 
@@ -121,7 +121,7 @@ def makelist(file, filename) -> List["Orgnode"]:
             heading = heading_search.group(2)
             bodytext = ""
             tags = list()  # set of all tags in headline
-            tag_search = re.search(r"(.*?)\s*:([a-zA-Z0-9].*?):$", heading)
+            tag_search = re.search(r"(.*?)\s+:([a-zA-Z0-9@_].*?):\s*$", heading)
             if tag_search:
                 heading = tag_search.group(1)
                 parsedtags = tag_search.group(2)
@@ -260,14 +260,6 @@ def makelist(file, filename) -> List["Orgnode"]:
         # Prefix filepath/title to ancestors
         n.ancestors = [file_title] + n.ancestors
 
-        # Set SOURCE property to a file+heading based org-mode link to the entry
-        if n.level == 0:
-            n.properties["LINE"] = f"file:{normalize_filename(filename)}::0"
-            n.properties["SOURCE"] = f"[[file:{normalize_filename(filename)}]]"
-        else:
-            escaped_heading = n.heading.replace("[", "\\[").replace("]", "\\]")
-            n.properties["SOURCE"] = f"[[file:{normalize_filename(filename)}::*{escaped_heading}]]"
-
     return nodelist
 
 
@@ -520,10 +512,11 @@ class Orgnode(object):
             n = n + "\n"
 
         # Output Property Drawer
-        n = n + indent + ":PROPERTIES:\n"
-        for key, value in self._properties.items():
-            n = n + indent + f":{key}: {value}\n"
-        n = n + indent + ":END:\n"
+        if self._properties:
+            n = n + indent + ":PROPERTIES:\n"
+            for key, value in self._properties.items():
+                n = n + indent + f":{key}: {value}\n"
+            n = n + indent + ":END:\n"
 
         # Output Body
         if self.hasBody:
diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py
index a84fe6e8..5c11a6fd 100644
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -147,12 +147,10 @@ body line 1.1
     # Extract Entries from specified Org files
     extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=12)
     assert len(extracted_entries) == 2
-    for entry in extracted_entries[1]:
-        entry.raw = clean(entry.raw)
 
     # Assert
     assert len(extracted_entries[1]) == 1
-    assert entry.raw == expected_entry
+    assert extracted_entries[1][-1].raw == expected_entry
 
 
 def test_parse_org_entry_with_children_as_single_entry_if_small(tmp_path):
@@ -388,8 +386,6 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     # Extract Entries from specified Org files
     entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True, max_tokens=3)
     assert len(entries) == 2
-    for entry in entries[1]:
-        entry.raw = clean(f"{entry.raw}")
 
     # Assert
     assert len(entries[1]) == 2

From 820b4523fd058a7b7fea4ce996f694ca40835239 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Sun, 29 Jun 2025 15:06:08 -0700
Subject: [PATCH 2/5] Show raw rather than compiled entry to llm and users

Only embedding models see and operate on compiled text.

LLMs should see raw entry to improve combining it with other document
traversal tools for better regex and line matching.

Users see raw entry for better matching with their actual notes.
---
 src/khoj/routers/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index 5f90c2be..9877233e 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -1263,7 +1263,7 @@ async def search_documents(
         compiled_references = [
             {
                 "query": item.additional["query"],
-                "compiled": item.additional["compiled"],
+                "compiled": item["entry"],
                 "file": item.additional["file"],
             }
             for item in search_results

From e90ab5341ac5c9a7e9a559b5a57478a24ec6a7b3 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 23 Jun 2025 19:04:06 -0700
Subject: [PATCH 3/5] Add context uri field to deeplink line number in original
 doc

---
 src/khoj/database/models/__init__.py          |  1 +
 src/khoj/processor/content/text_to_entries.py |  3 +++
 src/khoj/processor/conversation/utils.py      |  2 +-
 src/khoj/routers/helpers.py                   | 17 ++++++++++-------
 src/khoj/search_type/text_search.py           |  2 ++
 src/khoj/utils/rawconfig.py                   | 11 +++++++++++
 6 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 4a952de4..1a10f962 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -23,6 +23,7 @@ logger = logging.getLogger(__name__)
 class Context(PydanticBaseModel):
     compiled: str
     file: str
+    uri: str
     query: Optional[str] = None
 
 
diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py
index 8e0b3322..bac55aa4 100644
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -99,6 +99,7 @@ class TextToEntries(ABC):
                 entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
                 entry.heading = TextToEntries.clean_field(entry.heading)
                 entry.file = TextToEntries.clean_field(entry.file)
+                entry_uri = TextToEntries.clean_field(entry_uri)
 
                 chunked_entries.append(
                     Entry(
@@ -107,6 +108,7 @@ class TextToEntries(ABC):
                         heading=entry.heading,
                         file=entry.file,
                         corpus_id=corpus_id,
+                        uri=entry_uri,
                     )
                 )
 
@@ -192,6 +194,7 @@ class TextToEntries(ABC):
                             file_type=file_type,
                             hashed_value=entry_hash,
                             corpus_id=entry.corpus_id,
+                            url=entry.uri,
                             search_model=model,
                             file_object=file_object,
                         )
diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index f4deb8d9..58b36baa 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -646,7 +646,7 @@ def generate_chatml_messages_with_context(
 
         if not is_none_or_empty(chat.context):
             references = "\n\n".join(
-                {f"# File: {item.file}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)}
+                {f"# URI: {item.uri}\n## {item.compiled}\n" for item in chat.context or [] if isinstance(item, dict)}
             )
             message_context += [{"type": "text", "text": f"{prompts.notes_conversation.format(references=references)}"}]
 
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index 9877233e..0643d134 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -1265,6 +1265,7 @@ async def search_documents(
                 "query": item.additional["query"],
                 "compiled": item["entry"],
                 "file": item.additional["file"],
+                "uri": item.additional["uri"],
             }
             for item in search_results
         ]
@@ -2867,6 +2868,7 @@ async def view_file_content(
             {
                 "query": query,
                 "file": path,
+                "uri": path,
                 "compiled": filtered_text,
             }
         ]
@@ -2878,7 +2880,7 @@ async def view_file_content(
         logger.error(error_msg, exc_info=True)
 
         # Return an error result in the expected format
-        yield [{"query": query, "file": path, "compiled": error_msg}]
+        yield [{"query": query, "file": path, "uri": path, "compiled": error_msg}]
 
 
 async def grep_files(
@@ -2982,7 +2984,7 @@ async def grep_files(
             max_results,
         )
         if not line_matches:
-            yield {"query": query, "file": path_prefix, "compiled": "No matches found."}
+            yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "No matches found."}
             return
 
         # Truncate matched lines list if too long
@@ -2991,7 +2993,7 @@ async def grep_files(
                 f"... {len(line_matches) - max_results} more results found. Use stricter regex or path to narrow down results."
             ]
 
-        yield {"query": query, "file": path_prefix or "", "compiled": "\n".join(line_matches)}
+        yield {"query": query, "file": path_prefix, "uri": path_prefix, "compiled": "\n".join(line_matches)}
 
     except Exception as e:
         error_msg = f"Error using grep files tool: {str(e)}"
@@ -3000,6 +3002,7 @@ async def grep_files(
             {
                 "query": _generate_query(0, 0, path_prefix or "", regex_pattern, lines_before, lines_after),
                 "file": path_prefix,
+                "uri": path_prefix,
                 "compiled": error_msg,
             }
         ]
@@ -3032,7 +3035,7 @@ async def list_files(
             file_objects = await FileObjectAdapters.aget_file_objects_by_path_prefix(user, path)
 
         if not file_objects:
-            yield {"query": _generate_query(0, path, pattern), "file": path, "compiled": "No files found."}
+            yield {"query": _generate_query(0, path, pattern), "file": path, "uri": path, "compiled": "No files found."}
             return
 
         # Extract file names from file objects
@@ -3047,7 +3050,7 @@ async def list_files(
 
         query = _generate_query(len(files), path, pattern)
         if not files:
-            yield {"query": query, "file": path, "compiled": "No files found."}
+            yield {"query": query, "file": path, "uri": path, "compiled": "No files found."}
             return
 
         # Truncate the list if it's too long
@@ -3057,9 +3060,9 @@ async def list_files(
                 f"... {len(files) - max_files} more files found. Use glob pattern to narrow down results."
             ]
 
-        yield {"query": query, "file": path, "compiled": "\n- ".join(files)}
+        yield {"query": query, "file": path, "uri": path, "compiled": "\n- ".join(files)}
 
     except Exception as e:
         error_msg = f"Error listing files in {path}: {str(e)}"
         logger.error(error_msg, exc_info=True)
-        yield {"query": query, "file": path, "compiled": error_msg}
+        yield {"query": query, "file": path, "uri": path, "compiled": error_msg}
diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py
index 3fafa44b..b2b3453b 100644
--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -157,6 +157,7 @@ def collate_results(hits, dedupe=True):
                     "additional": {
                         "source": hit.file_source,
                         "file": hit.file_path,
+                        "uri": hit.url,
                         "compiled": hit.compiled,
                         "heading": hit.heading,
                     },
@@ -180,6 +181,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
                     "additional": {
                         "source": hit.additional["source"],
                         "file": hit.additional["file"],
+                        "uri": hit.additional["uri"],
                         "query": hit.additional["query"],
                         "compiled": hit.additional["compiled"],
                         "heading": hit.additional["heading"],
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index e0248a66..e3662db5 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -176,6 +176,7 @@ class Entry:
     compiled: str
     heading: Optional[str]
     file: Optional[str]
+    uri: Optional[str] = None
     corpus_id: str
 
     def __init__(
@@ -184,6 +185,7 @@ class Entry:
         compiled: str = None,
         heading: Optional[str] = None,
         file: Optional[str] = None,
+        uri: Optional[str] = None,
         corpus_id: uuid.UUID = None,
     ):
         self.raw = raw
@@ -191,6 +193,14 @@ class Entry:
         self.heading = heading
         self.file = file
         self.corpus_id = str(corpus_id)
+        if uri:
+            self.uri = uri
+        elif file and (file.startswith("http") or file.startswith("file://")):
+            self.uri = file
+        elif file:
+            self.uri = f"file://{file}"
+        else:
+            self.uri = None
 
     def to_json(self) -> str:
         return json.dumps(self.__dict__, ensure_ascii=False)
@@ -206,4 +216,5 @@ class Entry:
             file=dictionary.get("file", None),
             heading=dictionary.get("heading", None),
             corpus_id=dictionary.get("corpus_id", None),
+            uri=dictionary.get("uri", None),
         )

From dcfa4288c4175c7e63436ace8a43d76fc622bcb6 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 23 Jun 2025 19:07:38 -0700
Subject: [PATCH 4/5] Deep link to org-mode entries. Deep link by line number
 in uri

Use the URL fragment schema for deep link URIs, borrowing from URL/PDF
schemas. E.g. file:///path/to/file.txt#line=<line_no>&page=<page_no>

Compute line number during (recursive) org-mode entry chunking.

Thoroughly test that the line number in the URI maps to the line number
of the chunk in the actual org-mode file.

This deeplink URI with line number is passed to llm as context to
better combine with line range based view file tool.

The grep tool already passes the matching line number. This change passes
the line number in URIs of org entries matched by the semantic search tool.
---
 .../content/org_mode/org_to_entries.py        | 20 ++++++-
 .../processor/content/org_mode/orgnode.py     | 13 +++-
 src/khoj/processor/content/text_to_entries.py | 27 +++++++++
 tests/data/org/main_readme.org                |  4 +-
 tests/test_org_to_entries.py                  | 60 ++++++++++++++++++-
 tests/test_orgnode.py                         | 34 ++---------
 6 files changed, 119 insertions(+), 39 deletions(-)

diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py
index 4b9fe3ae..0dfe7674 100644
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -87,6 +87,7 @@ class OrgToEntries(TextToEntries):
         entry_to_file_map: List[Tuple[Orgnode, str]],
         max_tokens=256,
         ancestry: Dict[int, str] = {},
+        start_line: int = 1,
     ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
         """Parse org_content from org_file into OrgNode entries
 
@@ -104,7 +105,9 @@ class OrgToEntries(TextToEntries):
         if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
             rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
         ):
-            orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file)
+            orgnode_content_with_ancestry = orgnode.makelist(
+                org_content_with_ancestry, org_file, start_line=start_line, ancestry_lines=len(ancestry)
+            )
             entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
             entries.extend([orgnode_content_with_ancestry])
             return entries, entry_to_file_map
@@ -125,24 +128,32 @@ class OrgToEntries(TextToEntries):
             return entries, entry_to_file_map
 
         # Recurse down each non-empty section after parsing its body, heading and ancestry
+        current_line_offset = 0
         for section in sections:
+            num_lines_in_section = section.count("\n")
             # Skip empty sections
             if section.strip() == "":
+                current_line_offset += num_lines_in_section
                 continue
 
+            section_start_line_in_file = start_line + current_line_offset
+
             # Extract the section body and (when present) the heading
             current_ancestry = ancestry.copy()
             first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
             # If first non-empty line is a heading with expected heading level
             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
                 # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
+                current_section_heading, current_section_body = section.split(first_non_empty_line, 1)
+                current_section_body_offset = current_section_heading.count("\n")
                 # Parse the section heading into current section ancestry
                 current_section_title = first_non_empty_line[next_heading_level:].strip()
                 current_ancestry[next_heading_level] = current_section_title
+                recursive_start_line = section_start_line_in_file + current_section_body_offset
             # Else process the section as just body text
             else:
                 current_section_body = section
+                recursive_start_line = section_start_line_in_file
 
             # Recurse down children of the current entry
             OrgToEntries.process_single_org_file(
@@ -152,7 +163,9 @@ class OrgToEntries(TextToEntries):
                 entry_to_file_map,
                 max_tokens,
                 current_ancestry,
+                start_line=recursive_start_line,
             )
+            current_line_offset += num_lines_in_section
 
         return entries, entry_to_file_map
 
@@ -207,6 +220,8 @@ class OrgToEntries(TextToEntries):
                 if parsed_entry.hasBody:
                     compiled += f"\n {parsed_entry.body}"
 
+                uri = parsed_entry.properties.pop("LINE", None)
+
                 # Add the sub-entry contents to the entry
                 entry_compiled += compiled
                 entry_raw += f"{parsed_entry}"
@@ -220,6 +235,7 @@ class OrgToEntries(TextToEntries):
                         raw=entry_raw,
                         heading=entry_heading,
                         file=entry_to_file_map[parsed_entry],
+                        uri=uri,
                     )
                 )
 
diff --git a/src/khoj/processor/content/org_mode/orgnode.py b/src/khoj/processor/content/org_mode/orgnode.py
index e190e17a..34bb54f3 100644
--- a/src/khoj/processor/content/org_mode/orgnode.py
+++ b/src/khoj/processor/content/org_mode/orgnode.py
@@ -58,7 +58,7 @@ def makelist_with_filepath(filename):
     return makelist(f, filename)
 
 
-def makelist(file, filename) -> List["Orgnode"]:
+def makelist(file, filename, start_line: int = 1, ancestry_lines: int = 0) -> List["Orgnode"]:
     """
     Read an org-mode file and return a list of Orgnode objects
     created from this file.
@@ -114,7 +114,16 @@ def makelist(file, filename) -> List["Orgnode"]:
                     logbook = list()
                 thisNode.properties = property_map
                 nodelist.append(thisNode)
-            property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
+            # Account for ancestry lines that were prepended when calculating line numbers
+            if ancestry_lines > 0:
+                calculated_line = start_line + ctr - 1 - ancestry_lines
+                if calculated_line <= 0:
+                    calculated_line = 1  # Fallback to line 1 if calculation results in invalid line number
+            else:
+                calculated_line = start_line + ctr - 1
+                if calculated_line <= 0:
+                    calculated_line = ctr  # Use the original behavior if start_line calculation fails
+            property_map = {"LINE": f"file://{normalize_filename(filename)}#line={calculated_line}"}
             previous_level = level
             previous_heading: str = heading
             level = heading_search.group(1)
diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py
index bac55aa4..0ceda11d 100644
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -81,8 +81,35 @@ class TextToEntries(ABC):
             chunked_entry_chunks = text_splitter.split_text(entry.compiled)
             corpus_id = uuid.uuid4()
 
+            line_start = None
+            last_offset = 0
+            if entry.uri and entry.uri.startswith("file://"):
+                if "#line=" in entry.uri:
+                    line_start = int(entry.uri.split("#line=", 1)[-1].split("&", 1)[0])
+                else:
+                    line_start = 0
+
             # Create heading prefixed entry from each chunk
             for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
+                # set line start in uri of chunked entries
+                entry_uri = entry.uri
+                if line_start is not None:
+                    # Find the chunk in the raw text to get an accurate line number.
+                    # Search for the unmodified chunk from the last offset.
+                    searchable_chunk = compiled_entry_chunk.strip()
+                    if searchable_chunk:
+                        chunk_start_pos_in_raw = entry.raw.find(searchable_chunk, last_offset)
+                        if chunk_start_pos_in_raw != -1:
+                            # Found the chunk. Calculate its line offset from the start of the raw text.
+                            line_offset_in_raw = entry.raw[:chunk_start_pos_in_raw].count("\n")
+                            new_line_num = line_start + line_offset_in_raw
+                            entry_uri = re.sub(r"#line=\d+", f"#line={new_line_num}", entry.uri)
+                            # Update search position for the next chunk to start after the current one.
+                            last_offset = chunk_start_pos_in_raw + len(searchable_chunk)
+                        else:
+                            # Chunk not found in raw text, likely from a heading. Use original line_start.
+                            entry_uri = re.sub(r"#line=\d+", f"#line={line_start}", entry.uri)
+
                 # Prepend heading to all other chunks, the first chunk already has heading from original entry
                 if chunk_index > 0 and entry.heading:
                     # Snip heading to avoid crossing max_tokens limit
diff --git a/tests/data/org/main_readme.org b/tests/data/org/main_readme.org
index d88a2b2b..df5ac6b4 100644
--- a/tests/data/org/main_readme.org
+++ b/tests/data/org/main_readme.org
@@ -3,7 +3,7 @@
 
   All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
 
-** Dependencies
+** Dependencies [[id:123-421-121-12]]  :TAG1:@TAG1_1:
    - Python3
    - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
 
@@ -22,7 +22,7 @@
    #+end_src
 
 ** Use
-*** *Khoj via Emacs*
+*** *Khoj via Emacs*  [[https://khoj.dev][link to khoj website]]  :@EMACS:CLIENT_1:KHOJ:
      - [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
      - Run ~M-x khoj <user-query>~ or Call ~C-c C-s~
 
diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py
index 5c11a6fd..d5dcdbd2 100644
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -393,6 +393,60 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     assert entries[1][1].raw == "* Heading 2\n"
 
 
+def test_line_number_tracking_in_recursive_split():
+    "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
+    # Arrange
+    org_file_path = os.path.abspath("tests/data/org/main_readme.org")
+
+    with open(org_file_path, "r") as f:
+        org_content = f.read()
+    lines = org_content.splitlines()
+    data = {org_file_path: org_content}
+
+    # Act
+    # Using a small max_tokens to force recursive splitting
+    _, entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=10, index_heading_entries=True)
+
+    # Assert
+    assert len(entries) > 0, "No entries were extracted."
+
+    for entry in entries:
+        # Extract file path and line number from the entry URI
+        # for files uri is expected in format: file:///path/to/file.org#line=5
+        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
+        if not match:
+            continue
+        filepath_from_uri = match.group(1)
+        line_number_from_uri = int(match.group(2))
+
+        # line_number is 1-based, list index is 0-based
+        line_in_file = clean(lines[line_number_from_uri - 1])
+        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""
+
+        # Remove ancestor heading lines inserted during post-processing
+        first_entry_line = ""
+        for line in entry.raw.splitlines():
+            if line.startswith("*"):
+                first_entry_line = line
+            else:
+                break  # Stop at the first non-heading line
+        # Remove heading prefix from entry.compiled as level changed during post-processing
+        cleaned_first_entry_line = first_entry_line.strip()
+        # Remove multiple consecutive spaces
+        cleaned_first_entry_line = clean(cleaned_first_entry_line)
+
+        assert entry.uri is not None, f"Entry '{entry}' has a None URI."
+        assert match is not None, f"URI format is incorrect: {entry.uri}"
+        assert (
+            filepath_from_uri == org_file_path
+        ), f"File path in URI '{filepath_from_uri}' does not match expected '{org_file_path}'"
+
+        # Ensure the first non-heading line in the compiled entry matches the line in the file
+        assert (
+            cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip()
+        ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'"
+
+
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="test.org"):
     org_file = tmp_path / filename
@@ -402,6 +456,6 @@ def create_file(tmp_path, entry=None, filename="test.org"):
     return org_file
 
 
-def clean(entry):
-    "Remove properties from entry for easier comparison."
-    return re.sub(r"\n:PROPERTIES:(.*?):END:", "", entry, flags=re.DOTALL)
+def clean(text):
+    "Normalize spaces in text for easier comparison."
+    return re.sub(r"\s+", " ", text)
diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py
index 49344325..00c471b1 100644
--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@@ -100,9 +100,8 @@ def test_render_entry_with_property_drawer_and_empty_body(tmp_path):
 
     expected_entry = f"""*** [#A] Heading1                                            :tag1:
 :PROPERTIES:
-:LINE: file:{orgfile}::2
+:LINE: file://{orgfile}#line=2
 :ID: id:111-111-111-1111-1111
-:SOURCE: [[file:{orgfile}::*Heading1]]
 :END:
 """
 
@@ -133,37 +132,12 @@ Body Line 2
 
     # Assert
     # SOURCE link rendered with Heading
-    assert f":SOURCE: [[file:{orgfile}::*{entries[0].heading}]]" in f"{entries[0]}"
     # ID link rendered with ID
     assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}"
     # LINE link rendered with line number
-    assert f":LINE: file:{orgfile}::2" in f"{entries[0]}"
-
-
-# ----------------------------------------------------------------------------------------------------
-def test_source_link_to_entry_escaped_for_rendering(tmp_path):
-    "Test SOURCE link renders with square brackets in filename, heading escaped for org-mode rendering"
-    # Arrange
-    entry = f"""
-*** [#A] Heading[1]   :tag1:
-:PROPERTIES:
-:ID: 123-456-789-4234-1231
-:END:
-Body Line 1"""
-    orgfile = create_file(tmp_path, entry, filename="test[1].org")
-
-    # Act
-    entries = orgnode.makelist_with_filepath(orgfile)
-
-    # Assert
-    assert len(entries) == 1
-    # parsed heading from entry
-    assert entries[0].heading == "Heading[1]"
-    # track ancestors of entry
-    assert entries[0].ancestors == [f"{orgfile}"]
-    # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
-    escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
-    assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}"
+    assert f":LINE: file://{orgfile}#line=2" in f"{entries[0]}"
+    # LINE link rendered with line number
+    assert f":LINE: file://{orgfile}#line=7" in f"{entries[1]}"
 
 
 # ----------------------------------------------------------------------------------------------------

From 5010623a0acd11d2c45825184dcdabf3c28c2149 Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Thu, 3 Jul 2025 18:34:34 -0700
Subject: [PATCH 5/5] Deep link to markdown entries by line number in uri

Use the URL fragment schema for deep link URIs, borrowing from URL/PDF
schemas. E.g. file:///path/to/file.txt#line=<line_no>&page=<page_no>

Compute line number during (recursive) markdown entry chunking.

Test that the line number in the URI maps to the line number of the chunk
in the actual md file.

This deeplink URI with line number is passed to llm as context to
better combine with line range based view file tool.

Grep tool already passed matching line number. This change passes
line number in URIs of markdown entries matched by the semantic search
tool.
---
 .../content/markdown/markdown_to_entries.py   | 47 ++++++++++++---
 tests/data/markdown/main_readme.md            | 39 +++++++++++++
 tests/test_markdown_to_entries.py             | 58 +++++++++++++++++++
 3 files changed, 135 insertions(+), 9 deletions(-)
 create mode 100644 tests/data/markdown/main_readme.md

diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py
index 8d1fbbf4..43b10431 100644
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -54,13 +54,13 @@ class MarkdownToEntries(TextToEntries):
     def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
         "Extract entries by heading from specified Markdown files"
         entries: List[str] = []
-        entry_to_file_map: List[Tuple[str, str]] = []
+        entry_to_file_map: List[Tuple[str, str, int]] = []
         file_to_text_map: Dict[str, str] = dict()
         for markdown_file in markdown_files:
             try:
                 markdown_content = markdown_files[markdown_file]
                 entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
-                    markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
+                    markdown_content, markdown_file, entries, entry_to_file_map, max_tokens, start_line=1
                 )
                 file_to_text_map[markdown_file] = markdown_content
             except Exception as e:
@@ -68,17 +68,18 @@ class MarkdownToEntries(TextToEntries):
                     f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True
                 )
 
-        return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map))
+        return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
 
     @staticmethod
     def process_single_markdown_file(
         markdown_content: str,
         markdown_file: str,
         entries: List[str],
-        entry_to_file_map: List[Tuple[str, str]],
+        entry_to_file_map: List[Tuple[str, str, int]],
         max_tokens=256,
         ancestry: Dict[int, str] = {},
-    ) -> Tuple[List[str], List[Tuple[str, str]]]:
+        start_line: int = 1,
+    ) -> Tuple[List[str], List[Tuple[str, str, int]]]:
         # Prepend the markdown section's heading ancestry
         ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
         markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}"
@@ -87,7 +88,9 @@ class MarkdownToEntries(TextToEntries):
         if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
             rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
         ):
-            entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
+            # Create entry with line number information
+            entry_with_line_info = (markdown_content_with_ancestry, markdown_file, start_line)
+            entry_to_file_map += [entry_with_line_info]
             entries.extend([markdown_content_with_ancestry])
             return entries, entry_to_file_map
 
@@ -98,22 +101,32 @@ class MarkdownToEntries(TextToEntries):
             next_heading_level += 1
             sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
 
+        # Recurse down each non-empty section after parsing its body, heading and ancestry
+        current_line_offset = 0
         for section in sections:
+            num_lines_in_section = section.count("\n")
             # Skip empty sections
             if section.strip() == "":
+                current_line_offset += num_lines_in_section
                 continue
 
+            section_start_line_in_file = start_line + current_line_offset
+
             # Extract the section body and (when present) the heading
             current_ancestry = ancestry.copy()
             first_line = [line for line in section.split("\n") if line.strip() != ""][0]
             if re.search(rf"^#{{{next_heading_level}}} ", first_line):
                 # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_line)[1:])
+                current_section_heading, current_section_body = section.split(first_line, 1)
+                current_section_body_offset = current_section_heading.count("\n")
                 # Parse the section heading into current section ancestry
                 current_section_title = first_line[next_heading_level:].strip()
                 current_ancestry[next_heading_level] = current_section_title
+                # Line number should point to the heading itself
+                recursive_start_line = section_start_line_in_file + current_section_body_offset
             else:
                 current_section_body = section
+                recursive_start_line = section_start_line_in_file
 
             # Recurse down children of the current entry
             MarkdownToEntries.process_single_markdown_file(
@@ -123,23 +136,38 @@ class MarkdownToEntries(TextToEntries):
                 entry_to_file_map,
                 max_tokens,
                 current_ancestry,
+                start_line=recursive_start_line,
             )
+            current_line_offset += num_lines_in_section
 
         return entries, entry_to_file_map
 
     @staticmethod
-    def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
+    def convert_markdown_entries_to_maps(
+        parsed_entries: List[str], entry_to_file_map: List[Tuple[str, str, int]]
+    ) -> List[Entry]:
         "Convert each Markdown entries into a dictionary"
         entries: List[Entry] = []
+
+        # Create a mapping from parsed entry to file info
+        entry_map: Dict[str, Tuple[str, int]] = {}
+        for entry_info in entry_to_file_map:
+            entry_content, raw_filename, start_line = entry_info
+            entry_map[entry_content] = (raw_filename, start_line)
+
         for parsed_entry in parsed_entries:
-            raw_filename = entry_to_file_map[parsed_entry]
+            raw_filename, start_line = entry_map[parsed_entry]
+            calculated_line = start_line if start_line > 0 else 1
 
             # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
             if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
                 # Escape the URL to avoid issues with special characters
                 entry_filename = urllib3.util.parse_url(raw_filename).url
+                uri = entry_filename
             else:
                 entry_filename = raw_filename
+                # Create URI with line number
+                uri = f"file://{entry_filename}#line={calculated_line}"
 
             heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
             # Append base filename to compiled entry for context to model
@@ -152,6 +180,7 @@ class MarkdownToEntries(TextToEntries):
                     raw=parsed_entry,
                     heading=f"{prefix}{heading}",
                     file=entry_filename,
+                    uri=uri,
                 )
             )
 
diff --git a/tests/data/markdown/main_readme.md b/tests/data/markdown/main_readme.md
new file mode 100644
index 00000000..5eb7c7c9
--- /dev/null
+++ b/tests/data/markdown/main_readme.md
@@ -0,0 +1,39 @@
+# Main Readme
+> Allow natural language search, chat with your documents using transformer based models
+
+This is a test markdown file with multiple, nested child entries.
+
+## Dependencies
+
+- Python3
+- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
+
+## Installation
+
+```bash
+pip install khoj
+```
+
+## Run
+  Load ML model, generate embeddings and expose API to query specified org-mode files
+
+  ```shell
+  python3 main.py --input-files ~/Notes/Schedule.org ~/Notes/Incoming.org --verbose
+  ```
+
+## Use
+
+### **Khoj via API**
+- Query: `GET` [http://localhost:42110/api/search?q="What is the meaning of life"](http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22)
+- Update Index: `GET` [http://localhost:42110/api/update](http://localhost:42110/api/update)
+- [Khoj API Docs](http://localhost:42110/docs)
+
+### *Khoj via Web*
+
+- Open browser to http://localhost:42110
+- Enter query in search box
+
+## Acknowledgments
+
+- [MiniLM Model](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) for Asymmetric Text Search. See (SBert Documentation)[https://www.sbert.net/examples/applications/retrieve_rerank/README.html]
+- [OpenAI CLIP Model](https://github.com/openai/CLIP) for Image Search. See [SBert Documentation](https://www.sbert.net/examples/applications/image-search/README.html)
diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py
index 22f94ef5..30813555 100644
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -1,4 +1,5 @@
 import os
+import re
 from pathlib import Path
 
 from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
@@ -248,6 +249,58 @@ def test_get_markdown_files(tmp_path):
     assert set(extracted_org_files.keys()) == expected_files
 
 
+def test_line_number_tracking_in_recursive_split():
+    "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
+    # Arrange
+    markdown_file_path = os.path.abspath("tests/data/markdown/main_readme.md")
+
+    with open(markdown_file_path, "r", encoding="utf-8") as f:
+        markdown_content = f.read()
+    lines = markdown_content.splitlines()
+    data = {markdown_file_path: markdown_content}
+
+    # Act
+    # Using a small max_tokens to force recursive splitting
+    _, entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=10)
+
+    # Assert
+    assert len(entries) > 0, "No entries were extracted."
+
+    for entry in entries:
+        # Validate the URI before parsing it, so a bad entry fails with a clear assertion message
+        # for files uri is expected in format: file:///path/to/file.md#line=5
+        assert entry.uri is not None, f"Entry '{entry}' has a None URI."
+        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
+        assert match is not None, f"URI format is incorrect: {entry.uri}"
+        filepath_from_uri = match.group(1)
+        line_number_from_uri = int(match.group(2))
+
+        assert (
+            filepath_from_uri == markdown_file_path
+        ), f"File path in URI '{filepath_from_uri}' does not match expected '{markdown_file_path}'"
+
+        # line_number is 1-based, list index is 0-based
+        line_in_file = clean(lines[line_number_from_uri - 1])
+        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""
+
+        # Keep the last of the leading heading lines in the entry (prepended ancestor headings come first)
+        first_entry_line = ""
+        for line in entry.raw.splitlines():
+            if line.startswith("#"):
+                first_entry_line = line
+            else:
+                break  # Stop at the first non-heading line
+        # Strip surrounding whitespace from the heading line before comparison
+        cleaned_first_entry_line = first_entry_line.strip()
+        # Remove multiple consecutive spaces
+        cleaned_first_entry_line = clean(cleaned_first_entry_line)
+
+        # Ensure the entry's heading line is found on the URI's line in the file, or on the line after it
+        assert (
+            cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip()
+        ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'"
+
+
 # Helper Functions
 def create_file(tmp_path: Path, entry=None, filename="test.md"):
     markdown_file = tmp_path / filename
@@ -255,3 +308,8 @@ def create_file(tmp_path: Path, entry=None, filename="test.md"):
     if entry:
         markdown_file.write_text(entry)
     return markdown_file
+
+
+def clean(text):
+    "Collapse each run of whitespace in text into a single space for easier comparison."
+    return re.sub(r"\s+", " ", text)