Deep link to org-mode entries. Deep link by line number in URI

Use the URL fragment schema for deep link URIs, borrowing from URL/PDF schemas. E.g. file:///path/to/file.txt#line=<line_no>&#page=<page_no>. Compute the line number during (recursive) org-mode entry chunking. Thoroughly test that the line number in the URI maps to the line number of the chunk in the actual org-mode file. This deep link URI with line number is passed to the LLM as context, to better combine with the line-range based view file tool. The grep tool already passes the matching line number. This change passes the line number in the URIs of org entries matched by the semantic search tool.
2026-03-02 21:19:12 +00:00 · 2025-06-23 19:07:38 -07:00
parent e90ab5341a
commit dcfa4288c4
6 changed files with 119 additions and 39 deletions
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -87,6 +87,7 @@ class OrgToEntries(TextToEntries):
        entry_to_file_map: List[Tuple[Orgnode, str]],
        max_tokens=256,
        ancestry: Dict[int, str] = {},
+        start_line: int = 1,
    ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
        """Parse org_content from org_file into OrgNode entries

@@ -104,7 +105,9 @@ class OrgToEntries(TextToEntries):
        if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
            rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
        ):
-            orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file)
+            orgnode_content_with_ancestry = orgnode.makelist(
+                org_content_with_ancestry, org_file, start_line=start_line, ancestry_lines=len(ancestry)
+            )
            entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
            entries.extend([orgnode_content_with_ancestry])
            return entries, entry_to_file_map
@@ -125,24 +128,32 @@ class OrgToEntries(TextToEntries):
            return entries, entry_to_file_map

        # Recurse down each non-empty section after parsing its body, heading and ancestry
+        current_line_offset = 0
        for section in sections:
+            num_lines_in_section = section.count("\n")
            # Skip empty sections
            if section.strip() == "":
+                current_line_offset += num_lines_in_section
                continue

+            section_start_line_in_file = start_line + current_line_offset
+
            # Extract the section body and (when present) the heading
            current_ancestry = ancestry.copy()
            first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
            # If first non-empty line is a heading with expected heading level
            if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
                # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
+                current_section_heading, current_section_body = section.split(first_non_empty_line, 1)
+                current_section_body_offset = current_section_heading.count("\n")
                # Parse the section heading into current section ancestry
                current_section_title = first_non_empty_line[next_heading_level:].strip()
                current_ancestry[next_heading_level] = current_section_title
+                recursive_start_line = section_start_line_in_file + current_section_body_offset
            # Else process the section as just body text
            else:
                current_section_body = section
+                recursive_start_line = section_start_line_in_file

            # Recurse down children of the current entry
            OrgToEntries.process_single_org_file(
@@ -152,7 +163,9 @@ class OrgToEntries(TextToEntries):
                entry_to_file_map,
                max_tokens,
                current_ancestry,
+                start_line=recursive_start_line,
            )
+            current_line_offset += num_lines_in_section

        return entries, entry_to_file_map

@@ -207,6 +220,8 @@ class OrgToEntries(TextToEntries):
                if parsed_entry.hasBody:
                    compiled += f"\n {parsed_entry.body}"

+                uri = parsed_entry.properties.pop("LINE", None)
+
                # Add the sub-entry contents to the entry
                entry_compiled += compiled
                entry_raw += f"{parsed_entry}"
@@ -220,6 +235,7 @@ class OrgToEntries(TextToEntries):
                        raw=entry_raw,
                        heading=entry_heading,
                        file=entry_to_file_map[parsed_entry],
+                        uri=uri,
                    )
                )

--- a/src/khoj/processor/content/org_mode/orgnode.py
+++ b/src/khoj/processor/content/org_mode/orgnode.py
@@ -58,7 +58,7 @@ def makelist_with_filepath(filename):
    return makelist(f, filename)


-def makelist(file, filename) -> List["Orgnode"]:
+def makelist(file, filename, start_line: int = 1, ancestry_lines: int = 0) -> List["Orgnode"]:
    """
    Read an org-mode file and return a list of Orgnode objects
    created from this file.
@@ -114,7 +114,16 @@ def makelist(file, filename) -> List["Orgnode"]:
                    logbook = list()
                thisNode.properties = property_map
                nodelist.append(thisNode)
-            property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
+            # Account for ancestry lines that were prepended when calculating line numbers
+            if ancestry_lines > 0:
+                calculated_line = start_line + ctr - 1 - ancestry_lines
+                if calculated_line <= 0:
+                    calculated_line = 1  # Fallback to line 1 if calculation results in invalid line number
+            else:
+                calculated_line = start_line + ctr - 1
+                if calculated_line <= 0:
+                    calculated_line = ctr  # Use the original behavior if start_line calculation fails
+            property_map = {"LINE": f"file://{normalize_filename(filename)}#line={calculated_line}"}
            previous_level = level
            previous_heading: str = heading
            level = heading_search.group(1)
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -81,8 +81,35 @@ class TextToEntries(ABC):
            chunked_entry_chunks = text_splitter.split_text(entry.compiled)
            corpus_id = uuid.uuid4()

+            line_start = None
+            last_offset = 0
+            if entry.uri and entry.uri.startswith("file://"):
+                if "#line=" in entry.uri:
+                    line_start = int(entry.uri.split("#line=", 1)[-1].split("&", 1)[0])
+                else:
+                    line_start = 0
+
            # Create heading prefixed entry from each chunk
            for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
+                # set line start in uri of chunked entries
+                entry_uri = entry.uri
+                if line_start is not None:
+                    # Find the chunk in the raw text to get an accurate line number.
+                    # Search for the unmodified chunk from the last offset.
+                    searchable_chunk = compiled_entry_chunk.strip()
+                    if searchable_chunk:
+                        chunk_start_pos_in_raw = entry.raw.find(searchable_chunk, last_offset)
+                        if chunk_start_pos_in_raw != -1:
+                            # Found the chunk. Calculate its line offset from the start of the raw text.
+                            line_offset_in_raw = entry.raw[:chunk_start_pos_in_raw].count("\n")
+                            new_line_num = line_start + line_offset_in_raw
+                            entry_uri = re.sub(r"#line=\d+", f"#line={new_line_num}", entry.uri)
+                            # Update search position for the next chunk to start after the current one.
+                            last_offset = chunk_start_pos_in_raw + len(searchable_chunk)
+                        else:
+                            # Chunk not found in raw text, likely from a heading. Use original line_start.
+                            entry_uri = re.sub(r"#line=\d+", f"#line={line_start}", entry.uri)
+
                # Prepend heading to all other chunks, the first chunk already has heading from original entry
                if chunk_index > 0 and entry.heading:
                    # Snip heading to avoid crossing max_tokens limit