Deep link to org-mode entries. Deep link by line number in URI

Use a URL fragment schema for deep link URIs, borrowing from URL/PDF
schemas. E.g. file:///path/to/file.txt#line=<line_no>&page=<page_no>

Compute line number during (recursive) org-mode entry chunking.

Thoroughly test that the line number in the URI maps to the line number
of the chunk in the actual org-mode file.

This deep link URI with line number is passed to the LLM as context, so it
combines better with the line-range-based view file tool.

The grep tool already passes the matching line number. This change passes
the line number in the URIs of org entries matched by the semantic search tool.
This commit is contained in:
Debanjum
2025-06-23 19:07:38 -07:00
parent e90ab5341a
commit dcfa4288c4
6 changed files with 119 additions and 39 deletions

View File

@@ -87,6 +87,7 @@ class OrgToEntries(TextToEntries):
entry_to_file_map: List[Tuple[Orgnode, str]], entry_to_file_map: List[Tuple[Orgnode, str]],
max_tokens=256, max_tokens=256,
ancestry: Dict[int, str] = {}, ancestry: Dict[int, str] = {},
start_line: int = 1,
) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]: ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
"""Parse org_content from org_file into OrgNode entries """Parse org_content from org_file into OrgNode entries
@@ -104,7 +105,9 @@ class OrgToEntries(TextToEntries):
if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search( if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
): ):
orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file) orgnode_content_with_ancestry = orgnode.makelist(
org_content_with_ancestry, org_file, start_line=start_line, ancestry_lines=len(ancestry)
)
entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry)) entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
entries.extend([orgnode_content_with_ancestry]) entries.extend([orgnode_content_with_ancestry])
return entries, entry_to_file_map return entries, entry_to_file_map
@@ -125,24 +128,32 @@ class OrgToEntries(TextToEntries):
return entries, entry_to_file_map return entries, entry_to_file_map
# Recurse down each non-empty section after parsing its body, heading and ancestry # Recurse down each non-empty section after parsing its body, heading and ancestry
current_line_offset = 0
for section in sections: for section in sections:
num_lines_in_section = section.count("\n")
# Skip empty sections # Skip empty sections
if section.strip() == "": if section.strip() == "":
current_line_offset += num_lines_in_section
continue continue
section_start_line_in_file = start_line + current_line_offset
# Extract the section body and (when present) the heading # Extract the section body and (when present) the heading
current_ancestry = ancestry.copy() current_ancestry = ancestry.copy()
first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0] first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
# If first non-empty line is a heading with expected heading level # If first non-empty line is a heading with expected heading level
if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line): if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
# Extract the section body without the heading # Extract the section body without the heading
current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:]) current_section_heading, current_section_body = section.split(first_non_empty_line, 1)
current_section_body_offset = current_section_heading.count("\n")
# Parse the section heading into current section ancestry # Parse the section heading into current section ancestry
current_section_title = first_non_empty_line[next_heading_level:].strip() current_section_title = first_non_empty_line[next_heading_level:].strip()
current_ancestry[next_heading_level] = current_section_title current_ancestry[next_heading_level] = current_section_title
recursive_start_line = section_start_line_in_file + current_section_body_offset
# Else process the section as just body text # Else process the section as just body text
else: else:
current_section_body = section current_section_body = section
recursive_start_line = section_start_line_in_file
# Recurse down children of the current entry # Recurse down children of the current entry
OrgToEntries.process_single_org_file( OrgToEntries.process_single_org_file(
@@ -152,7 +163,9 @@ class OrgToEntries(TextToEntries):
entry_to_file_map, entry_to_file_map,
max_tokens, max_tokens,
current_ancestry, current_ancestry,
start_line=recursive_start_line,
) )
current_line_offset += num_lines_in_section
return entries, entry_to_file_map return entries, entry_to_file_map
@@ -207,6 +220,8 @@ class OrgToEntries(TextToEntries):
if parsed_entry.hasBody: if parsed_entry.hasBody:
compiled += f"\n {parsed_entry.body}" compiled += f"\n {parsed_entry.body}"
uri = parsed_entry.properties.pop("LINE", None)
# Add the sub-entry contents to the entry # Add the sub-entry contents to the entry
entry_compiled += compiled entry_compiled += compiled
entry_raw += f"{parsed_entry}" entry_raw += f"{parsed_entry}"
@@ -220,6 +235,7 @@ class OrgToEntries(TextToEntries):
raw=entry_raw, raw=entry_raw,
heading=entry_heading, heading=entry_heading,
file=entry_to_file_map[parsed_entry], file=entry_to_file_map[parsed_entry],
uri=uri,
) )
) )

View File

@@ -58,7 +58,7 @@ def makelist_with_filepath(filename):
return makelist(f, filename) return makelist(f, filename)
def makelist(file, filename) -> List["Orgnode"]: def makelist(file, filename, start_line: int = 1, ancestry_lines: int = 0) -> List["Orgnode"]:
""" """
Read an org-mode file and return a list of Orgnode objects Read an org-mode file and return a list of Orgnode objects
created from this file. created from this file.
@@ -114,7 +114,16 @@ def makelist(file, filename) -> List["Orgnode"]:
logbook = list() logbook = list()
thisNode.properties = property_map thisNode.properties = property_map
nodelist.append(thisNode) nodelist.append(thisNode)
property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"} # Account for ancestry lines that were prepended when calculating line numbers
if ancestry_lines > 0:
calculated_line = start_line + ctr - 1 - ancestry_lines
if calculated_line <= 0:
calculated_line = 1 # Fallback to line 1 if calculation results in invalid line number
else:
calculated_line = start_line + ctr - 1
if calculated_line <= 0:
calculated_line = ctr # Use the original behavior if start_line calculation fails
property_map = {"LINE": f"file://{normalize_filename(filename)}#line={calculated_line}"}
previous_level = level previous_level = level
previous_heading: str = heading previous_heading: str = heading
level = heading_search.group(1) level = heading_search.group(1)

View File

@@ -81,8 +81,35 @@ class TextToEntries(ABC):
chunked_entry_chunks = text_splitter.split_text(entry.compiled) chunked_entry_chunks = text_splitter.split_text(entry.compiled)
corpus_id = uuid.uuid4() corpus_id = uuid.uuid4()
line_start = None
last_offset = 0
if entry.uri and entry.uri.startswith("file://"):
if "#line=" in entry.uri:
line_start = int(entry.uri.split("#line=", 1)[-1].split("&", 1)[0])
else:
line_start = 0
# Create heading prefixed entry from each chunk # Create heading prefixed entry from each chunk
for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks): for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
# set line start in uri of chunked entries
entry_uri = entry.uri
if line_start is not None:
# Find the chunk in the raw text to get an accurate line number.
# Search for the unmodified chunk from the last offset.
searchable_chunk = compiled_entry_chunk.strip()
if searchable_chunk:
chunk_start_pos_in_raw = entry.raw.find(searchable_chunk, last_offset)
if chunk_start_pos_in_raw != -1:
# Found the chunk. Calculate its line offset from the start of the raw text.
line_offset_in_raw = entry.raw[:chunk_start_pos_in_raw].count("\n")
new_line_num = line_start + line_offset_in_raw
entry_uri = re.sub(r"#line=\d+", f"#line={new_line_num}", entry.uri)
# Update search position for the next chunk to start after the current one.
last_offset = chunk_start_pos_in_raw + len(searchable_chunk)
else:
# Chunk not found in raw text, likely from a heading. Use original line_start.
entry_uri = re.sub(r"#line=\d+", f"#line={line_start}", entry.uri)
# Prepend heading to all other chunks, the first chunk already has heading from original entry # Prepend heading to all other chunks, the first chunk already has heading from original entry
if chunk_index > 0 and entry.heading: if chunk_index > 0 and entry.heading:
# Snip heading to avoid crossing max_tokens limit # Snip heading to avoid crossing max_tokens limit

View File

@@ -3,7 +3,7 @@
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies ** Dependencies [[id:123-421-121-12]] :TAG1:@TAG1_1:
- Python3 - Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
@@ -22,7 +22,7 @@
#+end_src #+end_src
** Use ** Use
*** *Khoj via Emacs* *** *Khoj via Emacs* [[https://khoj.dev][link to khoj website]] :@EMACS:CLIENT_1:KHOJ:
- [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]] - [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
- Run ~M-x khoj <user-query>~ or Call ~C-c C-s~ - Run ~M-x khoj <user-query>~ or Call ~C-c C-s~

View File

@@ -393,6 +393,60 @@ def test_extract_entries_with_different_level_headings(tmp_path):
assert entries[1][1].raw == "* Heading 2\n" assert entries[1][1].raw == "* Heading 2\n"
def test_line_number_tracking_in_recursive_split():
    "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
    # Arrange
    org_file_path = os.path.abspath("tests/data/org/main_readme.org")
    with open(org_file_path, "r") as f:
        org_content = f.read()
    lines = org_content.splitlines()
    data = {org_file_path: org_content}

    # Act
    # Using a small max_tokens to force recursive splitting
    _, entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=10, index_heading_entries=True)

    # Assert
    assert len(entries) > 0, "No entries were extracted."
    for entry in entries:
        # Guard assertions come BEFORE the URI is consumed: previously they sat
        # after `re.search(..., entry.uri)` (TypeError on None) and after an
        # `if not match: continue`, which made them unreachable dead code.
        assert entry.uri is not None, f"Entry '{entry}' has a None URI."
        # For files, the URI is expected in the format: file:///path/to/file.org#line=5
        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
        assert match is not None, f"URI format is incorrect: {entry.uri}"

        filepath_from_uri = match.group(1)
        line_number_from_uri = int(match.group(2))
        assert (
            filepath_from_uri == org_file_path
        ), f"File path in URI '{filepath_from_uri}' does not match expected '{org_file_path}'"

        # line_number is 1-based, list index is 0-based
        line_in_file = clean(lines[line_number_from_uri - 1])
        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""

        # Skip ancestor heading lines prepended during post-processing: keep the
        # deepest heading seen before the first non-heading line of the raw entry.
        first_entry_line = ""
        for line in entry.raw.splitlines():
            if line.startswith("*"):
                first_entry_line = line
            else:
                break  # Stop at the first non-heading line

        # Normalize whitespace, as the heading level may have changed during post-processing
        cleaned_first_entry_line = clean(first_entry_line.strip())

        # Ensure the first line of the entry matches the referenced line in the file
        assert (
            cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip()
        ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'"
# Helper Functions # Helper Functions
def create_file(tmp_path, entry=None, filename="test.org"): def create_file(tmp_path, entry=None, filename="test.org"):
org_file = tmp_path / filename org_file = tmp_path / filename
@@ -402,6 +456,6 @@ def create_file(tmp_path, entry=None, filename="test.org"):
return org_file return org_file
def clean(entry): def clean(text):
"Remove properties from entry for easier comparison." "Normalize spaces in text for easier comparison."
return re.sub(r"\n:PROPERTIES:(.*?):END:", "", entry, flags=re.DOTALL) return re.sub(r"\s+", " ", text)

View File

@@ -100,9 +100,8 @@ def test_render_entry_with_property_drawer_and_empty_body(tmp_path):
expected_entry = f"""*** [#A] Heading1 :tag1: expected_entry = f"""*** [#A] Heading1 :tag1:
:PROPERTIES: :PROPERTIES:
:LINE: file:{orgfile}::2 :LINE: file://{orgfile}#line=2
:ID: id:111-111-111-1111-1111 :ID: id:111-111-111-1111-1111
:SOURCE: [[file:{orgfile}::*Heading1]]
:END: :END:
""" """
@@ -133,37 +132,12 @@ Body Line 2
# Assert # Assert
# SOURCE link rendered with Heading # SOURCE link rendered with Heading
assert f":SOURCE: [[file:{orgfile}::*{entries[0].heading}]]" in f"{entries[0]}"
# ID link rendered with ID # ID link rendered with ID
assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}" assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}"
# LINE link rendered with line number # LINE link rendered with line number
assert f":LINE: file:{orgfile}::2" in f"{entries[0]}" assert f":LINE: file://{orgfile}#line=2" in f"{entries[0]}"
# LINE link rendered with line number
assert f":LINE: file://{orgfile}#line=7" in f"{entries[1]}"
# ----------------------------------------------------------------------------------------------------
def test_source_link_to_entry_escaped_for_rendering(tmp_path):
    "Test SOURCE link renders with square brackets in filename, heading escaped for org-mode rendering"
    # Arrange
    # Entry with square brackets in both the heading and (via the filename) the SOURCE link
    raw_entry = f"""
*** [#A] Heading[1] :tag1:
:PROPERTIES:
:ID: 123-456-789-4234-1231
:END:
Body Line 1"""
    org_file = create_file(tmp_path, raw_entry, filename="test[1].org")

    # Act
    parsed_entries = orgnode.makelist_with_filepath(org_file)

    # Assert
    assert len(parsed_entries) == 1
    first_entry = parsed_entries[0]
    # Heading parsed from the entry retains its literal square brackets
    assert first_entry.heading == "Heading[1]"
    # Entry tracks its ancestors (here just the containing file)
    assert first_entry.ancestors == [f"{org_file}"]
    # Rendered SOURCE link escapes square brackets in both filename and heading
    escaped_path = f"{org_file}".replace("[1]", "\\[1\\]")
    assert f":SOURCE: [[file:{escaped_path}::*Heading\\[1\\]" in f"{first_entry}"
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------