Deep link to org-mode entries. Deep link by line number in uri

Use the URL fragment schema for deep link URIs, borrowing from URL/PDF
schemas. E.g. file:///path/to/file.txt#line=<line_no>&page=<page_no>

Compute line number during (recursive) org-mode entry chunking.

Thoroughly test that the line number in the URI maps to the line number
of the chunk in the actual org-mode file.

This deep link URI with line number is passed to the LLM as context, so
it combines better with the line-range based view file tool.

The grep tool already passes the matching line number. This change passes
the line number in URIs of org entries matched by the semantic search tool.
This commit is contained in:
Debanjum
2025-06-23 19:07:38 -07:00
parent e90ab5341a
commit dcfa4288c4
6 changed files with 119 additions and 39 deletions

View File

@@ -393,6 +393,60 @@ def test_extract_entries_with_different_level_headings(tmp_path):
assert entries[1][1].raw == "* Heading 2\n"
def test_line_number_tracking_in_recursive_split():
    """Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file.

    Every extracted entry whose URI has the form ``file://<path>#line=<n>`` is
    checked: the path must be the source file, ``n`` must be a valid 1-based
    line number in that file, and the entry's leading heading text must appear
    on line ``n`` or on the line right after it.
    """
    # Arrange
    org_file_path = os.path.abspath("tests/data/org/main_readme.org")
    with open(org_file_path, "r", encoding="utf-8") as f:
        org_content = f.read()
    lines = org_content.splitlines()
    data = {org_file_path: org_content}

    # Act
    # Using a small max_tokens to force recursive splitting
    _, entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=10, index_heading_entries=True)

    # Assert
    assert len(entries) > 0, "No entries were extracted."

    # Count the entries that were actually verified so the test cannot pass
    # vacuously when no URI carries a line anchor.
    verified_entries = 0
    for entry in entries:
        # Check for a missing URI before using it; re.search would otherwise
        # raise a TypeError and hide the real problem.
        assert entry.uri is not None, f"Entry '{entry}' has a None URI."

        # Extract file path and line number from the entry URI
        # for files uri is expected in format: file:///path/to/file.org#line=5
        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
        if not match:
            continue
        filepath_from_uri = match.group(1)
        line_number_from_uri = int(match.group(2))

        assert (
            filepath_from_uri == org_file_path
        ), f"File path in URI '{filepath_from_uri}' does not match expected '{org_file_path}'"

        # line_number is 1-based, list index is 0-based. Validate the range
        # explicitly: 0 would silently index the last line of the file and
        # anything past the end would surface as a bare IndexError.
        assert (
            1 <= line_number_from_uri <= len(lines)
        ), f"Line number {line_number_from_uri} in URI '{entry.uri}' is outside the file's {len(lines)} lines"
        line_in_file = clean(lines[line_number_from_uri - 1])
        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""

        # Skip ancestor heading lines inserted during post-processing: keep
        # the last of the leading run of heading lines in the raw entry.
        # NOTE(review): an entry without a leading heading yields "" here,
        # which trivially matches any line in the final assertion.
        first_entry_line = ""
        for line in entry.raw.splitlines():
            if line.startswith("*"):
                first_entry_line = line
            else:
                break  # Stop at the first non-heading line

        # Normalize surrounding and repeated whitespace before comparing
        cleaned_first_entry_line = clean(first_entry_line.strip())

        # Ensure the selected entry line matches the referenced line in the file (or the one after it)
        assert (
            cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip()
        ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'"
        verified_entries += 1

    assert verified_entries > 0, "No entry URI contained a '#line=' anchor to verify."
# Helper Functions
def create_file(tmp_path, entry=None, filename="test.org"):
org_file = tmp_path / filename
@@ -402,6 +456,6 @@ def create_file(tmp_path, entry=None, filename="test.org"):
return org_file
def clean(entry):
    """Strip org-mode property drawers from an entry for easier comparison."""
    # DOTALL lets the lazy group span the newlines inside a drawer.
    property_drawer = re.compile(r"\n:PROPERTIES:(.*?):END:", flags=re.DOTALL)
    return property_drawer.sub("", entry)
def clean(text):
    """Collapse each run of whitespace in text into a single space for easier comparison."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)