mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Deep link to org-mode entries by line number in the URI
Use the URL fragment schema for deep link URIs, borrowing from URL/PDF schemas. E.g. file:///path/to/file.txt#line=<line_no>&#page=<page_no>. Compute the line number during (recursive) org-mode entry chunking. Thoroughly test that the line number in the URI maps to the line number of the chunk in the actual org-mode file. This deep link URI with line number is passed to the LLM as context, so it combines better with the line-range-based view file tool. The grep tool already passes the matching line number. This change also passes the line number in the URIs of org entries matched by the semantic search tool.
This commit is contained in:
@@ -87,6 +87,7 @@ class OrgToEntries(TextToEntries):
|
||||
entry_to_file_map: List[Tuple[Orgnode, str]],
|
||||
max_tokens=256,
|
||||
ancestry: Dict[int, str] = {},
|
||||
start_line: int = 1,
|
||||
) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
|
||||
"""Parse org_content from org_file into OrgNode entries
|
||||
|
||||
@@ -104,7 +105,9 @@ class OrgToEntries(TextToEntries):
|
||||
if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
|
||||
rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
|
||||
):
|
||||
orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file)
|
||||
orgnode_content_with_ancestry = orgnode.makelist(
|
||||
org_content_with_ancestry, org_file, start_line=start_line, ancestry_lines=len(ancestry)
|
||||
)
|
||||
entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
|
||||
entries.extend([orgnode_content_with_ancestry])
|
||||
return entries, entry_to_file_map
|
||||
@@ -125,24 +128,32 @@ class OrgToEntries(TextToEntries):
|
||||
return entries, entry_to_file_map
|
||||
|
||||
# Recurse down each non-empty section after parsing its body, heading and ancestry
|
||||
current_line_offset = 0
|
||||
for section in sections:
|
||||
num_lines_in_section = section.count("\n")
|
||||
# Skip empty sections
|
||||
if section.strip() == "":
|
||||
current_line_offset += num_lines_in_section
|
||||
continue
|
||||
|
||||
section_start_line_in_file = start_line + current_line_offset
|
||||
|
||||
# Extract the section body and (when present) the heading
|
||||
current_ancestry = ancestry.copy()
|
||||
first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
|
||||
# If first non-empty line is a heading with expected heading level
|
||||
if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
|
||||
# Extract the section body without the heading
|
||||
current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
|
||||
current_section_heading, current_section_body = section.split(first_non_empty_line, 1)
|
||||
current_section_body_offset = current_section_heading.count("\n")
|
||||
# Parse the section heading into current section ancestry
|
||||
current_section_title = first_non_empty_line[next_heading_level:].strip()
|
||||
current_ancestry[next_heading_level] = current_section_title
|
||||
recursive_start_line = section_start_line_in_file + current_section_body_offset
|
||||
# Else process the section as just body text
|
||||
else:
|
||||
current_section_body = section
|
||||
recursive_start_line = section_start_line_in_file
|
||||
|
||||
# Recurse down children of the current entry
|
||||
OrgToEntries.process_single_org_file(
|
||||
@@ -152,7 +163,9 @@ class OrgToEntries(TextToEntries):
|
||||
entry_to_file_map,
|
||||
max_tokens,
|
||||
current_ancestry,
|
||||
start_line=recursive_start_line,
|
||||
)
|
||||
current_line_offset += num_lines_in_section
|
||||
|
||||
return entries, entry_to_file_map
|
||||
|
||||
@@ -207,6 +220,8 @@ class OrgToEntries(TextToEntries):
|
||||
if parsed_entry.hasBody:
|
||||
compiled += f"\n {parsed_entry.body}"
|
||||
|
||||
uri = parsed_entry.properties.pop("LINE", None)
|
||||
|
||||
# Add the sub-entry contents to the entry
|
||||
entry_compiled += compiled
|
||||
entry_raw += f"{parsed_entry}"
|
||||
@@ -220,6 +235,7 @@ class OrgToEntries(TextToEntries):
|
||||
raw=entry_raw,
|
||||
heading=entry_heading,
|
||||
file=entry_to_file_map[parsed_entry],
|
||||
uri=uri,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ def makelist_with_filepath(filename):
|
||||
return makelist(f, filename)
|
||||
|
||||
|
||||
def makelist(file, filename) -> List["Orgnode"]:
|
||||
def makelist(file, filename, start_line: int = 1, ancestry_lines: int = 0) -> List["Orgnode"]:
|
||||
"""
|
||||
Read an org-mode file and return a list of Orgnode objects
|
||||
created from this file.
|
||||
@@ -114,7 +114,16 @@ def makelist(file, filename) -> List["Orgnode"]:
|
||||
logbook = list()
|
||||
thisNode.properties = property_map
|
||||
nodelist.append(thisNode)
|
||||
property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
|
||||
# Account for ancestry lines that were prepended when calculating line numbers
|
||||
if ancestry_lines > 0:
|
||||
calculated_line = start_line + ctr - 1 - ancestry_lines
|
||||
if calculated_line <= 0:
|
||||
calculated_line = 1 # Fallback to line 1 if calculation results in invalid line number
|
||||
else:
|
||||
calculated_line = start_line + ctr - 1
|
||||
if calculated_line <= 0:
|
||||
calculated_line = ctr # Use the original behavior if start_line calculation fails
|
||||
property_map = {"LINE": f"file://{normalize_filename(filename)}#line={calculated_line}"}
|
||||
previous_level = level
|
||||
previous_heading: str = heading
|
||||
level = heading_search.group(1)
|
||||
|
||||
@@ -81,8 +81,35 @@ class TextToEntries(ABC):
|
||||
chunked_entry_chunks = text_splitter.split_text(entry.compiled)
|
||||
corpus_id = uuid.uuid4()
|
||||
|
||||
line_start = None
|
||||
last_offset = 0
|
||||
if entry.uri and entry.uri.startswith("file://"):
|
||||
if "#line=" in entry.uri:
|
||||
line_start = int(entry.uri.split("#line=", 1)[-1].split("&", 1)[0])
|
||||
else:
|
||||
line_start = 0
|
||||
|
||||
# Create heading prefixed entry from each chunk
|
||||
for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
|
||||
# set line start in uri of chunked entries
|
||||
entry_uri = entry.uri
|
||||
if line_start is not None:
|
||||
# Find the chunk in the raw text to get an accurate line number.
|
||||
# Search for the unmodified chunk from the last offset.
|
||||
searchable_chunk = compiled_entry_chunk.strip()
|
||||
if searchable_chunk:
|
||||
chunk_start_pos_in_raw = entry.raw.find(searchable_chunk, last_offset)
|
||||
if chunk_start_pos_in_raw != -1:
|
||||
# Found the chunk. Calculate its line offset from the start of the raw text.
|
||||
line_offset_in_raw = entry.raw[:chunk_start_pos_in_raw].count("\n")
|
||||
new_line_num = line_start + line_offset_in_raw
|
||||
entry_uri = re.sub(r"#line=\d+", f"#line={new_line_num}", entry.uri)
|
||||
# Update search position for the next chunk to start after the current one.
|
||||
last_offset = chunk_start_pos_in_raw + len(searchable_chunk)
|
||||
else:
|
||||
# Chunk not found in raw text, likely from a heading. Use original line_start.
|
||||
entry_uri = re.sub(r"#line=\d+", f"#line={line_start}", entry.uri)
|
||||
|
||||
# Prepend heading to all other chunks, the first chunk already has heading from original entry
|
||||
if chunk_index > 0 and entry.heading:
|
||||
# Snip heading to avoid crossing max_tokens limit
|
||||
|
||||
4
tests/data/org/main_readme.org
vendored
4
tests/data/org/main_readme.org
vendored
@@ -3,7 +3,7 @@
|
||||
|
||||
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
|
||||
|
||||
** Dependencies
|
||||
** Dependencies [[id:123-421-121-12]] :TAG1:@TAG1_1:
|
||||
- Python3
|
||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
#+end_src
|
||||
|
||||
** Use
|
||||
*** *Khoj via Emacs*
|
||||
*** *Khoj via Emacs* [[https://khoj.dev][link to khoj website]] :@EMACS:CLIENT_1:KHOJ:
|
||||
- [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
|
||||
- Run ~M-x khoj <user-query>~ or Call ~C-c C-s~
|
||||
|
||||
|
||||
@@ -393,6 +393,60 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
||||
assert entries[1][1].raw == "* Heading 2\n"
|
||||
|
||||
|
||||
def test_line_number_tracking_in_recursive_split():
    """Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file.

    Each extracted entry whose URI has the form ``file://<path>#line=<n>`` is
    checked against the source file: the entry's leading heading line must
    appear on line ``n`` (or the line right after it) of the file on disk.
    """
    # Arrange
    org_file_path = os.path.abspath("tests/data/org/main_readme.org")

    # Explicit encoding so the comparison does not depend on the platform locale
    with open(org_file_path, "r", encoding="utf-8") as f:
        org_content = f.read()
    lines = org_content.splitlines()
    data = {org_file_path: org_content}

    # Act
    # Using a small max_tokens to force recursive splitting
    _, entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=10, index_heading_entries=True)

    # Assert
    assert len(entries) > 0, "No entries were extracted."

    for entry in entries:
        # Check for a missing URI before it is handed to re.search,
        # which would otherwise fail with an unhelpful TypeError
        assert entry.uri is not None, f"Entry '{entry}' has a None URI."

        # Extract file path and line number from the entry URI
        # for files uri is expected in format: file:///path/to/file.org#line=5
        match = re.search(r"file://(.*?)#line=(\d+)", entry.uri)
        if not match:
            # Entries without a line deep link are not covered by this test
            continue
        filepath_from_uri = match.group(1)
        line_number_from_uri = int(match.group(2))

        assert (
            filepath_from_uri == org_file_path
        ), f"File path in URI '{filepath_from_uri}' does not match expected '{org_file_path}'"

        # Guard the lookup below: line 0 would silently wrap to the last line of
        # the file via negative indexing, and a too-large line would raise IndexError
        assert (
            1 <= line_number_from_uri <= len(lines)
        ), f"Line number {line_number_from_uri} in URI '{entry.uri}' is outside the file's {len(lines)} lines"

        # line_number is 1-based, list index is 0-based
        line_in_file = clean(lines[line_number_from_uri - 1])
        next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else ""

        # Skip past ancestor heading lines inserted during post-processing:
        # keep the last of the consecutive heading lines that open the raw entry
        first_entry_line = ""
        for line in entry.raw.splitlines():
            if line.startswith("*"):
                first_entry_line = line
            else:
                break  # Stop at the first non-heading line

        # Trim and normalize whitespace so the comparison ignores spacing differences
        cleaned_first_entry_line = clean(first_entry_line.strip())

        # Ensure the entry's heading line matches the line (or the one after it) in the file
        # NOTE(review): when the raw entry does not start with a heading line the
        # compared string is empty and this check passes trivially - confirm that is intended
        assert (
            cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip()
        ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'"
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path, entry=None, filename="test.org"):
|
||||
org_file = tmp_path / filename
|
||||
@@ -402,6 +456,6 @@ def create_file(tmp_path, entry=None, filename="test.org"):
|
||||
return org_file
|
||||
|
||||
|
||||
def clean(text):
    """Normalize whitespace in text for easier comparison.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space. Leading and trailing whitespace is collapsed to one space
    rather than stripped, so callers that need trimmed text must strip it
    themselves.
    """
    return re.sub(r"\s+", " ", text)
|
||||
|
||||
@@ -100,9 +100,8 @@ def test_render_entry_with_property_drawer_and_empty_body(tmp_path):
|
||||
|
||||
expected_entry = f"""*** [#A] Heading1 :tag1:
|
||||
:PROPERTIES:
|
||||
:LINE: file:{orgfile}::2
|
||||
:LINE: file://{orgfile}#line=2
|
||||
:ID: id:111-111-111-1111-1111
|
||||
:SOURCE: [[file:{orgfile}::*Heading1]]
|
||||
:END:
|
||||
"""
|
||||
|
||||
@@ -133,37 +132,12 @@ Body Line 2
|
||||
|
||||
# Assert
|
||||
# SOURCE link rendered with Heading
|
||||
assert f":SOURCE: [[file:{orgfile}::*{entries[0].heading}]]" in f"{entries[0]}"
|
||||
# ID link rendered with ID
|
||||
assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}"
|
||||
# LINE link rendered with line number
|
||||
assert f":LINE: file:{orgfile}::2" in f"{entries[0]}"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_source_link_to_entry_escaped_for_rendering(tmp_path):
|
||||
"Test SOURCE link renders with square brackets in filename, heading escaped for org-mode rendering"
|
||||
# Arrange
|
||||
entry = f"""
|
||||
*** [#A] Heading[1] :tag1:
|
||||
:PROPERTIES:
|
||||
:ID: 123-456-789-4234-1231
|
||||
:END:
|
||||
Body Line 1"""
|
||||
orgfile = create_file(tmp_path, entry, filename="test[1].org")
|
||||
|
||||
# Act
|
||||
entries = orgnode.makelist_with_filepath(orgfile)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
# parsed heading from entry
|
||||
assert entries[0].heading == "Heading[1]"
|
||||
# track ancestors of entry
|
||||
assert entries[0].ancestors == [f"{orgfile}"]
|
||||
# ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
|
||||
escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
|
||||
assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}"
|
||||
assert f":LINE: file://{orgfile}#line=2" in f"{entries[0]}"
|
||||
# LINE link rendered with line number
|
||||
assert f":LINE: file://{orgfile}#line=7" in f"{entries[1]}"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user