diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 4b9fe3ae..0dfe7674 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -87,6 +87,7 @@ class OrgToEntries(TextToEntries): entry_to_file_map: List[Tuple[Orgnode, str]], max_tokens=256, ancestry: Dict[int, str] = {}, + start_line: int = 1, ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]: """Parse org_content from org_file into OrgNode entries @@ -104,7 +105,9 @@ class OrgToEntries(TextToEntries): if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search( rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE ): - orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file) + orgnode_content_with_ancestry = orgnode.makelist( + org_content_with_ancestry, org_file, start_line=start_line, ancestry_lines=len(ancestry) + ) entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry)) entries.extend([orgnode_content_with_ancestry]) return entries, entry_to_file_map @@ -125,24 +128,32 @@ class OrgToEntries(TextToEntries): return entries, entry_to_file_map # Recurse down each non-empty section after parsing its body, heading and ancestry + current_line_offset = 0 for section in sections: + num_lines_in_section = section.count("\n") # Skip empty sections if section.strip() == "": + current_line_offset += num_lines_in_section continue + section_start_line_in_file = start_line + current_line_offset + # Extract the section body and (when present) the heading current_ancestry = ancestry.copy() first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0] # If first non-empty line is a heading with expected heading level if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line): # Extract the section body without the heading - current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:]) + current_section_heading, current_section_body = section.split(first_non_empty_line, 1) + current_section_body_offset = current_section_heading.count("\n") # Parse the section heading into current section ancestry current_section_title = first_non_empty_line[next_heading_level:].strip() current_ancestry[next_heading_level] = current_section_title + recursive_start_line = section_start_line_in_file + current_section_body_offset # Else process the section as just body text else: current_section_body = section + recursive_start_line = section_start_line_in_file # Recurse down children of the current entry OrgToEntries.process_single_org_file( @@ -152,7 +163,9 @@ class OrgToEntries(TextToEntries): entry_to_file_map, max_tokens, current_ancestry, + start_line=recursive_start_line, ) + current_line_offset += num_lines_in_section return entries, entry_to_file_map @@ -207,6 +220,8 @@ class OrgToEntries(TextToEntries): if parsed_entry.hasBody: compiled += f"\n {parsed_entry.body}" + uri = parsed_entry.properties.pop("LINE", None) + # Add the sub-entry contents to the entry entry_compiled += compiled entry_raw += f"{parsed_entry}" @@ -220,6 +235,7 @@ class OrgToEntries(TextToEntries): raw=entry_raw, heading=entry_heading, file=entry_to_file_map[parsed_entry], + uri=uri, ) ) diff --git a/src/khoj/processor/content/org_mode/orgnode.py b/src/khoj/processor/content/org_mode/orgnode.py index e190e17a..34bb54f3 100644 --- a/src/khoj/processor/content/org_mode/orgnode.py +++ b/src/khoj/processor/content/org_mode/orgnode.py @@ -58,7 +58,7 @@ def makelist_with_filepath(filename): return makelist(f, filename) -def makelist(file, filename) -> List["Orgnode"]: +def makelist(file, filename, start_line: int = 1, ancestry_lines: int = 0) -> List["Orgnode"]: """ Read an org-mode file and return a list of Orgnode objects created from this file. @@ -114,7 +114,16 @@ def makelist(file, filename) -> List["Orgnode"]: logbook = list() thisNode.properties = property_map nodelist.append(thisNode) - property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"} + # Account for ancestry lines that were prepended when calculating line numbers + if ancestry_lines > 0: + calculated_line = start_line + ctr - 1 - ancestry_lines + if calculated_line <= 0: + calculated_line = 1 # Fallback to line 1 if calculation results in invalid line number + else: + calculated_line = start_line + ctr - 1 + if calculated_line <= 0: + calculated_line = ctr # Use the original behavior if start_line calculation fails + property_map = {"LINE": f"file://{normalize_filename(filename)}#line={calculated_line}"} previous_level = level previous_heading: str = heading level = heading_search.group(1) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index bac55aa4..0ceda11d 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -81,8 +81,35 @@ class TextToEntries(ABC): chunked_entry_chunks = text_splitter.split_text(entry.compiled) corpus_id = uuid.uuid4() + line_start = None + last_offset = 0 + if entry.uri and entry.uri.startswith("file://"): + if "#line=" in entry.uri: + line_start = int(entry.uri.split("#line=", 1)[-1].split("&", 1)[0]) + else: + line_start = 0 + # Create heading prefixed entry from each chunk for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks): + # set line start in uri of chunked entries + entry_uri = entry.uri + if line_start is not None: + # Find the chunk in the raw text to get an accurate line number. + # Search for the unmodified chunk from the last offset. + searchable_chunk = compiled_entry_chunk.strip() + if searchable_chunk: + chunk_start_pos_in_raw = entry.raw.find(searchable_chunk, last_offset) + if chunk_start_pos_in_raw != -1: + # Found the chunk. Calculate its line offset from the start of the raw text. + line_offset_in_raw = entry.raw[:chunk_start_pos_in_raw].count("\n") + new_line_num = line_start + line_offset_in_raw + entry_uri = re.sub(r"#line=\d+", f"#line={new_line_num}", entry.uri) + # Update search position for the next chunk to start after the current one. + last_offset = chunk_start_pos_in_raw + len(searchable_chunk) + else: + # Chunk not found in raw text, likely from a heading. Use original line_start. + entry_uri = re.sub(r"#line=\d+", f"#line={line_start}", entry.uri) + # Prepend heading to all other chunks, the first chunk already has heading from original entry if chunk_index > 0 and entry.heading: # Snip heading to avoid crossing max_tokens limit diff --git a/tests/data/org/main_readme.org b/tests/data/org/main_readme.org index d88a2b2b..df5ac6b4 100644 --- a/tests/data/org/main_readme.org +++ b/tests/data/org/main_readme.org @@ -3,7 +3,7 @@ All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline -** Dependencies +** Dependencies [[id:123-421-121-12]] :TAG1:@TAG1_1: - Python3 - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] @@ -22,7 +22,7 @@ #+end_src ** Use -*** *Khoj via Emacs* +*** *Khoj via Emacs* [[https://khoj.dev][link to khoj website]] :@EMACS:CLIENT_1:KHOJ: - [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]] - Run ~M-x khoj ~ or Call ~C-c C-s~ diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index 5c11a6fd..d5dcdbd2 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -393,6 +393,60 @@ def test_extract_entries_with_different_level_headings(tmp_path): assert entries[1][1].raw == "* Heading 2\n" +def test_line_number_tracking_in_recursive_split(): + "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file." + # Arrange + org_file_path = os.path.abspath("tests/data/org/main_readme.org") + + with open(org_file_path, "r") as f: + org_content = f.read() + lines = org_content.splitlines() + data = {org_file_path: org_content} + + # Act + # Using a small max_tokens to force recursive splitting + _, entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=10, index_heading_entries=True) + + # Assert + assert len(entries) > 0, "No entries were extracted." + + for entry in entries: + # Extract file path and line number from the entry URI + # for files uri is expected in format: file:///path/to/file.org#line=5 + match = re.search(r"file://(.*?)#line=(\d+)", entry.uri) + if not match: + continue + filepath_from_uri = match.group(1) + line_number_from_uri = int(match.group(2)) + + # line_number is 1-based, list index is 0-based + line_in_file = clean(lines[line_number_from_uri - 1]) + next_line_in_file = clean(lines[line_number_from_uri]) if line_number_from_uri < len(lines) else "" + + # Remove ancestor heading lines inserted during post-processing + first_entry_line = "" + for line in entry.raw.splitlines(): + if line.startswith("*"): + first_entry_line = line + else: + break # Stop at the first non-heading line + # Remove heading prefix from entry.compiled as level changed during post-processing + cleaned_first_entry_line = first_entry_line.strip() + # Remove multiple consecutive spaces + cleaned_first_entry_line = clean(cleaned_first_entry_line) + + assert entry.uri is not None, f"Entry '{entry}' has a None URI." + assert match is not None, f"URI format is incorrect: {entry.uri}" + assert ( + filepath_from_uri == org_file_path + ), f"File path in URI '{filepath_from_uri}' does not match expected '{org_file_path}'" + + # Ensure the first non-heading line in the compiled entry matches the line in the file + assert ( + cleaned_first_entry_line in line_in_file.strip() or cleaned_first_entry_line in next_line_in_file.strip() + ), f"First non-heading line '{cleaned_first_entry_line}' in {entry.raw} does not match line {line_number_from_uri} in file: '{line_in_file}' or next line '{next_line_in_file}'" + + # Helper Functions def create_file(tmp_path, entry=None, filename="test.org"): org_file = tmp_path / filename @@ -402,6 +456,6 @@ def create_file(tmp_path, entry=None, filename="test.org"): return org_file -def clean(entry): - "Remove properties from entry for easier comparison." - return re.sub(r"\n:PROPERTIES:(.*?):END:", "", entry, flags=re.DOTALL) +def clean(text): + "Normalize spaces in text for easier comparison." + return re.sub(r"\s+", " ", text) diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index 49344325..00c471b1 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -100,9 +100,8 @@ def test_render_entry_with_property_drawer_and_empty_body(tmp_path): expected_entry = f"""*** [#A] Heading1 :tag1: :PROPERTIES: -:LINE: file:{orgfile}::2 +:LINE: file://{orgfile}#line=2 :ID: id:111-111-111-1111-1111 -:SOURCE: [[file:{orgfile}::*Heading1]] :END: """ @@ -133,37 +132,12 @@ Body Line 2 # Assert # SOURCE link rendered with Heading - assert f":SOURCE: [[file:{orgfile}::*{entries[0].heading}]]" in f"{entries[0]}" # ID link rendered with ID assert f":ID: id:123-456-789-4234-1231" in f"{entries[0]}" # LINE link rendered with line number - assert f":LINE: file:{orgfile}::2" in f"{entries[0]}" - - -# ---------------------------------------------------------------------------------------------------- -def test_source_link_to_entry_escaped_for_rendering(tmp_path): - "Test SOURCE link renders with square brackets in filename, heading escaped for org-mode rendering" - # Arrange - entry = f""" -*** [#A] Heading[1] :tag1: -:PROPERTIES: -:ID: 123-456-789-4234-1231 -:END: -Body Line 1""" - orgfile = create_file(tmp_path, entry, filename="test[1].org") - - # Act - entries = orgnode.makelist_with_filepath(orgfile) - - # Assert - assert len(entries) == 1 - # parsed heading from entry - assert entries[0].heading == "Heading[1]" - # track ancestors of entry - assert entries[0].ancestors == [f"{orgfile}"] - # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries - escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]") - assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}" + assert f":LINE: file://{orgfile}#line=2" in f"{entries[0]}" + # LINE link rendered with line number + assert f":LINE: file://{orgfile}#line=7" in f"{entries[1]}" # ----------------------------------------------------------------------------------------------------