Parse markdown file as single entry if it fits within max token limits

These changes improve the entry context available to the search model. Specifically, this should improve entry context from short knowledge trees, that is, knowledge bases with small files. Previously, we split all markdown files by their headings, even if the file was small enough to fit entirely within the max token limits of the search model. This reduced the context available to the search model for selecting the appropriate entries for a given query, especially from short knowledge trees.
2026-03-02 21:19:12 +00:00 · 2024-02-10 14:34:09 +05:30
parent d8f01876e5
commit 982ac1859c
2 changed files with 41 additions and 9 deletions
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -20,7 +20,7 @@ def test_extract_markdown_with_no_headings(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)

    # Assert
    assert len(entries) == 1
@@ -45,7 +45,7 @@ def test_extract_single_markdown_entry(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)

    # Assert
    assert len(entries) == 1
@@ -68,7 +68,7 @@ def test_extract_multiple_markdown_entries(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)

    # Assert
    assert len(entries) == 2
@@ -127,7 +127,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)

    # Assert
    assert len(entries) == 3
@@ -161,6 +161,28 @@ body line 2
    assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory"


+def test_parse_markdown_file_into_single_entry_if_small(tmp_path):
+    "Parse markdown file into single entry if it fits within the token limits."
+    # Arrange
+    entry = f"""
+# Heading 1
+body line 1
+## Subheading 1.1
+body line 1.1
+"""
+    data = {
+        f"{tmp_path}": entry,
+    }
+
+    # Act
+    # Extract Entries from specified Markdown files
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].raw == entry
+
+
 # Helper Functions
 def create_file(tmp_path: Path, entry=None, filename="test.md"):
    markdown_file = tmp_path / filename