Update Text Chunking Strategy to Improve Search Context (#645)

## Major
- Parse markdown, org parent entries as single entry if fit within max tokens
- Parse a file as single entry if it fits with max token limits
- Add parent heading ancestry to extracted markdown entries for context
- Chunk text in preference order of para, sentence, word, character

## Minor
- Create wrapper function to get entries from org, md, pdf & text files
- Remove unused Entry to Jsonl converter from text to entry class, tests
- Dedupe code by using single func to process an org file into entries

Resolves #620
This commit is contained in:
Debanjum
2024-04-08 13:56:38 +05:30
committed by GitHub
15 changed files with 704 additions and 393 deletions

View File

@@ -306,7 +306,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa
user_query = quote("How to git install application?")
# Act
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers)
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers)
# Assert
assert response.status_code == 200
@@ -325,7 +325,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org
user_query = quote("How to find my goat?")
# Act
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers)
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers)
# Assert
assert response.status_code == 200
@@ -409,7 +409,7 @@ def test_notes_search_requires_parent_context(
user_query = quote("Install Khoj on Emacs")
# Act
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers)
response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers)
# Assert
assert response.status_code == 200