mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Keep original formatting in compiled text entry strings
- Explicitly split entry string by space during split by max_tokens - Prevent formatting of compiled entry from being lost - The formatting itself contains useful information. No point in dropping the formatting unnecessarily, even if (say) the current search models don't account for it (yet)
This commit is contained in:
@@ -31,7 +31,7 @@ class TextToJsonl(ABC):
|
||||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||
chunked_entries: List[Entry] = []
|
||||
for entry in entries:
|
||||
compiled_entry_words = entry.compiled.split()
|
||||
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
||||
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
||||
|
||||
@@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||
# Arrange
|
||||
entry = f"""*** Heading
|
||||
\t\r
|
||||
Body Line 1
|
||||
Body Line
|
||||
"""
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user