Mirror of https://github.com/khoaliber/khoj.git, synced 2026-03-09 21:29:11 +00:00
Switch default tokenizer to tiktoken as more widely used
The tiktoken BPE-based tokenizers seem more widely used these days. Fall back to the gpt-4o tiktoken tokenizer to count tokens for context stuffing.
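For illustration only (not part of this commit): a minimal sketch of counting tokens with the tiktoken tokenizer the new default falls back to. It assumes a tiktoken release recent enough to recognize the "gpt-4o" model name; the count_tokens helper, sample message, and budget are hypothetical.

import tiktoken

# Assumption: this tiktoken version maps "gpt-4o" to its BPE encoding.
encoder = tiktoken.encoding_for_model("gpt-4o")

def count_tokens(text: str) -> int:
    # Number of BPE tokens tiktoken produces for the given text.
    return len(encoder.encode(text))

# Hypothetical usage: check whether a message fits the remaining prompt budget.
message = "Summarize my notes on context stuffing."
remaining_budget = 4096
print(count_tokens(message), count_tokens(message) <= remaining_budget)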
@@ -223,7 +223,7 @@ def truncate_messages(
 ) -> list[ChatMessage]:
     """Truncate messages to fit within max prompt size supported by model"""
 
-    default_tokenizer = "hf-internal-testing/llama-tokenizer"
+    default_tokenizer = "gpt-4o"
 
     try:
         if loaded_model:
@@ -240,13 +240,9 @@ def truncate_messages(
         else:
             encoder = download_model(model_name).tokenizer()
     except:
-        if default_tokenizer in state.pretrained_tokenizers:
-            encoder = state.pretrained_tokenizers[default_tokenizer]
-        else:
-            encoder = AutoTokenizer.from_pretrained(default_tokenizer)
-            state.pretrained_tokenizers[default_tokenizer] = encoder
+        encoder = tiktoken.encoding_for_model(default_tokenizer)
         logger.debug(
-            f"Fallback to default chat model tokenizer: {tokenizer_name}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
+            f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for model: {model_name} in Khoj settings to improve context stuffing."
         )
 
     # Extract system message from messages