From 516472a8d5a129a33f67c00f24e83273eb265979 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 6 Oct 2024 15:45:13 -0700
Subject: [PATCH] Switch default tokenizer to tiktoken as more widely used

The tiktoken BPE-based tokenizers seem more widely used these days.
Fall back to the gpt-4o tiktoken tokenizer to count tokens for context
stuffing.

---
 src/khoj/processor/conversation/utils.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index ff3451f5..e841c484 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -223,7 +223,7 @@ def truncate_messages(
 ) -> list[ChatMessage]:
     """Truncate messages to fit within max prompt size supported by model"""

-    default_tokenizer = "hf-internal-testing/llama-tokenizer"
+    default_tokenizer = "gpt-4o"

     try:
         if loaded_model:
@@ -240,13 +240,9 @@
         else:
             encoder = download_model(model_name).tokenizer()
     except:
-        if default_tokenizer in state.pretrained_tokenizers:
-            encoder = state.pretrained_tokenizers[default_tokenizer]
-        else:
-            encoder = AutoTokenizer.from_pretrained(default_tokenizer)
-            state.pretrained_tokenizers[default_tokenizer] = encoder
+        encoder = tiktoken.encoding_for_model(default_tokenizer)
         logger.debug(
-            f"Fallback to default chat model tokenizer: {tokenizer_name}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
+            f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for model: {model_name} in Khoj settings to improve context stuffing."
         )

     # Extract system message from messages
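
Note: a minimal sketch (not part of the patch) of what the new fallback
path does in isolation, assuming a recent tiktoken version that knows
the gpt-4o model. tiktoken.encoding_for_model() and encode() are real
tiktoken APIs; the message string below is purely illustrative.

    import tiktoken

    # Resolve the BPE encoding for the fallback model name.
    # For "gpt-4o" this maps to the o200k_base encoding.
    encoder = tiktoken.encoding_for_model("gpt-4o")

    # The token count, not the character count, determines how many
    # chat messages fit within the model's max prompt size.
    message = "Hello from Khoj!"
    num_tokens = len(encoder.encode(message))
    print(num_tokens)

This counts tokens with the same tokenizer family as OpenAI chat models,
which is only an approximation for other model families; the patch
accepts that trade-off for the fallback path, since a configured
tokenizer in Khoj settings still takes precedence.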