Upgrade default offline chat model to llama 3.1

This commit is contained in:
Debanjum Singh Solanky
2024-08-16 07:58:04 -05:00
parent acdc3f9470
commit 58c8068079
12 changed files with 30 additions and 12 deletions

View File

@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
> - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times
1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration. 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default. 2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.
:::tip[Note] :::tip[Note]

View File

@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up: Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:
1. No need to setup a conversation processor config! 1. No need to setup a conversation processor config!
2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work. 2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
- Make sure to set the `model-type` to `Offline`. Do not set `openai config`. - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
- The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing. - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.

View File

@@ -0,0 +1,17 @@
# Generated by Django 5.0.7 on 2024-08-19 12:37
from django.db import migrations, models


class Migration(migrations.Migration):
    """Update the default value of ChatModelOptions.chat_model.

    Switches the default offline chat model from the previous
    Hermes-2-Pro-Mistral-7B GGUF repo to Meta Llama 3.1 8B Instruct
    (bartowski's GGUF build). Only the column default changes; existing
    rows keep whatever chat_model value they already have.
    """

    dependencies = [
        # Must run after the migration that removed
        # ServerChatSettings.default_model (and related fields).
        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
    ]

    operations = [
        # AlterField with a new default is metadata-only for existing rows;
        # the new default applies to rows created after this migration.
        migrations.AlterField(
            model_name="chatmodeloptions",
            name="chat_model",
            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
        ),
    ]

View File

@@ -91,7 +91,7 @@ class ChatModelOptions(BaseModel):
max_prompt_size = models.IntegerField(default=None, null=True, blank=True) max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True) subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True) tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF") chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE) model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
openai_config = models.ForeignKey( openai_config = models.ForeignKey(
OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True

View File

@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline( def extract_questions_offline(
text: str, text: str,
model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
loaded_model: Union[Any, None] = None, loaded_model: Union[Any, None] = None,
conversation_log={}, conversation_log={},
use_history: bool = True, use_history: bool = True,
@@ -141,7 +141,7 @@ def converse_offline(
references=[], references=[],
online_results=[], online_results=[],
conversation_log={}, conversation_log={},
model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
loaded_model: Union[Any, None] = None, loaded_model: Union[Any, None] = None,
completion_func=None, completion_func=None,
conversation_commands=[ConversationCommand.Default], conversation_commands=[ConversationCommand.Default],
@@ -240,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int
def send_message_to_model_offline( def send_message_to_model_offline(
messages: List[ChatMessage], messages: List[ChatMessage],
loaded_model=None, loaded_model=None,
model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
temperature: float = 0.2, temperature: float = 0.2,
streaming=False, streaming=False,
stop=[], stop=[],

View File

@@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int: def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
"""Infer max prompt size based on device memory and max context window supported by the model""" """Infer max prompt size based on device memory and max context window supported by the model"""
configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
vram_based_n_ctx = int(get_device_memory() / 2e6) # based on heuristic vram_based_n_ctx = int(get_device_memory() / 1e6) # based on heuristic
configured_max_tokens = configured_max_tokens or math.inf # do not use if set to None configured_max_tokens = configured_max_tokens or math.inf # do not use if set to None
return min(configured_max_tokens, vram_based_n_ctx, model_context_window) return min(configured_max_tokens, vram_based_n_ctx, model_context_window)

View File

@@ -25,6 +25,7 @@ model_to_prompt_size = {
"gpt-4-turbo-preview": 20000, "gpt-4-turbo-preview": 20000,
"TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500, "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
"bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
} }
model_to_tokenizer: Dict[str, str] = {} model_to_tokenizer: Dict[str, str] = {}

View File

@@ -70,7 +70,7 @@ class OfflineChatProcessorConfig:
class OfflineChatProcessorModel: class OfflineChatProcessorModel:
def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None): def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None):
self.chat_model = chat_model self.chat_model = chat_model
self.loaded_model = None self.loaded_model = None
try: try:

View File

@@ -8,7 +8,7 @@ empty_escape_sequences = "\n|\r|\t| "
app_env_filepath = "~/.khoj/env" app_env_filepath = "~/.khoj/env"
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
content_directory = "~/.khoj/content/" content_directory = "~/.khoj/content/"
default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
default_online_chat_model = "gpt-4-turbo-preview" default_online_chat_model = "gpt-4-turbo-preview"
empty_config = { empty_config = {

View File

@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):
class OfflineChatProcessorConfig(ConfigBase): class OfflineChatProcessorConfig(ConfigBase):
chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
class ConversationProcessorConfig(ConfigBase): class ConversationProcessorConfig(ConfigBase):

View File

@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
# Initialize Processor from Config # Initialize Processor from Config
ChatModelOptionsFactory( ChatModelOptionsFactory(
chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
tokenizer=None, tokenizer=None,
max_prompt_size=None, max_prompt_size=None,
model_type="offline", model_type="offline",

View File

@@ -49,7 +49,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
max_prompt_size = 3500 max_prompt_size = 3500
tokenizer = None tokenizer = None
chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
model_type = "offline" model_type = "offline"
openai_config = factory.LazyAttribute( openai_config = factory.LazyAttribute(
lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None