Increase factor for n_ctx reduction to 2e6

This commit is contained in:
sabaimran
2024-04-16 22:48:45 +05:30
parent 1f2ffce85b
commit 0208688801

View File

@@ -67,7 +67,7 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=math.inf) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
-    vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 2e6)  # based on heuristic
     if configured_max_tokens:
         return min(configured_max_tokens, model_context_window)
     else: