Increase factor for n_ctx reduction to 2e6

This commit is contained in:
sabaimran
2024-04-16 22:48:45 +05:30
parent 1f2ffce85b
commit 0208688801

View File

@@ -67,7 +67,7 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=math.inf) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
-    vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 2e6)  # based on heuristic
     if configured_max_tokens:
         return min(configured_max_tokens, model_context_window)
     else: