From 48c3b77e6f558a9899de0e1155c7dc0c7958d8e8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 18 Jan 2024 11:08:57 -0500 Subject: [PATCH] Offload KQV by default --- llama_cpp/llama.py | 2 +- llama_cpp/server/settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 25abf36cb..6cdc1eb76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -77,7 +77,7 @@ def __init__( mul_mat_q: bool = True, logits_all: bool = False, embedding: bool = False, - offload_kqv: bool = False, + offload_kqv: bool = True, # Sampling Params last_n_tokens_size: int = 64, # LoRA Params diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index a10390c75..dc5be209d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -90,7 +90,7 @@ class ModelSettings(BaseSettings): logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=True, description="Whether to use embeddings.") offload_kqv: bool = Field( - default=False, description="Whether to offload kqv to the GPU." + default=True, description="Whether to offload kqv to the GPU." ) # Sampling Params last_n_tokens_size: int = Field(