8000 Offload KQV by default · sjanaX01/llama-cpp-python@48c3b77 · GitHub
[go: up one dir, main page]

Skip to content

Commit 48c3b77

Browse files
committed
Offload KQV by default
1 parent 6bfe98b commit 48c3b77

File tree

2 files changed

+2
-2
lines changed

2 files changed: +2 additions, -2 deletions

llama_cpp/llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def __init__(
7777
mul_mat_q: bool = True,
7878
logits_all: bool = False,
7979
embedding: bool = False,
80-
offload_kqv: bool = False,
80+
offload_kqv: bool = True,
8181
# Sampling Params
8282
last_n_tokens_size: int = 64,
8383
# LoRA Params

llama_cpp/server/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class ModelSettings(BaseSettings):
9090
logits_all: bool = Field(default=True, description="Whether to return logits.")
9191
embedding: bool = Field(default=True, description="Whether to use embeddings.")
9292
offload_kqv: bool = Field(
93-
default=False, description="Whether to offload kqv to the GPU."
93+
default=True, description="Whether to offload kqv to the GPU."
9494
)
9595
# Sampling Params
9696
last_n_tokens_size: int = Field(

0 commit comments — Comments (0)