8000 Offload KQV by default · sjanaX01/llama-cpp-python@48c3b77 · GitHub
[go: up one dir, main page]

Skip to content

Commit 48c3b77

Browse files
committed
Offload KQV by default
1 parent 6bfe98b commit 48c3b77

File tree

2 files changed

+2
-2
lines changed

2 files changed: +2 additions, -2 deletions

llama_cpp/llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def __init__(
7777
mul_mat_q: bool = True,
7878
logits_all: bool = False,
7979
embedding: bool = False,
80-
offload_kqv: bool = False,
80+
offload_kqv: bool = True,
8181
# Sampling Params
8282
last_n_tokens_size: int = 64,
8383
# LoRA Params

llama_cpp/server/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class ModelSettings(BaseSettings):
9090
logits_all: bool = Field(default=True, description="Whether to return logits.")
9191
embedding: bool = Field(default=True, description="Whether to use embeddings.")
9292
offload_kqv: bool = Field(
93-
default=False, description="Whether to offload kqv to the GPU."
93+
default=True, description="Whether to offload kqv to the GPU."
9494
)
9595
# Sampling Params
9696
last_n_tokens_size: int = Field(

0 commit comments — Comments (0)