8000 Add offload_kqv option to llama and server · cyberjon/llama-cpp-python@095c650 · GitHub
[go: up one dir, main page]

Skip to content

Commit 095c650

Browse files
committed
Add offload_kqv option to llama and server
1 parent 472b344 commit 095c650

File tree

2 files changed

+7
-0
lines changed

2 files changed

+7
-0
lines changed

llama_cpp/llama.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,7 @@ def __init__(
752752
mul_mat_q: bool = True,
753753
logits_all: bool = False,
754754
embedding: bool = False,
755+
offload_kqv: bool = False,
755756
# Sampling Params
756757
last_n_tokens_size: int = 64,
757758
# LoRA Params
@@ -817,6 +818,7 @@ def __init__(
817818
yarn_orig_ctx: YaRN original context size
818819
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
819820
embedding: Embedding mode only.
821+
offload_kqv: Offload K, Q, V to GPU.
820822
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
821823
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
822824
lora_path: Path to a LoRA file to apply to the model.
@@ -903,6 +905,7 @@ def __init__(
903905
self.context_params.mul_mat_q = mul_mat_q
904906
self.context_params.logits_all = logits_all
905907
self.context_params.embedding = embedding
908+
self.context_params.offload_kqv = offload_kqv
906909

907910
# Sampling Params
908911
self.last_n_tokens_size = last_n_tokens_size

llama_cpp/server/app.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ class Settings(BaseSettings):
100100
)
101101
logits_all: bool = Field(default=True, description="Whether to return logits.")
102102
embedding: bool = Field(default=True, description="Whether to use embeddings.")
103+
offload_kqv: bool = Field(
104+
default=False, description="Whether to offload kqv to the GPU."
105+
)
103106
# Sampling Params
104107
last_n_tokens_size: int = Field(
105108
default=64,
@@ -409,6 +412,7 @@ def create_app(settings: Optional[Settings] = None):
409412
mul_mat_q=settings.mul_mat_q,
410413
logits_all=settings.logits_all,
411414
embedding=settings.embedding,
415+
offload_kqv=settings.offload_kqv,
412416
# Sampling Params
413417
last_n_tokens_size=settings.last_n_tokens_size,
414418
# LoRA Params

0 commit comments

Comments (0)