Implement GGUF metadata KV overrides by phiharri · Pull Request #1011 · abetlen/llama-cpp-python · GitHub

Implement GGUF metadata KV overrides #1011


Merged · 9 commits · Jan 15, 2024
32 changes: 32 additions & 0 deletions llama_cpp/llama.py
@@ -735,6 +735,7 @@ def __init__(
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
@@ -803,6 +804,7 @@ def __init__(
vocab_only: Only load the vocabulary, no weights.
use_mmap: Use mmap if possible.
use_mlock: Force the system to keep the model in RAM.
kv_overrides: Key-value overrides for the model's GGUF metadata.
seed: RNG seed, -1 for random
n_ctx: Text context, 0 = from model
n_batch: Prompt processing maximum batch size
@@ -866,6 +868,34 @@ def __init__(
self.model_params.use_mmap = use_mmap if lora_path is None else False
self.model_params.use_mlock = use_mlock

self.kv_overrides = kv_overrides
if kv_overrides is not None:
    # Allocate one extra slot for the null sentinel expected by llama.cpp.
    n_overrides = len(kv_overrides)
    self._kv_overrides_array = (llama_cpp.llama_model_kv_override * (n_overrides + 1))()
    self._kv_overrides_array_keys = []

    for i, (k, v) in enumerate(kv_overrides.items()):
        key_bytes = k.encode("utf-8")
        self._kv_overrides_array_keys.append(key_bytes)
        self._kv_overrides_array[i].key = key_bytes
        # Check bool before int: bool is a subclass of int in Python,
        # so an int check first would swallow boolean values.
        if isinstance(v, bool):
            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
            self._kv_overrides_array[i].value.bool_value = v
        elif isinstance(v, int):
            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
            self._kv_overrides_array[i].value.int_value = v
        elif isinstance(v, float):
            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
            self._kv_overrides_array[i].value.float_value = v
        else:
            raise ValueError(f"Unknown value type for {k}: {v}")

    # A null key terminates the array on the C side.
    self._kv_overrides_array_sentinel_key = b"\0"
    self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
    self.model_params.kv_overrides = self._kv_overrides_array

self.n_batch = min(n_ctx, n_batch) # ???
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
self.n_threads_batch = n_threads_batch or max(
@@ -2148,6 +2178,7 @@ def __getstate__(self):
vocab_only=self.model_params.vocab_only,
use_mmap=self.model_params.use_mmap,
use_mlock=self.model_params.use_mlock,
kv_overrides=self.kv_overrides,
# Context Params
seed=self.context_params.seed,
n_ctx=self.context_params.n_ctx,
@@ -2190,6 +2221,7 @@ def __setstate__(self, state):
vocab_only=state["vocab_only"],
use_mmap=state["use_mmap"],
use_mlock=state["use_mlock"],
kv_overrides=state["kv_overrides"],
# Context Params
seed=state["seed"],
n_ctx=state["n_ctx"],
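For context, a minimal usage sketch of the new parameter; the model path and override keys below are illustrative placeholders, not taken from the PR:

from llama_cpp import Llama

# Hypothetical example: override GGUF metadata keys at load time.
llm = Llama(
    model_path="./models/example.gguf",
    kv_overrides={
        "tokenizer.ggml.add_bos_token": False,  # bool override
        "llama.context_length": 4096,           # int override
        "llama.rope.freq_base": 10000.0,        # float override
    },
)

Because `kv_overrides` round-trips through `__getstate__`/`__setstate__` above, pickled models reload with the same overrides applied.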
20 changes: 19 additions & 1 deletion llama_cpp/server/model.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Optional, Union, List
from typing import Dict, Optional, Union, List

import llama_cpp

@@ -71,6 +71,23 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
    clip_model_path=settings.clip_model_path, verbose=settings.verbose
)

kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
if settings.kv_overrides is not None:
    assert isinstance(settings.kv_overrides, list)
    kv_overrides = {}
    for kv in settings.kv_overrides:
        key, value = kv.split("=")
        if ":" in value:
            value_type, value = value.split(":")
            if value_type == "bool":
                kv_overrides[key] = value.lower() in ["true", "1"]
            elif value_type == "int":
                kv_overrides[key] = int(value)
            elif value_type == "float":
                kv_overrides[key] = float(value)
            else:
                raise ValueError(f"Unknown value type {value_type}")

_model = llama_cpp.Llama(
model_path=settings.model,
@@ -81,6 +98,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
vocab_only=settings.vocab_only,
use_mmap=settings.use_mmap,
use_mlock=settings.use_mlock,
kv_overrides=kv_overrides,
# Context Params
seed=settings.seed,
n_ctx=settings.n_ctx,
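A sketch of how the server path might be exercised, using the `ModelSettings` and `load_llama_from_model_settings` names from this diff; the model path and override strings are placeholders:

from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

# Each entry uses the key=type:value format handled by the parser above.
settings = ModelSettings(
    model="./models/example.gguf",
    kv_overrides=[
        "tokenizer.ggml.add_bos_token=bool:false",
        "llama.context_length=int:4096",
    ],
)
llm = load_llama_from_model_settings(settings)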
4 changes: 4 additions & 0 deletions llama_cpp/server/settings.py
@@ -48,6 +48,10 @@ class ModelSettings(BaseSettings):
    default=llama_cpp.llama_mlock_supported(),
    description="Use mlock.",
)
kv_overrides: Optional[List[str]] = Field(
    default=None,
    description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
)
# Context Params
seed: int = Field(
    default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."