@@ -75,6 +75,7 @@ def __init__(
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
         n_batch: int = 512,
+        n_ubatch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -156,6 +157,7 @@ def __init__(
             seed: RNG seed, -1 for random
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
+            n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -309,6 +311,7 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
+        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -380,6 +383,7 @@ def __init__(
             self.n_batch = min(n_ctx, n_batch)
             self.context_params.n_ctx = self._model.n_ctx_train()
             self.context_params.n_batch = self.n_batch
+            self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
 
         self._ctx = self._stack.enter_context(
             contextlib.closing(
@@ -2071,6 +2075,7 @@ def __getstate__(self):
             seed=self.context_params.seed,
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
+            n_ubatch=self.context_params.n_ubatch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
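
A minimal usage sketch (not part of the diff), assuming the new keyword is exposed on the Llama constructor as added above; the model path and batch values are placeholders:

from llama_cpp import Llama

# Hypothetical example: n_batch is the logical prompt-processing batch size,
# n_ubatch the physical micro-batch size; per the diff above the constructor
# clamps it to min(n_batch, n_ubatch) before filling context_params.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_ctx=4096,
    n_batch=512,
    n_ubatch=256,
)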