feat: Update llama.cpp · ducky777/llama-cpp-python@fdce078 · GitHub

Commit fdce078

feat: Update llama.cpp
1 parent c2a234a commit fdce078

File tree

llama_cpp/llama.py
llama_cpp/llama_cpp.py
llama_cpp/server/settings.py
vendor/llama.cpp

4 files changed: +44 −10 lines changed

llama_cpp/llama.py

Lines changed: 10 additions & 4 deletions
@@ -98,7 +98,7 @@ def __init__(
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
         # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@@ -166,7 +166,7 @@ def __init__(
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
+            numa: numa policy
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
             draft_model: Optional draft model to use for speculative decoding.
@@ -183,12 +183,18 @@ def __init__(
 
         set_verbose(verbose)
 
-        self.numa = numa
         if not Llama.__backend_initialized:
             with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
             Llama.__backend_initialized = True
 
+        if isinstance(numa, bool):
+            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+
         self.model_path = model_path
 
         # Model Params
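
The net effect of this hunk: `llama_backend_init` no longer receives the NUMA flag, and a boolean `numa` argument is translated into a ggml_numa_strategy before `llama_numa_init` is called. A minimal usage sketch against the new constructor; the model path is illustrative and not part of this commit:

from llama_cpp import Llama

# numa=True now maps to GGML_NUMA_STRATEGY_DISTRIBUTE internally;
# numa=False maps to GGML_NUMA_STRATEGY_DISABLED (llama_numa_init is skipped).
llm = Llama(model_path="./models/model.gguf", numa=True)

Note that only the isinstance(numa, bool) branch in the hunk assigns self.numa, so the boolean form is the one this diff clearly supports; passing a raw strategy int relies on code outside the shown hunk.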

llama_cpp/llama_cpp.py

Lines changed: 31 additions & 3 deletions
@@ -697,17 +697,45 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params:
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
+def llama_backend_init():
     """Initialize the llama + ggml backend
     If numa is true, use NUMA optimizations
     Call once at the start of the program"""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()
 
 
-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None
 
 
+# // numa strategies
+# enum ggml_numa_strategy {
+#     GGML_NUMA_STRATEGY_DISABLED   = 0,
+#     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+#     GGML_NUMA_STRATEGY_ISOLATE    = 2,
+#     GGML_NUMA_STRATEGY_NUMACTL    = 3,
+#     GGML_NUMA_STRATEGY_MIRROR     = 4,
+#     GGML_NUMA_STRATEGY_COUNT
+# };
+GGML_NUMA_STRATEGY_DISABLED = 0
+GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+GGML_NUMA_STRATEGY_ISOLATE = 2
+GGML_NUMA_STRATEGY_NUMACTL = 3
+GGML_NUMA_STRATEGY_MIRROR = 4
+GGML_NUMA_STRATEGY_COUNT = 5
+
+
+# // optional:
+# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+def llama_numa_init(numa: int):
+    return _lib.llama_numa_init(numa)
+
+
+_lib.llama_numa_init.argtypes = [c_int]
+_lib.llama_numa_init.restype = None
+
+
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
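
These bindings split backend initialization from NUMA configuration, mirroring upstream llama.cpp. A minimal sketch of the new call order at the ctypes level, using only functions shown in this diff; the strategy choice is illustrative:

import llama_cpp

# Call once at program start; it no longer takes a NUMA flag.
llama_cpp.llama_backend_init()

# Optionally pick a NUMA policy before any model is loaded.
llama_cpp.llama_numa_init(llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE)

# ... create models/contexts, run inference ...

# Call once at program end (currently only used for MPI).
llama_cpp.llama_backend_free()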

llama_cpp/server/settings.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 import multiprocessing
 
-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
@@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
         description="Path to a LoRA file to apply to the model.",
     )
     # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
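
With the widened field, the server settings accept either form. A sketch of direct instantiation, assuming ModelSettings' usual required `model` path field (an assumption; it is not part of this diff) and illustrative values:

from llama_cpp.server.settings import ModelSettings

# Boolean form, unchanged behavior:
settings = ModelSettings(model="./models/model.gguf", numa=True)

# Integer form: pass a ggml_numa_strategy value, e.g. 2 for
# GGML_NUMA_STRATEGY_ISOLATE (constants are defined in llama_cpp.llama_cpp).
settings = ModelSettings(model="./models/model.gguf", numa=2)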

vendor/llama.cpp

Submodule commit pointer updated (1 addition & 1 deletion).

0 commit comments