@@ -91,6 +91,12 @@ def _load_shared_library(lib_base_name: str):
91
91
c_uint8_p = POINTER (c_uint8 )
92
92
c_size_t_p = POINTER (c_size_t )
93
93
94
+ # from ggml-backend.h
95
+ # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
96
+ ggml_backend_sched_eval_callback = ctypes .CFUNCTYPE (
97
+ c_bool , c_void_p , c_bool , c_void_p
98
+ )
99
+
94
100
# llama.h bindings
95
101
96
102
_lib .llama_max_devices .argtypes = []
@@ -448,6 +454,9 @@ class llama_model_params(Structure):
448
454
# float yarn_beta_slow; // YaRN high correction dim
449
455
# uint32_t yarn_orig_ctx; // YaRN original context size
450
456
457
+ # ggml_backend_sched_eval_callback cb_eval;
458
+ # void * cb_eval_user_data;
459
+
451
460
# enum ggml_type type_k; // data type for K cache
452
461
# enum ggml_type type_v; // data type for V cache
453
462
@@ -475,6 +484,8 @@ class llama_context_params(Structure):
475
484
yarn_beta_fast (float): YaRN low correction dim
476
485
yarn_beta_slow (float): YaRN high correction dim
477
486
yarn_orig_ctx (int): YaRN original context size
487
+ cb_eval (ggml_backend_sched_eval_callback): backend scheduler eval callback, with C signature bool (*)(struct ggml_tensor * t, bool ask, void * user_data)
488
+ cb_eval_user_data (ctypes.c_void_p): opaque pointer passed to cb_eval as its user_data argument
478
489
type_k (int): data type for K cache
479
490
type_v (int): data type for V cache
480
491
mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
@@ -497,6 +508,8 @@ class llama_context_params(Structure):
497
508
("yarn_beta_fast" , c_float ),
498
509
("yarn_beta_slow" , c_float ),
499
510
("yarn_orig_ctx" , c_uint32 ),
511
+ ("cb_eval" , ggml_backend_sched_eval_callback ),
512
+ ("cb_eval_user_data" , c_void_p ),
500
513
("type_k" , c_int ),
501
514
("type_v" , c_int ),
502
515
("mul_mat_q" , c_bool ),
0 commit comments