@@ -2,6 +2,7 @@
 import os
 import ctypes
 from ctypes import (
+    c_double,
     c_int,
     c_float,
     c_char_p,
@@ -169,6 +170,7 @@ class llama_token_data_array(Structure):
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
 
+
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -256,6 +258,34 @@ class llama_model_quantize_params(Structure):
     ]
 
 
+# // performance timing information
+# struct llama_timings {
+# double t_start_ms;
+# double t_end_ms;
+# double t_load_ms;
+# double t_sample_ms;
+# double t_p_eval_ms;
+# double t_eval_ms;
+
+
+# int32_t n_sample;
+# int32_t n_p_eval;
+# int32_t n_eval;
+# };
+class llama_timings(Structure):
+    _fields_ = [
+        ("t_start_ms", c_double),
+        ("t_end_ms", c_double),
+        ("t_load_ms", c_double),
+        ("t_sample_ms", c_double),
+        ("t_p_eval_ms", c_double),
+        ("t_eval_ms", c_double),
+        ("n_sample", c_int32),
+        ("n_p_eval", c_int32),
+        ("n_eval", c_int32),
+    ]
+
+
 # LLAMA_API struct llama_context_params llama_context_default_params();
 def llama_context_default_params() -> llama_context_params:
     return _lib.llama_context_default_params()
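
The binding above relies on _fields_ listing the members in the same order and with the same widths as the C struct, so ctypes derives an identical memory layout for the copy-by-value return. Below is a standalone sketch, not part of the commit, that inspects the layout ctypes computes; the 64-byte size and 48-byte offset assume typical 64-bit alignment rules.

import ctypes
from ctypes import Structure, c_double, c_int32

# Mirror of the binding above, declared independently so the layout can be
# inspected without loading libllama.
class llama_timings(Structure):
    _fields_ = [
        ("t_start_ms", c_double),
        ("t_end_ms", c_double),
        ("t_load_ms", c_double),
        ("t_sample_ms", c_double),
        ("t_p_eval_ms", c_double),
        ("t_eval_ms", c_double),
        ("n_sample", c_int32),
        ("n_p_eval", c_int32),
        ("n_eval", c_int32),
    ]

# 6 * 8 bytes of doubles + 3 * 4 bytes of int32s = 60 bytes of data,
# padded to 64 so the struct size is a multiple of its 8-byte alignment.
print(ctypes.sizeof(llama_timings))        # 64 on typical 64-bit ABIs
print(llama_timings.n_sample.offset)       # 48
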
@@ -991,6 +1021,15 @@ def llama_sample_token(
 # Performance information
 
 
+# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+def llama_get_timings(ctx: llama_context_p) -> llama_timings:
+    return _lib.llama_get_timings(ctx)
+
+
+_lib.llama_get_timings.argtypes = [llama_context_p]
+_lib.llama_get_timings.restype = llama_timings
+
+
 # LLAMA_API void llama_print_timings(struct llama_context * ctx);
 def llama_print_timings(ctx: llama_context_p):
     _lib.llama_print_timings(ctx)
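
A minimal usage sketch for the new timings binding, assuming the module's low-level llama_init_from_file loader (substitute whatever loader your version of the bindings exposes) and a placeholder model path; the counters only carry data after some eval and sampling calls have run.

import llama_cpp

params = llama_cpp.llama_context_default_params()
# Placeholder path; point this at a real ggml model file.
ctx = llama_cpp.llama_init_from_file(b"models/7B/ggml-model.bin", params)

# ... tokenization, llama_eval and sampling calls would go here so that
# n_eval / n_sample are non-zero ...

timings = llama_cpp.llama_get_timings(ctx)
ms_per_token = timings.t_eval_ms / max(timings.n_eval, 1)
print(f"load: {timings.t_load_ms:.2f} ms")
print(f"eval: {ms_per_token:.2f} ms/token ({1000.0 / ms_per_token:.2f} tokens/s)")

# llama_print_timings writes the same numbers to stderr in llama.cpp's own
# format; llama_get_timings makes them available to Python code instead.
llama_cpp.llama_print_timings(ctx)

llama_cpp.llama_free(ctx)
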