From 1cdadacccf8549f983474331f29b7839df1bfdb0 Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Mon, 4 Mar 2024 16:28:11 +0800
Subject: [PATCH 1/3] replace ggerganov/llama.cpp with LixiangHan/llama.cpp

---
 .gitmodules      | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..91758b5dc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = git@github.com:LixiangHan/llama.cpp.git
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 67be2ce10..d642707b7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72
+Subproject commit d642707b706be736a4c0b939d90d564c8b41a072

From e32ffd475979ed2686da60eaaf8aefd7ed491b69 Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Mon, 4 Mar 2024 17:35:20 +0800
Subject: [PATCH 2/3] export timestamps related api & remove llama_set_abort_callback

---
 llama_cpp/llama_cpp.py | 47 ++++++++++++++++++++++++++++++------------
 vendor/llama.cpp       |  2 +-
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 08adfe205..aefc886a7 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1725,19 +1725,19 @@ def llama_set_n_threads(
 
 # // Set abort callback
 # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-@ctypes_function(
-    "llama_set_abort_callback",
-    [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
-    None,
-)
-def llama_set_abort_callback(
-    ctx: llama_context_p,
-    abort_callback: Callable[[ctypes.c_void_p], None],
-    abort_callback_data: ctypes.c_void_p,
-    /,
-):
-    """Set abort callback"""
-    ...
+# @ctypes_function(
+#     "llama_set_abort_callback",
+#     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
+#     None,
+# )
+# def llama_set_abort_callback(
+#     ctx: llama_context_p,
+#     abort_callback: Callable[[ctypes.c_void_p], None],
+#     abort_callback_data: ctypes.c_void_p,
+#     /,
+# ):
+#     """Set abort callback"""
+#     ...
 
 
 # // Token logits obtained from the last call to llama_decode()
@@ -2699,3 +2699,24 @@ def llama_log_set(
 )
 def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
     ...
+
+
+# LLAMA_API void llama_set_timestamp(struct llama_context * ctx, const char * name);
+@ctypes_function(
+    "llama_set_timestamp",
+    [llama_context_p_ctypes, ctypes.c_char_p],
+    None,
+)
+def llama_set_timestamp(ctx: llama_context_p, name: bytes):
+    """Set timestamp with name"""
+    ...
+
+# LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
+@ctypes_function(
+    "llama_get_timestamp",
+    [llama_context_p_ctypes, ctypes.c_char_p],
+    ctypes.c_int64,
+)
+def llama_get_timestamp(ctx: llama_context_p, name: bytes) -> ctypes.c_int64:
+    """Get timestamp with name"""
+    ...
\ No newline at end of file
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index d642707b7..09a947927 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d642707b706be736a4c0b939d90d564c8b41a072
+Subproject commit 09a947927579ccf3f43d764cbcd73af1ca340768
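
Usage sketch (not part of the patch): the two bindings added above wrap the
C functions declared in the vendored fork. Because PATCH 1/3 repoints the
submodule, existing checkouts need `git submodule sync` and
`git submodule update --init --recursive` before these symbols resolve.
The example below is a minimal sketch: the model path and timestamp labels
are placeholders, and the int64 value returned by llama_get_timestamp is
assumed (not confirmed by this series) to be microseconds in the style of
ggml_time_us().

    import llama_cpp

    # Standard setup; error handling and teardown elided, GGUF path is
    # hypothetical.
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf", llama_cpp.llama_model_default_params()
    )
    ctx = llama_cpp.llama_new_context_with_model(
        model, llama_cpp.llama_context_default_params()
    )

    # Record named marks around the region to be measured.
    llama_cpp.llama_set_timestamp(ctx, b"decode_start")
    # ... llama_decode() calls would go here ...
    llama_cpp.llama_set_timestamp(ctx, b"decode_end")

    # Read both marks back and compute the elapsed interval.
    elapsed = llama_cpp.llama_get_timestamp(
        ctx, b"decode_end"
    ) - llama_cpp.llama_get_timestamp(ctx, b"decode_start")
    print(f"decode took {elapsed} (presumably microseconds)")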

From dcffc1127a33fd378e61e8db8dca34e6efa5946e Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Tue, 5 Mar 2024 00:55:24 -0800
Subject: [PATCH 3/3] update python binding APIs

---
 llama_cpp/llama_cpp.py | 44 ++++++++++++------------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index aefc886a7..b36e159e5 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -549,10 +549,7 @@ class llama_model_params(ctypes.Structure):
 # uint32_t n_batch; // prompt processing maximum batch size
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
-
-# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
-# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-#                                       // (ignored if no pooling layer)
+# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -571,15 +568,13 @@ class llama_model_params(ctypes.Structure):
 # enum ggml_type type_v; // data type for V cache
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 
-# // Abort callback
-# // if it returns true, execution of llama_decode() will be aborted
-# // currently works only with CPU execution
-# ggml_abort_callback abort_callback;
-# void * abort_callback_data;
+# bool enable_timing; // enable timing op
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -591,7 +586,6 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -604,11 +598,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
+        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
-        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
+        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        enable_timing (bool): enable timing op
     """
 
     _fields_ = [
@@ -618,7 +613,6 @@ class llama_context_params(ctypes.Structure):
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
-        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -631,11 +625,12 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
+        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("abort_callback", ggml_abort_callback),
-        ("abort_callback_data", ctypes.c_void_p),
+        ("do_pooling", ctypes.c_bool),
+        ("enable_timing", ctypes.c_bool),
     ]
 
 
@@ -1723,22 +1718,6 @@ def llama_set_n_threads(
     """
     ...
 
-# // Set abort callback
-# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-# @ctypes_function(
-#     "llama_set_abort_callback",
-#     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
-#     None,
-# )
-# def llama_set_abort_callback(
-#     ctx: llama_context_p,
-#     abort_callback: Callable[[ctypes.c_void_p], None],
-#     abort_callback_data: ctypes.c_void_p,
-#     /,
-# ):
-#     """Set abort callback"""
-#     ...
-
 
 # // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
@@ -2711,6 +2690,7 @@ def llama_set_timestamp(ctx: llama_context_p, name: bytes):
     """Set timestamp with name"""
     ...
 
+
 # LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
 @ctypes_function(
     "llama_get_timestamp",
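
Note (not part of the patch): after this series, llama_context_params
matches the older struct layout of the vendored fork (mul_mat_q and
do_pooling restored, abort_callback fields dropped) plus the new
enable_timing flag. A short sketch of switching timing on at context
creation follows; whether llama_set_timestamp/llama_get_timestamp require
enable_timing to be set first is an assumption about the fork's behavior,
not something these patches state.

    import llama_cpp

    # Request timing support via the context-params field added in PATCH 3/3.
    ctx_params = llama_cpp.llama_context_default_params()
    ctx_params.enable_timing = True

    # Model path is hypothetical; error handling elided.
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf", llama_cpp.llama_model_default_params()
    )
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

    # Named timestamps from PATCH 2/3 can now be recorded and read back.
    llama_cpp.llama_set_timestamp(ctx, b"prefill_done")
    print(llama_cpp.llama_get_timestamp(ctx, b"prefill_done"))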