From 1cdadacccf8549f983474331f29b7839df1bfdb0 Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Mon, 4 Mar 2024 16:28:11 +0800
Subject: [PATCH 1/3] replace ggerganov/llama.cpp with LixiangHan/llama.cpp

---
 .gitmodules      | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..91758b5dc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = git@github.com:LixiangHan/llama.cpp.git
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 67be2ce10..d642707b7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72
+Subproject commit d642707b706be736a4c0b939d90d564c8b41a072

From e32ffd475979ed2686da60eaaf8aefd7ed491b69 Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Mon, 4 Mar 2024 17:35:20 +0800
Subject: [PATCH 2/3] export timestamps related api & remove llama_set_abort_callback

---
 llama_cpp/llama_cpp.py | 47 ++++++++++++++++++++++++++++++------------
 vendor/llama.cpp       |  2 +-
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 08adfe205..aefc886a7 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1725,19 +1725,19 @@ def llama_set_n_threads(
 
 # // Set abort callback
 # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-@ctypes_function(
-    "llama_set_abort_callback",
-    [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
-    None,
-)
-def llama_set_abort_callback(
-    ctx: llama_context_p,
-    abort_callback: Callable[[ctypes.c_void_p], None],
-    abort_callback_data: ctypes.c_void_p,
-    /,
-):
-    """Set abort callback"""
-    ...
+# @ctypes_function(
+#     "llama_set_abort_callback",
+#     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
+#     None,
+# )
+# def llama_set_abort_callback(
+#     ctx: llama_context_p,
+#     abort_callback: Callable[[ctypes.c_void_p], None],
+#     abort_callback_data: ctypes.c_void_p,
+#     /,
+# ):
+#     """Set abort callback"""
+#     ...
 
 
 # // Token logits obtained from the last call to llama_decode()
@@ -2699,3 +2699,24 @@ def llama_log_set(
 )
 def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
     ...
+
+
+# LLAMA_API void llama_set_timestamp(struct llama_context * ctx, const char * name);
+@ctypes_function(
+    "llama_set_timestamp",
+    [llama_context_p_ctypes, ctypes.c_char_p],
+    None,
+)
+def llama_set_timestamp(ctx: llama_context_p, name: bytes):
+    """Set timestamp with name"""
+    ...
+
+# LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
+@ctypes_function(
+    "llama_get_timestamp",
+    [llama_context_p_ctypes, ctypes.c_char_p],
+    ctypes.c_int64,
+)
+def llama_get_timestamp(ctx: llama_context_p, name: bytes) -> ctypes.c_int64:
+    """Get timestamp with name"""
+    ...
\ No newline at end of file
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index d642707b7..09a947927 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d642707b706be736a4c0b939d90d564c8b41a072
+Subproject commit 09a947927579ccf3f43d764cbcd73af1ca340768
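
Usage sketch (not part of the patch): the two bindings added above wrap the
C functions declared in the vendored fork. Because PATCH 1/3 repoints the
submodule, existing checkouts need `git submodule sync` and
`git submodule update --init --recursive` before these symbols resolve.
The example below is a minimal sketch: the model path and timestamp labels
are placeholders, and the int64 value returned by llama_get_timestamp is
assumed (not confirmed by this series) to be microseconds in the style of
ggml_time_us().

    import llama_cpp

    # Standard setup; error handling and teardown elided, GGUF path is
    # hypothetical.
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf", llama_cpp.llama_model_default_params()
    )
    ctx = llama_cpp.llama_new_context_with_model(
        model, llama_cpp.llama_context_default_params()
    )

    # Record named marks around the region to be measured.
    llama_cpp.llama_set_timestamp(ctx, b"decode_start")
    # ... llama_decode() calls would go here ...
    llama_cpp.llama_set_timestamp(ctx, b"decode_end")

    # Read both marks back and compute the elapsed interval.
    elapsed = llama_cpp.llama_get_timestamp(
        ctx, b"decode_end"
    ) - llama_cpp.llama_get_timestamp(ctx, b"decode_start")
    print(f"decode took {elapsed} (presumably microseconds)")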

From dcffc1127a33fd378e61e8db8dca34e6efa5946e Mon Sep 17 00:00:00 2001
From: lixianghan
Date: Tue, 5 Mar 2024 00:55:24 -0800
Subject: [PATCH 3/3] update python binding APIs

---
 llama_cpp/llama_cpp.py | 44 ++++++++++++------------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index aefc886a7..b36e159e5 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -549,10 +549,7 @@ class llama_model_params(ctypes.Structure):
 # uint32_t n_batch; // prompt processing maximum batch size
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
-
-# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
-# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-#                                       // (ignored if no pooling layer)
+# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -571,15 +568,13 @@ class llama_model_params(ctypes.Structure):
 # enum ggml_type type_v; // data type for V cache
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 
-# // Abort callback
-# // if it returns true, execution of llama_decode() will be aborted
-# // currently works only with CPU execution
-# ggml_abort_callback abort_callback;
-# void * abort_callback_data;
+# bool enable_timing; // enable timing op
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -591,7 +586,6 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -604,11 +598,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
+        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
-        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
+        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        enable_timing (bool): enable timing op
     """
 
     _fields_ = [
@@ -618,7 +613,6 @@ class llama_context_params(ctypes.Structure):
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
-        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -631,11 +625,12 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
+        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("abort_callback", ggml_abort_callback),
-        ("abort_callback_data", ctypes.c_void_p),
+        ("do_pooling", ctypes.c_bool),
+        ("enable_timing", ctypes.c_bool),
     ]
 
 
@@ -1723,22 +1718,6 @@ def llama_set_n_threads(
     """
     ...
 
-# // Set abort callback
-# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-# @ctypes_function(
-#     "llama_set_abort_callback",
-#     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
-#     None,
-# )
-# def llama_set_abort_callback(
-#     ctx: llama_context_p,
-#     abort_callback: Callable[[ctypes.c_void_p], None],
-#     abort_callback_data: ctypes.c_void_p,
-#     /,
-# ):
-#     """Set abort callback"""
-#     ...
-
 
 # // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
@@ -2711,6 +2690,7 @@ def llama_set_timestamp(ctx: llama_context_p, name: bytes):
     """Set timestamp with name"""
     ...
 
+
 # LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
 @ctypes_function(
     "llama_get_timestamp",
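
Note (not part of the patch): after this series, llama_context_params
matches the older struct layout of the vendored fork (mul_mat_q and
do_pooling restored, abort_callback fields dropped) plus the new
enable_timing flag. A short sketch of switching timing on at context
creation follows; whether llama_set_timestamp/llama_get_timestamp require
enable_timing to be set first is an assumption about the fork's behavior,
not something these patches state.

    import llama_cpp

    # Request timing support via the context-params field added in PATCH 3/3.
    ctx_params = llama_cpp.llama_context_default_params()
    ctx_params.enable_timing = True

    # Model path is hypothetical; error handling elided.
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf", llama_cpp.llama_model_default_params()
    )
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

    # Named timestamps from PATCH 2/3 can now be recorded and read back.
    llama_cpp.llama_set_timestamp(ctx, b"prefill_done")
    print(llama_cpp.llama_get_timestamp(ctx, b"prefill_done"))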