From a99fd214dd0b7b97f2cd5d22ae4c7c91236aec6c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 15 Jul 2025 07:35:40 -0400
Subject: [PATCH 01/13] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 1 +
 vendor/llama.cpp       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index d13d60458..bd8e36b79 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -200,6 +200,7 @@
 """RWKV tokenizer based on greedy tokenization"""
 
 
+# NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
 # // pre-tokenization types
 # enum llama_vocab_pre_type {
 # LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index a0374a67e..bdca38376 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit a0374a67e2924f2e845cdc59dd67d9a44065a89c
+Subproject commit bdca38376f7e8dd928defe01ce6a16218a64b040

From c8579d767aa53c48221f409c49df03e4019ed771 Mon Sep 17 00:00:00 2001
From: Alcoft <101468238+alcoftTAO@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:37:45 +0000
Subject: [PATCH 02/13] fix: Better chat format for Qwen2.5-VL (#2040)

---
 llama_cpp/llama_chat_format.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a288db7b0..2a7d575ce 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3459,26 +3459,29 @@ class Qwen25VLChatHandler(Llava15ChatHandler):
     DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
 
     CHAT_FORMAT = (
-        "<|im_start|>system\n"
-        "You are a helpful assistant.<|im_end|>\n"
+        #"{% set image_count = namespace(value=0) %}"
+        #"{% set video_count = namespace(value=0) %}"
         "{% for message in messages %}"
-        "{% if message['role'] == 'user' %}"
-        "<|im_start|>user\n"
+        "{% if loop.first and message['role'] != 'system' %}"
+        "<|im_start|>system\n"
+        "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
+        "{% endif %}"
+        "<|im_start|>{{ message['role'] }}\n"
         "{% if message['content'] is string %}"
-        "{{ message['content'] }}"
+        "{{ message['content'] }}<|im_end|>\n"
         "{% else %}"
         "{% for content in message['content'] %}"
-        "{% if content['type'] == 'text' %}"
-        "{{ content['text'] }}"
-        "{% elif content['type'] == 'image_url' %}"
+        "{% if content['type'] == 'image_url' %}"
         "{% if content.image_url is string %}"
         "{{ content.image_url }}"
         "{% else %}"
         "{{ content.image_url.url }}"
         "{% endif %}"
+        #"{% set image_count.value = image_count.value + 1 %}"
+        "{% elif content['type'] == 'text' %}"
+        "{{ content['text'] }}"
         "{% endif %}"
         "{% endfor %}"
-        "{% endif %}"
         "<|im_end|>\n"
         "{% endif %}"
         "{% endfor %}"

From d9749cb9678d3ceb93b9ee63f4d723e3f01eb5c6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 15 Jul 2025 07:39:46 -0400
Subject: [PATCH 03/13] chore: Bump version

---
 CHANGELOG.md          | 5 +++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6017812bb..88b9a1b45 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.13]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@
+- fix: Better chat format for Qwen2.5-VL by @alcoftTAO in #2040
+
 ## [0.3.12]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@a0374a67e2924f2e845cdc59dd67d9a44065a89c
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index b16bb7dc9..0c869dcae 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.12"
+__version__ = "0.3.13"

From 95292e36c6cef7430cff0bda16578fe9f7303e01 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 16 Jul 2025 02:47:55 -0400
Subject: [PATCH 04/13] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 17 ++++++++++-------
 vendor/llama.cpp       |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index bd8e36b79..b9e245e2f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -179,12 +179,13 @@
 
 
 # enum llama_vocab_type {
-#     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-#     LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-#     LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+#     LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+#     LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+#     LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
 """For models without vocab"""
@@ -198,6 +199,8 @@
 """T5 tokenizer based on Unigram"""
 LLAMA_VOCAB_TYPE_RWKV = 5
 """RWKV tokenizer based on greedy tokenization"""
+LLAMA_VOCAB_TYPE_PLAMO2 = 6
+"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming"""
 
 
 # NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
@@ -2171,7 +2174,7 @@ def llama_kv_self_seq_add(
 # // - lazily on next llama_decode()
 # //   p0 < 0 : [0, p1]
 # //   p1 < 0 : [p0, inf)
-# DEPRECATED(void llama_kv_self_seq_div(
+# DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
 #     struct llama_context * ctx,
 #     llama_seq_id seq_id,
 #     llama_pos p0,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index bdca38376..79e0b68c1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit bdca38376f7e8dd928defe01ce6a16218a64b040
+Subproject commit 79e0b68c178656bb0632cb8602d2940b755077f8

From e1af05f43f57d2b660edfb77935dd2d2641ec602 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 18 Jul 2025 12:45:45 -0400
Subject: [PATCH 05/13] chore: Bump version

---
 CHANGELOG.md          | 6 +++++-
 llama_cpp/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88b9a1b45..e743c4584 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.14]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8
+
 ## [0.3.13]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@bdca38376f7e8dd928defe01ce6a16218a64b040
 - fix: Better chat format for Qwen2.5-VL by @alcoftTAO in #2040
 
 ## [0.3.12]
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 0c869dcae..409c59514 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.13"
+__version__ = "0.3.14"

From 4f260288ff0f34801f1c2ede975093201c8dcf4c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:40:18 -0700
Subject: [PATCH 06/13] feat: Update llama.cpp

---
 llama_cpp/_internals.py |  6 ++++++
 llama_cpp/llama_cpp.py  | 33 ++++++++++++++++++++++++++++++++-
 vendor/llama.cpp        |  2 +-
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 18d733481..b5175a7f2 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -287,18 +287,24 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
     def kv_cache_clear(self):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)
 
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        seq_id = seq_id if seq_id >= 0 else 0
         llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)
 
     def kv_cache_seq_keep(self, seq_id: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_keep(self.memory, seq_id)
 
     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)
 
     def get_state_size(self) -> int:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index b9e245e2f..711d42a6a 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -381,6 +381,7 @@
 # //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
 # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 #
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # bool check_tensors; // validate model tensor data
+# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
         ("use_mmap", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
+        ("use_extra_bufts", ctypes.c_bool),
     ]
 
 
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 # // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+# // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """
 
     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]
 
 
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 # const char * fname_inp,
 # const char * fname_out,
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
 )
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
 )
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...
 
 
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(
 
 # int32_t n_p_eval;
 # int32_t n_eval;
+# int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
         ("t_eval_ms", ctypes.c_double),
         ("n_p_eval", ctypes.c_int32),
         ("n_eval", ctypes.c_int32),
+        ("n_reused", ctypes.c_int32),
     ]
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 79e0b68c1..9a9638954 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 79e0b68c178656bb0632cb8602d2940b755077f8
+Subproject commit 9a96389544a08fd829fccda28142ce2066017fde

From d12ca479885bd530abf4543cd576b7eecb1b20e9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:42:58 -0700
Subject: [PATCH 07/13] misc: Update pypi downloads badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 088a23779..382f7cbed 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
-[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
+[![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python)
 [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
 
 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
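
A minimal low-level sketch of the surface PATCH 06 exposes; the field and function names are taken from the diff above, while the GGUF path is only a placeholder and not part of the patches:

    import llama_cpp

    # llama_model_params gains use_extra_bufts (extra buffer types, used for weight repacking).
    mparams = llama_cpp.llama_model_default_params()
    mparams.use_extra_bufts = True

    # llama_context_params gains kv_unified (unified KV buffer across input sequences).
    cparams = llama_cpp.llama_context_default_params()
    cparams.kv_unified = False

    llama_cpp.llama_backend_init()
    model = llama_cpp.llama_model_load_from_file(b"/path/to/model.gguf", mparams)  # placeholder path
    print(llama_cpp.llama_model_is_diffusion(model))  # new predicate; True only for diffusion models like LLaDA or Dream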

From 68e89e86c8135e865995d088ca7e5f4a38370c20 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:43:30 -0700
Subject: [PATCH 08/13] misc: Add Python 3.13 classifier tag

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 9983ef777..f5ae7b59c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 
 

From af637928db7351e030011085f818b034c6efc047 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:47:26 -0700
Subject: [PATCH 09/13] feat: Add gpt-oss chat format support through strftime_now in chat format by @iamlemec

---
 llama_cpp/llama_chat_format.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 2a7d575ce..f738ab9bb 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -8,6 +8,7 @@
 import random
 import string
 
+from datetime import datetime
 from contextlib import ExitStack
 from typing import (
     Any,
@@ -214,6 +215,10 @@ def __init__(
             lstrip_blocks=True,
         ).from_string(self.template)
 
+    @staticmethod
+    def strftime_now(f: str) -> str:
+        return datetime.now().strftime(f)
+
     def __call__(
         self,
         *,
@@ -237,6 +242,7 @@ def raise_exception(message: str):
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            strftime_now=self.strftime_now,
         )
 
         stopping_criteria = None

From 30ddd56e827e7fef6d5020809c574bdc0e166196 Mon Sep 17 00:00:00 2001
From: sergey21000 <67040429+sergey21000@users.noreply.github.com>
Date: Thu, 7 Aug 2025 16:49:10 +0300
Subject: [PATCH 10/13] fix: rename op_offloat to op_offload in llama.py (#2046)

---
 llama_cpp/llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2e93670e6..71d94ebd8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
 
-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
 
         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,

From dfc9bf503bb7d4be166410e525971509373bee0e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:53:07 -0700
Subject: [PATCH 11/13] chore: Bump version

---
 CHANGELOG.md          | 6 ++++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e743c4584..929363721 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.15]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
+- feat: Add gpt-oss chat format support through strftime_now in chat format by @iamlemec in af637928db7351e030011085f818b034c6efc047
+- fix: rename op_offloat to op_offload in llama.py by @sergey21000 in #2046
+
 ## [0.3.14]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 409c59514..1e256a776 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.14"
+__version__ = "0.3.15"

From ce6fd8bbc808196dea90dd259bbcd4301c69b0b5 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 14 Aug 2025 21:52:49 -0700
Subject: [PATCH 12/13] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 9a9638954..4227c9be4 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 9a96389544a08fd829fccda28142ce2066017fde
+Subproject commit 4227c9be4268ac844921b90f31595f81236bd317

From c37132bac860fcc333255c36313f89c4f49d4c8d Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 14 Aug 2025 21:55:21 -0700
Subject: [PATCH 13/13] chore: Bump version

---
 CHANGELOG.md          | 4 ++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 929363721..16954eb88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.16]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
+
 ## [0.3.15]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 1e256a776..c1dde7046 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.15"
+__version__ = "0.3.16"
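
A short usage sketch of the strftime_now hook from PATCH 09 and the corrected op_offload keyword from PATCH 10; the template string and model path below are illustrative placeholders, not taken from the patches:

    from llama_cpp import Llama
    from llama_cpp.llama_chat_format import Jinja2ChatFormatter

    # Templates rendered through Jinja2ChatFormatter can now call strftime_now (PATCH 09).
    formatter = Jinja2ChatFormatter(
        template=(
            "{{ strftime_now('%Y-%m-%d') }}\n"
            "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
        ),
        eos_token="</s>",
        bos_token="<s>",
    )
    print(formatter(messages=[{"role": "user", "content": "hello"}]).prompt)

    # After PATCH 10 the high-level constructor accepts op_offload (previously misspelled op_offloat).
    llm = Llama(model_path="/path/to/model.gguf", op_offload=False)  # placeholder path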