From a99fd214dd0b7b97f2cd5d22ae4c7c91236aec6c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 15 Jul 2025 07:35:40 -0400
Subject: [PATCH 01/13] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 1 +
 vendor/llama.cpp       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index d13d60458..bd8e36b79 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -200,6 +200,7 @@
 """RWKV tokenizer based on greedy tokenization"""
 
 
+# NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
 # // pre-tokenization types
 # enum llama_vocab_pre_type {
 # LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index a0374a67e..bdca38376 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit a0374a67e2924f2e845cdc59dd67d9a44065a89c
+Subproject commit bdca38376f7e8dd928defe01ce6a16218a64b040

From c8579d767aa53c48221f409c49df03e4019ed771 Mon Sep 17 00:00:00 2001
From: Alcoft <101468238+alcoftTAO@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:37:45 +0000
Subject: [PATCH 02/13] fix: Better chat format for Qwen2.5-VL (#2040)

---
 llama_cpp/llama_chat_format.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a288db7b0..2a7d575ce 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3459,26 +3459,29 @@ class Qwen25VLChatHandler(Llava15ChatHandler):
     DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
 
     CHAT_FORMAT = (
-        "<|im_start|>system\n"
-        "You are a helpful assistant.<|im_end|>\n"
+        #"{% set image_count = namespace(value=0) %}"
+        #"{% set video_count = namespace(value=0) %}"
         "{% for message in messages %}"
-        "{% if message['role'] == 'user' %}"
-        "<|im_start|>user\n"
+        "{% if loop.first and message['role'] != 'system' %}"
+        "<|im_start|>system\n"
+        "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
+        "{% endif %}"
+        "<|im_start|>{{ message['role'] }}\n"
         "{% if message['content'] is string %}"
-        "{{ message['content'] }}"
+        "{{ message['content'] }}<|im_end|>\n"
         "{% else %}"
         "{% for content in message['content'] %}"
-        "{% if content['type'] == 'text' %}"
-        "{{ content['text'] }}"
-        "{% elif content['type'] == 'image_url' %}"
+        "{% if content['type'] == 'image_url' %}"
         "{% if content.image_url is string %}"
         "{{ content.image_url }}"
         "{% else %}"
         "{{ content.image_url.url }}"
         "{% endif %}"
+        #"{% set image_count.value = image_count.value + 1 %}"
+        "{% elif content['type'] == 'text' %}"
+        "{{ content['text'] }}"
         "{% endif %}"
         "{% endfor %}"
-        "{% endif %}"
         "<|im_end|>\n"
         "{% endif %}"
         "{% endfor %}"

From d9749cb9678d3ceb93b9ee63f4d723e3f01eb5c6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 15 Jul 2025 07:39:46 -0400
Subject: [PATCH 03/13] chore: Bump version

---
 CHANGELOG.md          | 5 +++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6017812bb..88b9a1b45 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.13]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@
+- fix: Better chat format for Qwen2.5-VL by @alcoftTAO in #2040
+
 ## [0.3.12]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@a0374a67e2924f2e845cdc59dd67d9a44065a89c
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index b16bb7dc9..0c869dcae 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.12"
+__version__ = "0.3.13"

From 95292e36c6cef7430cff0bda16578fe9f7303e01 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 16 Jul 2025 02:47:55 -0400
Subject: [PATCH 04/13] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 17 ++++++++++-------
 vendor/llama.cpp       |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index bd8e36b79..b9e245e2f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -179,12 +179,13 @@
 
 
 # enum llama_vocab_type {
-#     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-#     LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-#     LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+#     LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+#     LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+#     LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
 """For models without vocab"""
@@ -198,6 +199,8 @@
 """T5 tokenizer based on Unigram"""
 LLAMA_VOCAB_TYPE_RWKV = 5
 """RWKV tokenizer based on greedy tokenization"""
+LLAMA_VOCAB_TYPE_PLAMO2 = 6
+"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming"""
 
 
 # NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
@@ -2171,7 +2174,7 @@ def llama_kv_self_seq_add(
 # // - lazily on next llama_decode()
 # //   p0 < 0 : [0, p1]
 # //   p1 < 0 : [p0, inf)
-# DEPRECATED(void llama_kv_self_seq_div(
+# DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
 #     struct llama_context * ctx,
 #     llama_seq_id seq_id,
 #     llama_pos p0,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index bdca38376..79e0b68c1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit bdca38376f7e8dd928defe01ce6a16218a64b040
+Subproject commit 79e0b68c178656bb0632cb8602d2940b755077f8

From e1af05f43f57d2b660edfb77935dd2d2641ec602 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 18 Jul 2025 12:45:45 -0400
Subject: [PATCH 05/13] chore: Bump version

---
 CHANGELOG.md          | 6 +++++-
 llama_cpp/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88b9a1b45..e743c4584 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.14]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8
+
 ## [0.3.13]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@bdca38376f7e8dd928defe01ce6a16218a64b040
 - fix: Better chat format for Qwen2.5-VL by @alcoftTAO in #2040
 
 ## [0.3.12]
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 0c869dcae..409c59514 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.13"
+__version__ = "0.3.14"

From 4f260288ff0f34801f1c2ede975093201c8dcf4c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:40:18 -0700
Subject: [PATCH 06/13] feat: Update llama.cpp

---
 llama_cpp/_internals.py |  6 ++++++
 llama_cpp/llama_cpp.py  | 33 ++++++++++++++++++++++++++++++++-
 vendor/llama.cpp        |  2 +-
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 18d733481..b5175a7f2 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -287,18 +287,24 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
     def kv_cache_clear(self):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)
 
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        seq_id = seq_id if seq_id >= 0 else 0
         llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)
 
     def kv_cache_seq_keep(self, seq_id: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_keep(self.memory, seq_id)
 
     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)
 
     def get_state_size(self) -> int:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index b9e245e2f..711d42a6a 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -381,6 +381,7 @@
 # //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
 # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 #
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # bool check_tensors; // validate model tensor data
+# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
         ("use_mmap", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
+        ("use_extra_bufts", ctypes.c_bool),
     ]
 
 
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 # // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+# // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """
 
     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]
 
 
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 # const char * fname_inp,
 # const char * fname_out,
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
 )
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
 )
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...
 
 
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(
 
 # int32_t n_p_eval;
 # int32_t n_eval;
+# int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
         ("t_eval_ms", ctypes.c_double),
         ("n_p_eval", ctypes.c_int32),
         ("n_eval", ctypes.c_int32),
+        ("n_reused", ctypes.c_int32),
     ]
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 79e0b68c1..9a9638954 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 79e0b68c178656bb0632cb8602d2940b755077f8
+Subproject commit 9a96389544a08fd829fccda28142ce2066017fde

From d12ca479885bd530abf4543cd576b7eecb1b20e9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:42:58 -0700
Subject: [PATCH 07/13] misc: Update pypi downloads badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 088a23779..382f7cbed 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
-[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
+[![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python)
 [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
 
 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
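
A minimal low-level sketch of the surface PATCH 06 exposes; the field and function names are taken from the diff above, while the GGUF path is only a placeholder and not part of the patches:

    import llama_cpp

    # llama_model_params gains use_extra_bufts (extra buffer types, used for weight repacking).
    mparams = llama_cpp.llama_model_default_params()
    mparams.use_extra_bufts = True

    # llama_context_params gains kv_unified (unified KV buffer across input sequences).
    cparams = llama_cpp.llama_context_default_params()
    cparams.kv_unified = False

    llama_cpp.llama_backend_init()
    model = llama_cpp.llama_model_load_from_file(b"/path/to/model.gguf", mparams)  # placeholder path
    print(llama_cpp.llama_model_is_diffusion(model))  # new predicate; True only for diffusion models like LLaDA or Dream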

From 68e89e86c8135e865995d088ca7e5f4a38370c20 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:43:30 -0700
Subject: [PATCH 08/13] misc: Add Python 3.13 classifier tag

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 9983ef777..f5ae7b59c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 
 

From af637928db7351e030011085f818b034c6efc047 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:47:26 -0700
Subject: [PATCH 09/13] feat: Add gpt-oss chat format support through strftime_now in chat format by @iamlemec

---
 llama_cpp/llama_chat_format.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 2a7d575ce..f738ab9bb 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -8,6 +8,7 @@
 import random
 import string
 
+from datetime import datetime
 from contextlib import ExitStack
 from typing import (
     Any,
@@ -214,6 +215,10 @@ def __init__(
             lstrip_blocks=True,
         ).from_string(self.template)
 
+    @staticmethod
+    def strftime_now(f: str) -> str:
+        return datetime.now().strftime(f)
+
     def __call__(
         self,
         *,
@@ -237,6 +242,7 @@ def raise_exception(message: str):
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            strftime_now=self.strftime_now,
         )
 
         stopping_criteria = None

From 30ddd56e827e7fef6d5020809c574bdc0e166196 Mon Sep 17 00:00:00 2001
From: sergey21000 <67040429+sergey21000@users.noreply.github.com>
Date: Thu, 7 Aug 2025 16:49:10 +0300
Subject: [PATCH 10/13] fix: rename op_offloat to op_offload in llama.py (#2046)

---
 llama_cpp/llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2e93670e6..71d94ebd8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
 
-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
 
         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,

From dfc9bf503bb7d4be166410e525971509373bee0e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 7 Aug 2025 06:53:07 -0700
Subject: [PATCH 11/13] chore: Bump version

---
 CHANGELOG.md          | 6 ++++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e743c4584..929363721 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.15]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
+- feat: Add gpt-oss chat format support through strftime_now in chat format by @iamlemec in af637928db7351e030011085f818b034c6efc047
+- fix: rename op_offloat to op_offload in llama.py by @sergey21000 in #2046
+
 ## [0.3.14]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 409c59514..1e256a776 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.14"
+__version__ = "0.3.15"

From ce6fd8bbc808196dea90dd259bbcd4301c69b0b5 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 14 Aug 2025 21:52:49 -0700
Subject: [PATCH 12/13] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 9a9638954..4227c9be4 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 9a96389544a08fd829fccda28142ce2066017fde
+Subproject commit 4227c9be4268ac844921b90f31595f81236bd317

From c37132bac860fcc333255c36313f89c4f49d4c8d Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 14 Aug 2025 21:55:21 -0700
Subject: [PATCH 13/13] chore: Bump version

---
 CHANGELOG.md          | 4 ++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 929363721..16954eb88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.16]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
+
 ## [0.3.15]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 1e256a776..c1dde7046 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.15"
+__version__ = "0.3.16"
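
A short usage sketch of the strftime_now hook from PATCH 09 and the corrected op_offload keyword from PATCH 10; the template string and model path below are illustrative placeholders, not taken from the patches:

    from llama_cpp import Llama
    from llama_cpp.llama_chat_format import Jinja2ChatFormatter

    # Templates rendered through Jinja2ChatFormatter can now call strftime_now (PATCH 09).
    formatter = Jinja2ChatFormatter(
        template=(
            "{{ strftime_now('%Y-%m-%d') }}\n"
            "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
        ),
        eos_token="</s>",
        bos_token="<s>",
    )
    print(formatter(messages=[{"role": "user", "content": "hello"}]).prompt)

    # After PATCH 10 the high-level constructor accepts op_offload (previously misspelled op_offloat).
    llm = Llama(model_path="/path/to/model.gguf", op_offload=False)  # placeholder path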