@@ -79,6 +79,7 @@ def __init__(
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
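For reference, a minimal sketch of how the new `pooling_type` parameter could be used to build an embedding model with mean pooling. The model path is a placeholder, not part of this change, and `LLAMA_POOLING_TYPE_MEAN` is assumed to be among the pooling constants the bindings expose:

```python
import llama_cpp
from llama_cpp import Llama

# Hypothetical usage of the new pooling_type parameter; the GGUF path is a placeholder.
llm = Llama(
    model_path="./models/embedding-model.gguf",  # placeholder, supply your own model
    embedding=True,  # required before calling llm.embed()
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_MEAN,  # pool token embeddings into one vector per sequence
)
vector = llm.embed("Hello, world!")  # one pooled embedding of length llm.n_embd()
```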
@@ -104,6 +105,9 @@ def __init__(
         draft_model: Optional[LlamaDraftModel] = None,
         # Tokenizer Override
         tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
         # Misc
         verbose: bool = True,
         # Extra Params
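A hedged sketch of the new KV-cache quantization knobs. `GGML_TYPE_Q8_0` is one of the ggml type constants exposed by the `llama_cpp` bindings; whether a quantized type is actually accepted for the V cache depends on the underlying llama.cpp build:

```python
import llama_cpp
from llama_cpp import Llama

# Sketch only: quantize the KV cache to 8-bit to reduce memory versus the
# default f16. The path is a placeholder.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder
    type_k=llama_cpp.GGML_TYPE_Q8_0,  # K cache quantized to q8_0
    type_v=llama_cpp.GGML_TYPE_Q8_0,  # V cache; support varies by llama.cpp build
)
```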
@@ -151,6 +155,7 @@ def __init__(
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            pooling_type: Pooling type, from `enum llama_pooling_type`.
             rope_freq_base: RoPE base frequency, 0 = from model
             rope_freq_scale: RoPE frequency scaling factor, 0 = from model
             yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
@@ -170,6 +175,8 @@ def __init__(
             draft_model: Optional draft model to use for speculative decoding.
             tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
             verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)

         Raises:
             ValueError: If the model path does not exist.
@@ -271,6 +278,7 @@ def __init__(
             if rope_scaling_type is not None
             else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
         )
+        self.context_params.pooling_type = pooling_type
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
         )
@@ -293,9 +301,13 @@ def __init__(
         self.context_params.logits_all = (
             logits_all if draft_model is None else True
         )  # Must be set to True for speculative decoding
-        self.context_params.embedding = embedding
+        self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-
+        # KV cache quantization
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size

@@ -787,7 +799,7 @@ def embed(
         n_embd = self.n_embd()
         n_batch = self.n_batch

-        if self.context_params.embedding == False:
+        if self.context_params.embeddings == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
@@ -814,9 +826,12 @@ def decode_batch(n_seq: int):

             # store embeddings
             for i in range(n_seq):
-                embedding: List[float] = llama_cpp.llama_get_embeddings_ith(
+                ptr = llama_cpp.llama_get_embeddings_seq(
                     self._ctx.ctx, i
-                )[:n_embd]
+                )
+                if not ptr:
+                    raise RuntimeError("Failed to get embeddings from sequence; pooling type is not set")
+                embedding: List[float] = ptr[:n_embd]
                 if normalize:
                     norm = float(np.linalg.norm(embedding))
                     embedding = [v / norm for v in embedding]
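The switch from `llama_get_embeddings_ith` to `llama_get_embeddings_seq` means `embed()` now fetches one pooled vector per input sequence rather than per-token embeddings, which is why a pooling type must be set on the context. A usage sketch, continuing the hypothetical `llm` from the earlier pooling example:

```python
# Each input string yields one pooled, L2-normalized vector.
texts = ["first document", "second document"]
vectors = llm.embed(texts, normalize=True)  # llm: embedding model from the earlier sketch
assert len(vectors) == len(texts)
```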
@@ -1647,6 +1662,7 @@ def create_chat_completion(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
+            logprobs=top_logprobs if logprobs else None,
             stream=stream,
             stop=stop,
             seed=seed,
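The new line forwards the chat-level `logprobs`/`top_logprobs` pair into the underlying completion call. A hedged usage sketch; the parameter names follow the OpenAI-style chat API this method mirrors:

```python
# Sketch: request per-token log probabilities through the chat API.
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Name a color."}],
    logprobs=True,    # enable logprobs in the response
    top_logprobs=3,   # alternatives per token, forwarded as shown above
    max_tokens=8,
)
```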
@@ -1717,6 +1733,7 @@ def __getstate__(self):
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
             yarn_ext_factor=self.context_params.yarn_ext_factor,
@@ -1725,7 +1742,8 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             logits_all=self.context_params.logits_all,
-            embedding=self.context_params.embedding,
+            embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
@@ -1737,51 +1755,17 @@ def __getstate__(self):
             # Chat Format Params
             chat_format=self.chat_format,
             chat_handler=self.chat_handler,
+            # Speculative Decoding
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
             # Misc
             verbose=self.verbose,
         )

     def __setstate__(self, state):
-        self.__init__(
-            model_path=state["model_path"],
-            # Model Params
-            n_gpu_layers=state["n_gpu_layers"],
-            split_mode=state["split_mode"],
-            main_gpu=state["main_gpu"],
-            tensor_split=state["tensor_split"],
-            vocab_only=state["vocab_only"],
-            use_mmap=state["use_mmap"],
-            use_mlock=state["use_mlock"],
-            kv_overrides=state["kv_overrides"],
-            # Context Params
-            seed=state["seed"],
-            n_ctx=state["n_ctx"],
-            n_batch=state["n_batch"],
-            n_threads=state["n_threads"],
-            n_threads_batch=state["n_threads_batch"],
-            rope_freq_base=state["rope_freq_base"],
-            rope_freq_scale=state["rope_freq_scale"],
-            rope_scaling_type=state["rope_scaling_type"],
-            yarn_ext_factor=state["yarn_ext_factor"],
-            yarn_attn_factor=state["yarn_attn_factor"],
-            yarn_beta_fast=state["yarn_beta_fast"],
-            yarn_beta_slow=state["yarn_beta_slow"],
-            yarn_orig_ctx=state["yarn_orig_ctx"],
-            logits_all=state["logits_all"],
-            embedding=state["embedding"],
-            # Sampling Params
-            last_n_tokens_size=state["last_n_tokens_size"],
-            # LoRA Params
-            lora_base=state["lora_base"],
-            lora_path=state["lora_path"],
-            # Backend Params
-            numa=state["numa"],
-            # Chat Format Params
-            chat_format=state["chat_format"],
-            chat_handler=state["chat_handler"],
-            # Misc
-            verbose=state["verbose"],
-        )
+        self.__init__(**state)

     def save_state(self) -> LlamaState:
         assert self._ctx.ctx is not None
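Since `__getstate__` now captures every constructor argument (including the new fields) and `__setstate__` simply re-invokes `__init__`, pickling a model serializes its configuration rather than its weights; unpickling reloads the model from `model_path`. A small sketch:

```python
import pickle

payload = pickle.dumps(llm)       # serializes constructor kwargs only
restored = pickle.loads(payload)  # re-runs __init__(**state), reloading from disk
```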