Merge branch 'main' into batch-processing · qeleb/llama-cpp-python@a625412 · GitHub


Commit a625412

Merge branch 'main' into batch-processing
2 parents 4335a9d + abda047 commit a625412

File tree

13 files changed: +171 -49 lines changed


CHANGELOG.md

Lines changed: 24 additions & 0 deletions
@@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
+
+## [0.2.24]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
+- feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
+- feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
+- feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
+- fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
+- perf: Don't convert logprobs arrays to lists by @kddubey in #1021
+- docs: Fix README.md functionary demo typo by @evelynmitchell in #996
+- examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
+
+## [0.2.23]
+
+- Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514
+- Add qwen chat format by @yhfgyyf in #1005
+- Add support for running the server with SSL by @rgerganov in #994
+- Replace logits_to_logprobs implementation with numpy equivalent to llama.cpp by @player1537 in #991
+- Fix UnsupportedOperation: fileno in suppress_stdout_stderr by @zocainViken in #961
+- Add Pygmalion chat format by @chiensen in #986
+- README.md multimodal params fix by @zocainViken in #967
+- Fix minor typo in README by @aniketmaurya in #958
+
## [0.2.22]

- Update llama.cpp to ggerganov/llama.cpp@8a7b2fa528f130631a5f43648481596ab320ed5a
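
Taken together, the 0.2.24 entries above surface directly in the high-level constructor. A minimal usage sketch (the model path is a placeholder, and whether offload_kqv has any effect depends on how llama.cpp was built):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_ctx=0,           # 0 now means "use the model's training context size" (n_ctx_train)
    offload_kqv=True,  # new flag: offload K, Q, V tensors to the GPU when supported
)
```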

README.md

Lines changed: 5 additions & 3 deletions
@@ -207,7 +207,8 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
messages = [
{
"role": "system",
-"content": "A chat between a curious user and an artificial intelligence assitant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant callse functions with appropriate input when necessary"
+"content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
+
},
{
"role": "user",
@@ -219,7 +220,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
"function": {
"name": "UserDetail",
"parameters": {
-"type": "object"
+"type": "object",
"title": "UserDetail",
"properties": {
"name": {
@@ -265,7 +266,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
>>> llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
-n_ctx=2048 # n_ctx should be increased to accomodate the image embedding
+n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+logits_all=True,# needed to make llava work
)
>>> llm.create_chat_completion(
messages = [
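
The hunk above is part of the multimodal (llava) example. A rough end-to-end sketch under the assumption that the clip model is loaded through the package's llava chat handler (the handler class name and both file paths are assumptions here, not part of this diff):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler  # assumed handler class

chat_handler = Llava15ChatHandler(clip_model_path="./path/to/llava/mmproj.bin")  # placeholder path
llm = Llama(
    model_path="./path/to/llava/llama-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,       # increased to accommodate the image embedding
    logits_all=True,  # needed to make llava work
)
```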

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 7 additions & 7 deletions
@@ -73,7 +73,7 @@
embd = []
if len(embd_inp) <= input_consumed:
logits = llama_cpp.llama_get_logits(ctx)
-n_vocab = llama_cpp.llama_n_vocab(ctx)
+n_vocab = llama_cpp.llama_n_vocab(model)

_arr = (llama_cpp.llama_token_data * n_vocab)(*[
llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
@@ -83,12 +83,12 @@
llama_cpp.llama_token_data_array(_arr, len(_arr), False))

_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
-llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
_arr,
-last_n_repeat, repeat_penalty)
-llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
-_arr,
-last_n_repeat, frequency_penalty, presence_penalty)
+penalty_last_n=last_n_repeat,
+penalty_repeat=repeat_penalty,
+penalty_freq=frequency_penalty,
+penalty_present=presence_penalty)

llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
@@ -126,4 +126,4 @@

llama_cpp.llama_print_timings(ctx)

-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)
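
Reassembled from the hunks above, the sampling block of the example now reads roughly as follows; ctx, model, candidates_p, last_n_tokens_data and the penalty values are set up earlier in the script and are only assumed here:

```python
import llama_cpp

# Vocabulary size now comes from the model handle, not the context.
logits = llama_cpp.llama_get_logits(ctx)
n_vocab = llama_cpp.llama_n_vocab(model)

# Repetition, frequency and presence penalties are now applied in a single call.
_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
    _arr,
    penalty_last_n=last_n_repeat,
    penalty_repeat=repeat_penalty,
    penalty_freq=frequency_penalty,
    penalty_present=presence_penalty)

llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
```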

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

-__version__ = "0.2.22"
+__version__ = "0.2.24"

llama_cpp/_utils.py

Lines changed: 16 additions & 10 deletions
@@ -17,14 +17,18 @@ def __enter__(self):
if self.disable:
return self

+# Check if sys.stdout and sys.stderr have fileno method
+if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
+return self  # Return the instance without making changes
+
self.outnull_file = self.open(self.os.devnull, "w")
self.errnull_file = self.open(self.os.devnull, "w")

self.old_stdout_fileno_undup = self.sys.stdout.fileno()
self.old_stderr_fileno_undup = self.sys.stderr.fileno()

-self.old_stdout_fileno = self.os.dup(self.sys.stdout.fileno())
-self.old_stderr_fileno = self.os.dup(self.sys.stderr.fileno())
+self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)

self.old_stdout = self.sys.stdout
self.old_stderr = self.sys.stderr
@@ -40,14 +44,16 @@ def __exit__(self, *_):
if self.disable:
return

-self.sys.stdout = self.old_stdout
-self.sys.stderr = self.old_stderr
+# Check if sys.stdout and sys.stderr have fileno method
+if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
+self.sys.stdout = self.old_stdout
+self.sys.stderr = self.old_stderr

-self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
-self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

-self.os.close(self.old_stdout_fileno)
-self.os.close(self.old_stderr_fileno)
+self.os.close(self.old_stdout_fileno)
+self.os.close(self.old_stderr_fileno)

-self.outnull_file.close()
-self.errnull_file.close()
+self.outnull_file.close()
+self.errnull_file.close()
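
The hasattr guard above matters because sys.stdout and sys.stderr can be swapped for objects that are not backed by a real file descriptor (pytest capture, some notebooks), in which case fileno() is unavailable and the redirect has to be skipped. A standalone sketch of the same idea for stderr only, not the library's class, just the technique:

```python
import os
import sys
from contextlib import contextmanager


@contextmanager
def maybe_suppress_stderr():
    """Redirect stderr to /dev/null, but only when it exposes a real fd."""
    if not hasattr(sys.stderr, "fileno"):
        # Wrapped stream: do nothing rather than crash.
        yield
        return
    old_fd = os.dup(sys.stderr.fileno())  # remember the original fd
    with open(os.devnull, "w") as devnull:
        try:
            os.dup2(devnull.fileno(), sys.stderr.fileno())  # point the fd at /dev/null
            yield
        finally:
            os.dup2(old_fd, sys.stderr.fileno())  # restore the original fd
            os.close(old_fd)
```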

llama_cpp/llama.py

Lines changed: 30 additions & 14 deletions
@@ -4,7 +4,6 @@
import sys
import uuid
import time
-import math
import multiprocessing

from typing import (
@@ -67,9 +66,9 @@ def __init__(
yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0,
mul_mat_q: bool = True,
-f16_kv: bool = True,
logits_all: bool = False,
embedding: bool = False,
+offload_kqv: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
@@ -87,7 +86,7 @@ def __init__(
**kwargs, # type: ignore
):
"""Load a llama.cpp model from `model_path`.
-
+
Examples:
Basic usage

@@ -133,9 +132,9 @@ def __init__(
yarn_beta_fast: YaRN low correction dim
yarn_beta_slow: YaRN high correction dim
yarn_orig_ctx: YaRN original context size
-f16_kv: Use fp16 for KV cache, fp32 otherwise
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
+offload_kqv: Offload K, Q, V to GPU.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
@@ -220,9 +219,9 @@ def __init__(
)
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
-# self.context_params.f16_kv = f16_kv
self.context_params.logits_all = logits_all
self.context_params.embedding = embedding
+self.context_params.offload_kqv = offload_kqv

# Sampling Params
self.last_n_tokens_size = last_n_tokens_size
@@ -239,6 +238,12 @@ def __init__(
self._model = _LlamaModel(
path_model=self.model_path, params=self.model_params, verbose=self.verbose
)
+# Set the default value for the context and correct the batch
+if n_ctx == 0:
+n_ctx = self._model.n_ctx_train()
+self.n_batch = min(n_ctx, n_batch)
+self.context_params.n_ctx = self._model.n_ctx_train()
+self.context_params.n_batch = self.n_batch

self._ctx = _LlamaContext(
model=self._model,
@@ -940,7 +945,7 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
self.detokenize(completion_tokens[:returned_tokens])
)
token_offset = len(prompt_tokens) + returned_tokens
-logits = self._scores[token_offset - 1, :].tolist()
+logits = self._scores[token_offset - 1, :]
token_logprob = Llama.logits_to_logprobs(logits)
sorted_logprobs = list(
sorted(
@@ -1034,7 +1039,8 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
self.detokenize(completion_tokens[:returned_tokens])
)
token_offset = len(prompt_tokens) + returned_tokens - 1
-token_logprob = Llama.logits_to_logprobs(self._scores[token_offset, :].tolist())
+logits = self._scores[token_offset, :]
+token_logprob = Llama.logits_to_logprobs(logits)
sorted_logprobs = list(
sorted(
zip(token_logprob, range(len(token_logprob))),
@@ -1101,7 +1107,7 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
for token in all_tokens
]
all_logprobs = [
-Llama.logits_to_logprobs(row.tolist()) for row in self._scores
+Llama.logits_to_logprobs(row).tolist() for row in self._scores
][token_offset:]
logprobs_or_none = _logprobs_or_none(
all_tokens, all_token_strs, all_logprobs, text_offset
@@ -1426,7 +1432,6 @@ def __getstate__(self):
yarn_beta_slow=self.context_params.yarn_beta_slow,
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
mul_mat_q=self.context_params.mul_mat_q,
-f16_kv=self.context_params.f16_kv,
logits_all=self.context_params.logits_all,
embedding=self.context_params.embedding,
# Sampling Params
@@ -1469,7 +1474,6 @@ def __setstate__(self, state):
yarn_beta_slow=state["yarn_beta_slow"],
yarn_orig_ctx=state["yarn_orig_ctx"],
mul_mat_q=state["mul_mat_q"],
-f16_kv=state["f16_kv"],
logits_all=state["logits_all"],
embedding=state["embedding"],
# Sampling Params
@@ -1557,10 +1561,22 @@ def token_nl(self) -> int:
return self._model.token_nl()

@staticmethod
-def logits_to_logprobs(logits: List[float]) -> List[float]:
-exps = [math.exp(float(x)) for x in logits]
-sum_exps = sum(exps)
-return [math.log(x / sum_exps) for x in exps]
+def logits_to_logprobs(
+logits: Union[npt.NDArray[np.single], List], axis: int = -1
+) -> npt.NDArray[np.single]:
+# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html
+logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True)
+if logits_maxs.ndim > 0:
+logits_maxs[~np.isfinite(logits_maxs)] = 0
+elif not np.isfinite(logits_maxs):
+logits_maxs = 0
+subtract_maxs = np.subtract(logits, logits_maxs, dtype=np.single)
+exp = np.exp(subtract_maxs)
+# Suppress warnings about log of zero
+with np.errstate(divide="ignore"):
+summed = np.sum(exp, axis=axis, keepdims=True)
+out = np.log(summed)
+return subtract_maxs - out

@staticmethod
def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
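
The new logits_to_logprobs is a numerically stable log-softmax over numpy arrays, which is what lets it accept either a single row of logits or a whole (n_tokens, n_vocab) score matrix without converting to Python lists. A small standalone check of the same formula (this re-implements it rather than calling the library):

```python
import numpy as np


def log_softmax(logits, axis: int = -1) -> np.ndarray:
    """Stable log-softmax: shift by the max before exponentiating."""
    x = np.asarray(logits, dtype=np.single)
    shifted = x - np.amax(x, axis=axis, keepdims=True)
    log_sum = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_sum


# A single row of logits ...
row = np.array([1.0, 2.0, 3.0], dtype=np.single)
print(log_softmax(row))

# ... or a whole score matrix at once; each row exponentiates back to a distribution.
scores = np.random.randn(4, 8).astype(np.single)
assert np.allclose(np.exp(log_softmax(scores)).sum(axis=-1), 1.0, atol=1e-5)
```

Subtracting the per-row maximum keeps np.exp from overflowing on large logits, which is the reason for moving away from the earlier math.exp implementation.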

llama_cpp/llama_chat_format.py

Lines changed: 32 additions & 0 deletions
@@ -423,6 +423,21 @@ def format_alpaca(
_prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
return ChatFormatterResponse(prompt=_prompt)

+@register_chat_format("qwen")
+def format_qwen(
+messages: List[llama_types.ChatCompletionRequestMessage],
+**kwargs: Any,
+) -> ChatFormatterResponse:
+_roles = dict(user="<|im_start|>user", assistant="<|im_start|>assistant")
+system_message="You are a helpful assistant."
+system_template="<|im_start|>system\n{system_message}"
+system_message=system_template.format(system_message=system_message)
+_messages = _map_roles(messages, _roles)
+_messages.append((_roles["assistant"], None))
+_sep = "<|im_end|>"
+_prompt = _format_chatml(system_message, _messages, _sep)
+_sep2 = "<|endoftext|>"
+return ChatFormatterResponse(prompt=_prompt,stop=_sep2)

@register_chat_format("vicuna")
def format(
@@ -637,6 +652,23 @@ def format_zephyr(
_prompt = _format_chatml(system_message, _messages, _sep)
return ChatFormatterResponse(prompt=_prompt, stop=_sep)

+
+@register_chat_format("pygmalion")
+def format_pygmalion(
+messages: List[llama_types.ChatCompletionRequestMessage],
+**kwargs: Any,
+) -> ChatFormatterResponse:
+system_template = """<|system|>{system_message}"""
+system_message = _get_system_message(messages)
+system_message = system_template.format(system_message=system_message)
+_roles = dict(user="<|user|>", assistant="<|model|>")
+_sep = "\n"
+_messages = _map_roles(messages, _roles)
+_messages.append((_roles["assistant"], None))
+_prompt = _format_chatml(system_message, _messages, _sep)
+return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+
+
@register_chat_format("chatml")
def format_chatml(
messages: List[llama_types.ChatCompletionRequestMessage],
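
Both new formatters register themselves by name, so after this change they can be selected through the chat_format argument of the high-level API. A hedged usage sketch (the model path and file name are placeholders):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/qwen-chat.Q4_K_M.gguf",  # placeholder path
    chat_format="qwen",                           # or "pygmalion" for the other new format
)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ],
)
print(response["choices"][0]["message"]["content"])
```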
