Resolve merge conflicts · MobinX/llama-cpp-python@579f526 · GitHub

Commit 579f526

Author: Shouyi Wang (committed)

Resolve merge conflicts

2 parents 9f21f54 + 6705f9b

File tree

8 files changed: +130 −78 lines changed


CHANGELOG.md

Lines changed: 19 additions & 0 deletions

@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.71]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+
+### Fixed
+
+- (server) Fix several pydantic v2 migration bugs
+
+## [0.1.70]
+
+### Fixed
+
+- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
+- (server) Fixed changed settings field names from pydantic v2 migration
+
+## [0.1.69]
+
 ### Added
 
 - (server) Streaming requests can now be interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting.

llama_cpp/llama.py

Lines changed: 6 additions & 10 deletions

@@ -833,19 +833,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if max_tokens <= 0:
-            # Unlimited, depending on n_ctx.
-            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
-                raise ValueError(
-                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-                )
-            else:
-                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
-        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
+
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
             max_tokens
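
Read as a whole, the hunk makes the context-window check unconditional and lets a non-positive `max_tokens` expand to whatever room remains after the prompt; the context lines that follow then clamp an oversized request instead of raising. A minimal sketch of that logic in isolation (the standalone `resolve_max_tokens` helper and the bare `n_ctx` parameter are illustrative stand-ins for `llama_cpp.llama_n_ctx(self.ctx)`):

def resolve_max_tokens(prompt_tokens: list, max_tokens: int, n_ctx: int) -> int:
    # A prompt that already fills the context window is always an error.
    if len(prompt_tokens) >= n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    # max_tokens <= 0 means "unlimited": take whatever room the prompt leaves.
    if max_tokens <= 0:
        max_tokens = n_ctx - len(prompt_tokens)
    # Otherwise clamp so prompt + completion still fit inside n_ctx.
    return min(max_tokens, n_ctx - len(prompt_tokens))

For example, with n_ctx=512 and a 500-token prompt, max_tokens=0 resolves to 12 and an explicit max_tokens=100 is clamped to 12.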

llama_cpp/llama_cpp.py

Lines changed: 49 additions & 6 deletions

@@ -326,13 +326,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
 
 
-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+
+
+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()
+
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None
 
 
 # LLAMA_API struct llama_model * llama_load_model_from_file(

@@ -819,6 +829,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(

@@ -1063,5 +1106,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True
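
In short, this file renames the backend bootstrap to match upstream llama.cpp (`llama_init_backend` becomes `llama_backend_init`), adds the matching `llama_backend_free` teardown, and exposes the new classifier-free guidance sampler. A hedged sketch of how the renamed init/free pair would be called through the low-level bindings; note that the final hunk shows the module already calling `llama_backend_init` once at import time, so explicit calls like these are only illustrative:

from ctypes import c_bool

import llama_cpp.llama_cpp as llama_cpp  # the low-level ctypes bindings changed above

# Initialize the llama + ggml backend once per process (numa=False here).
llama_cpp.llama_backend_init(c_bool(False))

# ... load a model, create contexts, sample, etc. ...

# Tear the backend down at exit; per the comment above, currently only used for MPI.
llama_cpp.llama_backend_free()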

llama_cpp/server/__main__.py

Lines changed: 3 additions & 3 deletions

@@ -30,14 +30,14 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    for name, field in Settings.__model_fields__.items():
-        description = field.field_info.description
+    for name, field in Settings.model_fields.items():
+        description = field.description
         if field.default is not None and description is not None:
             description += f" (default: {field.default})"
         parser.add_argument(
             f"--{name}",
             dest=name,
-            type=field.type_,
+            type=field.annotation if field.annotation is not None else str,
             help=description,
         )
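
The change here is the pydantic v2 migration: the private v1 attributes `Settings.__model_fields__`, `field.field_info.description`, and `field.type_` are replaced by the public v2 API `Settings.model_fields`, `field.description`, and `field.annotation`. A self-contained sketch of the same pattern with a made-up `DemoSettings` model:

import argparse

from pydantic import Field
from pydantic_settings import BaseSettings


class DemoSettings(BaseSettings):
    host: str = Field(default="localhost", description="Listen address")
    port: int = Field(default=8000, description="Listen port")


parser = argparse.ArgumentParser()
for name, field in DemoSettings.model_fields.items():
    description = field.description
    if field.default is not None and description is not None:
        description += f" (default: {field.default})"
    parser.add_argument(
        f"--{name}",
        dest=name,
        # v2 exposes the declared type via .annotation (v1 used .type_).
        type=field.annotation if field.annotation is not None else str,
        help=description,
    )

args = parser.parse_args(["--port", "9000"])
print(args.port)  # 9000, parsed as int because the annotation supplied the type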

llama_cpp/server/app.py

Lines changed: 49 additions & 55 deletions

@@ -84,12 +84,8 @@ class Settings(BaseSettings):
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
-    host: str = Field(
-        default="localhost", description="Listen address"
-    )
-    port: int = Field(
-        default=8000, description="Listen port"
-    )
+    host: str = Field(default="localhost", description="Listen address")
+    port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
         default=True,
         description="Whether to interrupt requests when a new request is received.",

@@ -183,7 +179,7 @@ def get_settings():
     yield settings
 
 
-model_field = Field(description="The model to use for generating completions.")
+model_field = Field(description="The model to use for generating completions.", default=None)
 
 max_tokens_field = Field(
     default=16, ge=1, le=2048, description="The maximum number of tokens to generate."

@@ -247,21 +243,18 @@ def get_settings():
     default=0,
     ge=0,
     le=2,
-    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
+    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
 )
 
 mirostat_tau_field = Field(
     default=5.0,
     ge=0.0,
     le=10.0,
-    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
+    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
 )
 
 mirostat_eta_field = Field(
-    default=0.1,
-    ge=0.001,
-    le=1.0,
-    description="Mirostat learning rate"
+    default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
 )

@@ -299,22 +292,23 @@ class CreateCompletionRequest(BaseModel):
     model: Optional[str] = model_field
     n: Optional[int] = 1
     best_of: Optional[int] = 1
-    user: Optional[str] = Field(None)
+    user: Optional[str] = Field(default=None)
 
     # llama.cpp specific parameters
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
-                "stop": ["\n", "###"],
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                    "stop": ["\n", "###"],
+                }
+            ]
         }
-
-
+    }
 
 
 def make_logit_bias_processor(

@@ -333,7 +327,7 @@ def make_logit_bias_processor(
 
     elif logit_bias_type == "tokens":
         for token, score in logit_bias.items():
-            token = token.encode('utf-8')
+            token = token.encode("utf-8")
             for input_id in llama.tokenize(token, add_bos=False):
                 to_bias[input_id] = score

@@ -357,7 +351,7 @@ async def create_completion(
     request: Request,
     body: CreateCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
-):
+) -> llama_cpp.Completion:
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

@@ -369,7 +363,7 @@ async def create_completion(
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -401,7 +395,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 
         return EventSourceResponse(
             recv_chan, data_sender_callable=partial(event_publisher, send_chan)
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
         return completion

@@ -410,16 +404,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 class CreateEmbeddingRequest(BaseModel):
     model: Optional[str] = model_field
     input: Union[str, List[str]] = Field(description="The input to embed.")
-    user: Optional[str]
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "input": "The food was delicious and the waiter...",
-            }
+    user: Optional[str] = Field(default=None)
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "input": "The food was delicious and the waiter...",
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -429,7 +424,7 @@ async def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
     return await run_in_threadpool(
-        llama.create_embedding, **request.dict(exclude={"user"})
+        llama.create_embedding, **request.model_dump(exclude={"user"})
     )

@@ -466,21 +461,22 @@ class CreateChatCompletionRequest(BaseModel):
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [
-                    ChatCompletionRequestMessage(
-                        role="system", content="You are a helpful assistant."
-                    ),
-                    ChatCompletionRequestMessage(
-                        role="user", content="What is the capital of France?"
-                    ),
-                ]
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        ChatCompletionRequestMessage(
+                            role="system", content="You are a helpful assistant."
+                        ).model_dump(),
+                        ChatCompletionRequestMessage(
+                            role="user", content="What is the capital of France?"
+                        ).model_dump(),
+                    ]
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -491,14 +487,14 @@ async def create_chat_completion(
     body: CreateChatCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
     settings: Settings = Depends(get_settings),
-) -> Union[llama_cpp.ChatCompletion]:  # type: ignore
+) -> llama_cpp.ChatCompletion:
     exclude = {
         "n",
         "logit_bias",
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -531,7 +527,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
         return EventSourceResponse(
             recv_chan,
             data_sender_callable=partial(event_publisher, send_chan),
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.ChatCompletion = await run_in_threadpool(
             llama.create_chat_completion, **kwargs  # type: ignore

@@ -551,8 +547,6 @@ class ModelList(TypedDict):
     data: List[ModelData]
 
 
-
-
 @router.get("/v1/models")
 async def get_models(
     settings: Settings = Depends(get_settings),
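
Most of this file is the same pydantic v2 migration applied to the request models: the v1 `class Config` with `schema_extra` becomes a `model_config` dict whose `json_schema_extra` carries a list of `examples`, `.dict()` becomes `.model_dump()`, and optional fields now need an explicit `default=None`. A condensed before/after sketch under those assumptions (the `ExampleRequest` model is hypothetical, not one of the server's real schemas):

from typing import Optional

from pydantic import BaseModel, Field


# pydantic v1 style, as in the deleted lines:
#
# class ExampleRequest(BaseModel):
#     user: Optional[str]
#
#     class Config:
#         schema_extra = {"example": {"prompt": "Hello"}}


# pydantic v2 style, as in the added lines:
class ExampleRequest(BaseModel):
    user: Optional[str] = Field(default=None)  # v2 no longer implies a default

    model_config = {
        "json_schema_extra": {
            "examples": [
                {"prompt": "Hello"},
            ]
        }
    }


body = ExampleRequest(user="alice")
print(body.model_dump(exclude={"user"}))  # v2 replacement for body.dict(exclude=...); prints {}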

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.68"
+version = "0.1.71"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

setup.py

Lines changed: 2 additions & 2 deletions

@@ -10,15 +10,15 @@
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.68",
+    version="0.1.71",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
     package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
     packages=["llama_cpp", "llama_cpp.server"],
     install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
     extras_require={
-        "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
+        "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
     },
     python_requires=">=3.7",
     classifiers=[

0 commit comments
