Resolve merge conflicts · MobinX/llama-cpp-python@579f526 · GitHub

Commit 579f526

Author: Shouyi Wang (committed)

Resolve merge conflicts

2 parents 9f21f54 + 6705f9b

File tree

8 files changed: +130 −78 lines changed


CHANGELOG.md

Lines changed: 19 additions & 0 deletions

@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.71]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+
+### Fixed
+
+- (server) Fix several pydantic v2 migration bugs
+
+## [0.1.70]
+
+### Fixed
+
+- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
+- (server) Fixed changed settings field names from pydantic v2 migration
+
+## [0.1.69]
+
 ### Added
 
 - (server) Streaming requests can now be interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting.

llama_cpp/llama.py

Lines changed: 6 additions & 10 deletions

@@ -833,19 +833,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if max_tokens <= 0:
-            # Unlimited, depending on n_ctx.
-            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
-                raise ValueError(
-                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-                )
-            else:
-                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
-        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
+
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
             max_tokens
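
Read as a whole, the hunk makes the context-window check unconditional and lets a non-positive `max_tokens` expand to whatever room remains after the prompt; the context lines that follow then clamp an oversized request instead of raising. A minimal sketch of that logic in isolation (the standalone `resolve_max_tokens` helper and the bare `n_ctx` parameter are illustrative stand-ins for `llama_cpp.llama_n_ctx(self.ctx)`):

def resolve_max_tokens(prompt_tokens: list, max_tokens: int, n_ctx: int) -> int:
    # A prompt that already fills the context window is always an error.
    if len(prompt_tokens) >= n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    # max_tokens <= 0 means "unlimited": take whatever room the prompt leaves.
    if max_tokens <= 0:
        max_tokens = n_ctx - len(prompt_tokens)
    # Otherwise clamp so prompt + completion still fit inside n_ctx.
    return min(max_tokens, n_ctx - len(prompt_tokens))

For example, with n_ctx=512 and a 500-token prompt, max_tokens=0 resolves to 12 and an explicit max_tokens=100 is clamped to 12.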

llama_cpp/llama_cpp.py

Lines changed: 49 additions & 6 deletions

@@ -326,13 +326,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
 
 
-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+
+
+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()
+
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None
 
 
 # LLAMA_API struct llama_model * llama_load_model_from_file(

@@ -819,6 +829,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(

@@ -1063,5 +1106,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True
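
In short, this file renames the backend bootstrap to match upstream llama.cpp (`llama_init_backend` becomes `llama_backend_init`), adds the matching `llama_backend_free` teardown, and exposes the new classifier-free guidance sampler. A hedged sketch of how the renamed init/free pair would be called through the low-level bindings; note that the final hunk shows the module already calling `llama_backend_init` once at import time, so explicit calls like these are only illustrative:

from ctypes import c_bool

import llama_cpp.llama_cpp as llama_cpp  # the low-level ctypes bindings changed above

# Initialize the llama + ggml backend once per process (numa=False here).
llama_cpp.llama_backend_init(c_bool(False))

# ... load a model, create contexts, sample, etc. ...

# Tear the backend down at exit; per the comment above, currently only used for MPI.
llama_cpp.llama_backend_free()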

llama_cpp/server/__main__.py

Lines changed: 3 additions & 3 deletions

@@ -30,14 +30,14 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    for name, field in Settings.__model_fields__.items():
-        description = field.field_info.description
+    for name, field in Settings.model_fields.items():
+        description = field.description
         if field.default is not None and description is not None:
             description += f" (default: {field.default})"
         parser.add_argument(
             f"--{name}",
             dest=name,
-            type=field.type_,
+            type=field.annotation if field.annotation is not None else str,
             help=description,
         )
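
The change here is the pydantic v2 migration: the private v1 attributes `Settings.__model_fields__`, `field.field_info.description`, and `field.type_` are replaced by the public v2 API `Settings.model_fields`, `field.description`, and `field.annotation`. A self-contained sketch of the same pattern with a made-up `DemoSettings` model:

import argparse

from pydantic import Field
from pydantic_settings import BaseSettings


class DemoSettings(BaseSettings):
    host: str = Field(default="localhost", description="Listen address")
    port: int = Field(default=8000, description="Listen port")


parser = argparse.ArgumentParser()
for name, field in DemoSettings.model_fields.items():
    description = field.description
    if field.default is not None and description is not None:
        description += f" (default: {field.default})"
    parser.add_argument(
        f"--{name}",
        dest=name,
        # v2 exposes the declared type via .annotation (v1 used .type_).
        type=field.annotation if field.annotation is not None else str,
        help=description,
    )

args = parser.parse_args(["--port", "9000"])
print(args.port)  # 9000, parsed as int because the annotation supplied the type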

llama_cpp/server/app.py

Lines changed: 49 additions & 55 deletions

@@ -84,12 +84,8 @@ class Settings(BaseSettings):
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
-    host: str = Field(
-        default="localhost", description="Listen address"
-    )
-    port: int = Field(
-        default=8000, description="Listen port"
-    )
+    host: str = Field(default="localhost", description="Listen address")
+    port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
         default=True,
         description="Whether to interrupt requests when a new request is received.",

@@ -183,7 +179,7 @@ def get_settings():
     yield settings
 
 
-model_field = Field(description="The model to use for generating completions.")
+model_field = Field(description="The model to use for generating completions.", default=None)
 
 max_tokens_field = Field(
     default=16, ge=1, le=2048, description="The maximum number of tokens to generate."

@@ -247,21 +243,18 @@ def get_settings():
     default=0,
     ge=0,
     le=2,
-    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
+    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
 )
 
 mirostat_tau_field = Field(
     default=5.0,
     ge=0.0,
     le=10.0,
-    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
+    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
 )
 
 mirostat_eta_field = Field(
-    default=0.1,
-    ge=0.001,
-    le=1.0,
-    description="Mirostat learning rate"
+    default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
 )

@@ -299,22 +292,23 @@ class CreateCompletionRequest(BaseModel):
     model: Optional[str] = model_field
     n: Optional[int] = 1
     best_of: Optional[int] = 1
-    user: Optional[str] = Field(None)
+    user: Optional[str] = Field(default=None)
 
     # llama.cpp specific parameters
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
-                "stop": ["\n", "###"],
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                    "stop": ["\n", "###"],
+                }
+            ]
         }
-
-
+    }
 
 
 def make_logit_bias_processor(

@@ -333,7 +327,7 @@ def make_logit_bias_processor(
 
     elif logit_bias_type == "tokens":
         for token, score in logit_bias.items():
-            token = token.encode('utf-8')
+            token = token.encode("utf-8")
             for input_id in llama.tokenize(token, add_bos=False):
                 to_bias[input_id] = score

@@ -357,7 +351,7 @@ async def create_completion(
     request: Request,
     body: CreateCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
-):
+) -> llama_cpp.Completion:
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

@@ -369,7 +363,7 @@ async def create_completion(
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -401,7 +395,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 
         return EventSourceResponse(
             recv_chan, data_sender_callable=partial(event_publisher, send_chan)
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
         return completion

@@ -410,16 +404,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 class CreateEmbeddingRequest(BaseModel):
     model: Optional[str] = model_field
     input: Union[str, List[str]] = Field(description="The input to embed.")
-    user: Optional[str]
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "input": "The food was delicious and the waiter...",
-            }
+    user: Optional[str] = Field(default=None)
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "input": "The food was delicious and the waiter...",
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -429,7 +424,7 @@ async def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
     return await run_in_threadpool(
-        llama.create_embedding, **request.dict(exclude={"user"})
+        llama.create_embedding, **request.model_dump(exclude={"user"})
     )

@@ -466,21 +461,22 @@ class CreateChatCompletionRequest(BaseModel):
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [
-                    ChatCompletionRequestMessage(
-                        role="system", content="You are a helpful assistant."
-                    ),
-                    ChatCompletionRequestMessage(
-                        role="user", content="What is the capital of France?"
-                    ),
-                ]
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        ChatCompletionRequestMessage(
+                            role="system", content="You are a helpful assistant."
+                        ).model_dump(),
+                        ChatCompletionRequestMessage(
+                            role="user", content="What is the capital of France?"
+                        ).model_dump(),
+                    ]
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -491,14 +487,14 @@ async def create_chat_completion(
     body: CreateChatCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
     settings: Settings = Depends(get_settings),
-) -> Union[llama_cpp.ChatCompletion]:  # type: ignore
+) -> llama_cpp.ChatCompletion:
     exclude = {
         "n",
         "logit_bias",
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -531,7 +527,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
         return EventSourceResponse(
             recv_chan,
             data_sender_callable=partial(event_publisher, send_chan),
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.ChatCompletion = await run_in_threadpool(
             llama.create_chat_completion, **kwargs  # type: ignore

@@ -551,8 +547,6 @@ class ModelList(TypedDict):
     data: List[ModelData]
 
 
-
-
 @router.get("/v1/models")
 async def get_models(
     settings: Settings = Depends(get_settings),
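
Most of this file is the same pydantic v2 migration applied to the request models: the v1 `class Config` with `schema_extra` becomes a `model_config` dict whose `json_schema_extra` carries a list of `examples`, `.dict()` becomes `.model_dump()`, and optional fields now need an explicit `default=None`. A condensed before/after sketch under those assumptions (the `ExampleRequest` model is hypothetical, not one of the server's real schemas):

from typing import Optional

from pydantic import BaseModel, Field


# pydantic v1 style, as in the deleted lines:
#
# class ExampleRequest(BaseModel):
#     user: Optional[str]
#
#     class Config:
#         schema_extra = {"example": {"prompt": "Hello"}}


# pydantic v2 style, as in the added lines:
class ExampleRequest(BaseModel):
    user: Optional[str] = Field(default=None)  # v2 no longer implies a default

    model_config = {
        "json_schema_extra": {
            "examples": [
                {"prompt": "Hello"},
            ]
        }
    }


body = ExampleRequest(user="alice")
print(body.model_dump(exclude={"user"}))  # v2 replacement for body.dict(exclude=...); prints {}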

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.68"
+version = "0.1.71"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

setup.py

Lines changed: 2 additions & 2 deletions

@@ -10,15 +10,15 @@
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.68",
+    version="0.1.71",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
     package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
     packages=["llama_cpp", "llama_cpp.server"],
     install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
     extras_require={
-        "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
+        "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
     },
     python_requires=">=3.7",
     classifiers=[

0 commit comments
