Merge branch 'main' into batch-processing · qeleb/llama-cpp-python@a625412 · GitHub


Commit a625412

Merge branch 'main' into batch-processing
2 parents 4335a9d + abda047 commit a625412

File tree

13 files changed: +171 -49 lines changed


CHANGELOG.md

Lines changed: 24 additions & 0 deletions
@@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
+
+## [0.2.24]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
+- feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
+- feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
+- feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
+- fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
+- perf: Don't convert logprobs arrays to lists by @kddubey in #1021
+- docs: Fix README.md functionary demo typo by @evelynmitchell in #996
+- examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
+
+## [0.2.23]
+
+- Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514
+- Add qwen chat format by @yhfgyyf in #1005
+- Add support for running the server with SSL by @rgerganov in #994
+- Replace logits_to_logprobs implementation with numpy equivalent to llama.cpp by @player1537 in #991
+- Fix UnsupportedOperation: fileno in suppress_stdout_stderr by @zocainViken in #961
+- Add Pygmalion chat format by @chiensen in #986
+- README.md multimodal params fix by @zocainViken in #967
+- Fix minor typo in README by @aniketmaurya in #958
+
## [0.2.22]

- Update llama.cpp to ggerganov/llama.cpp@8a7b2fa528f130631a5f43648481596ab320ed5a
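
Taken together, the 0.2.24 entries above surface directly in the high-level constructor. A minimal usage sketch (the model path is a placeholder, and whether offload_kqv has any effect depends on how llama.cpp was built):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_ctx=0,           # 0 now means "use the model's training context size" (n_ctx_train)
    offload_kqv=True,  # new flag: offload K, Q, V tensors to the GPU when supported
)
```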

README.md

Lines changed: 5 additions & 3 deletions
@@ -207,7 +207,8 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
messages = [
{
"role": "system",
-"content": "A chat between a curious user and an artificial intelligence assitant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant callse functions with appropriate input when necessary"
+"content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
+
},
{
"role": "user",
@@ -219,7 +220,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
"function": {
"name": "UserDetail",
"parameters": {
-"type": "object"
+"type": "object",
"title": "UserDetail",
"properties": {
"name": {
@@ -265,7 +266,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
>>> llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
-n_ctx=2048 # n_ctx should be increased to accomodate the image embedding
+n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+logits_all=True,# needed to make llava work
)
>>> llm.create_chat_completion(
messages = [
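
The hunk above is part of the multimodal (llava) example. A rough end-to-end sketch under the assumption that the clip model is loaded through the package's llava chat handler (the handler class name and both file paths are assumptions here, not part of this diff):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler  # assumed handler class

chat_handler = Llava15ChatHandler(clip_model_path="./path/to/llava/mmproj.bin")  # placeholder path
llm = Llama(
    model_path="./path/to/llava/llama-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,       # increased to accommodate the image embedding
    logits_all=True,  # needed to make llava work
)
```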

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 7 additions & 7 deletions
@@ -73,7 +73,7 @@
embd = []
if len(embd_inp) <= input_consumed:
logits = llama_cpp.llama_get_logits(ctx)
-n_vocab = llama_cpp.llama_n_vocab(ctx)
+n_vocab = llama_cpp.llama_n_vocab(model)

_arr = (llama_cpp.llama_token_data * n_vocab)(*[
llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
@@ -83,12 +83,12 @@
llama_cpp.llama_token_data_array(_arr, len(_arr), False))

_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
-llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
_arr,
-last_n_repeat, repeat_penalty)
-llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
-_arr,
-last_n_repeat, frequency_penalty, presence_penalty)
+penalty_last_n=last_n_repeat,
+penalty_repeat=repeat_penalty,
+penalty_freq=frequency_penalty,
+penalty_present=presence_penalty)

llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
@@ -126,4 +126,4 @@

llama_cpp.llama_print_timings(ctx)

-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)
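
Reassembled from the hunks above, the sampling block of the example now reads roughly as follows; ctx, model, candidates_p, last_n_tokens_data and the penalty values are set up earlier in the script and are only assumed here:

```python
import llama_cpp

# Vocabulary size now comes from the model handle, not the context.
logits = llama_cpp.llama_get_logits(ctx)
n_vocab = llama_cpp.llama_n_vocab(model)

# Repetition, frequency and presence penalties are now applied in a single call.
_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
    _arr,
    penalty_last_n=last_n_repeat,
    penalty_repeat=repeat_penalty,
    penalty_freq=frequency_penalty,
    penalty_present=presence_penalty)

llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
```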

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

-__version__ = "0.2.22"
+__version__ = "0.2.24"

llama_cpp/_utils.py

Lines changed: 16 additions & 10 deletions
@@ -17,14 +17,18 @@ def __enter__(self):
if self.disable:
return self

+# Check if sys.stdout and sys.stderr have fileno method
+if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
+return self  # Return the instance without making changes
+
self.outnull_file = self.open(self.os.devnull, "w")
self.errnull_file = self.open(self.os.devnull, "w")

self.old_stdout_fileno_undup = self.sys.stdout.fileno()
self.old_stderr_fileno_undup = self.sys.stderr.fileno()

-self.old_stdout_fileno = self.os.dup(self.sys.stdout.fileno())
-self.old_stderr_fileno = self.os.dup(self.sys.stderr.fileno())
+self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)

self.old_stdout = self.sys.stdout
self.old_stderr = self.sys.stderr
@@ -40,14 +44,16 @@ def __exit__(self, *_):
if self.disable:
return

-self.sys.stdout = self.old_stdout
-self.sys.stderr = self.old_stderr
+# Check if sys.stdout and sys.stderr have fileno method
+if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
+self.sys.stdout = self.old_stdout
+self.sys.stderr = self.old_stderr

-self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
-self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

-self.os.close(self.old_stdout_fileno)
-self.os.close(self.old_stderr_fileno)
+self.os.close(self.old_stdout_fileno)
+self.os.close(self.old_stderr_fileno)

-self.outnull_file.close()
-self.errnull_file.close()
+self.outnull_file.close()
+self.errnull_file.close()
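
The hasattr guard above matters because sys.stdout and sys.stderr can be swapped for objects that are not backed by a real file descriptor (pytest capture, some notebooks), in which case fileno() is unavailable and the redirect has to be skipped. A standalone sketch of the same idea for stderr only, not the library's class, just the technique:

```python
import os
import sys
from contextlib import contextmanager


@contextmanager
def maybe_suppress_stderr():
    """Redirect stderr to /dev/null, but only when it exposes a real fd."""
    if not hasattr(sys.stderr, "fileno"):
        # Wrapped stream: do nothing rather than crash.
        yield
        return
    old_fd = os.dup(sys.stderr.fileno())  # remember the original fd
    with open(os.devnull, "w") as devnull:
        try:
            os.dup2(devnull.fileno(), sys.stderr.fileno())  # point the fd at /dev/null
            yield
        finally:
            os.dup2(old_fd, sys.stderr.fileno())  # restore the original fd
            os.close(old_fd)
```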

llama_cpp/llama.py

Lines changed: 30 additions & 14 deletions
@@ -4,7 +4,6 @@
import sys
import uuid
import time
-import math
import multiprocessing

from typing import (
@@ -67,9 +66,9 @@ def __init__(
yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0,
mul_mat_q: bool = True,
-f16_kv: bool = True,
logits_all: bool = False,
embedding: bool = False,
+offload_kqv: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
@@ -87,7 +86,7 @@ def __init__(
**kwargs, # type: ignore
):
"""Load a llama.cpp model from `model_path`.
-
+
Examples:
Basic usage

@@ -133,9 +132,9 @@ def __init__(
yarn_beta_fast: YaRN low correction dim
yarn_beta_slow: YaRN high correction dim
yarn_orig_ctx: YaRN original context size
-f16_kv: Use fp16 for KV cache, fp32 otherwise
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
+offload_kqv: Offload K, Q, V to GPU.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
@@ -220,9 +219,9 @@ def __init__(
)
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
-# self.context_params.f16_kv = f16_kv
self.context_params.logits_all = logits_all
self.context_params.embedding = embedding
+self.context_params.offload_kqv = offload_kqv

# Sampling Params
self.last_n_tokens_size = last_n_tokens_size
@@ -239,6 +238,12 @@ def __init__(
self._model = _LlamaModel(
path_model=self.model_path, params=self.model_params, verbose=self.verbose
)
+# Set the default value for the context and correct the batch
+if n_ctx == 0:
+n_ctx = self._model.n_ctx_train()
+self.n_batch = min(n_ctx, n_batch)
+self.context_params.n_ctx = self._model.n_ctx_train()
+self.context_params.n_batch = self.n_batch

self._ctx = _LlamaContext(
model=self._model,
@@ -940,7 +945,7 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
self.detokenize(completion_tokens[:returned_tokens])
)
token_offset = len(prompt_tokens) + returned_tokens
-logits = self._scores[token_offset - 1, :].tolist()
+logits = self._scores[token_offset - 1, :]
token_logprob = Llama.logits_to_logprobs(logits)
sorted_logprobs = list(
sorted(
@@ -1034,7 +1039,8 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
self.detokenize(completion_tokens[:returned_tokens])
)
token_offset = len(prompt_tokens) + returned_tokens - 1
-token_logprob = Llama.logits_to_logprobs(self._scores[token_offset, :].tolist())
+logits = self._scores[token_offset, :]
+token_logprob = Llama.logits_to_logprobs(logits)
sorted_logprobs = list(
sorted(
zip(token_logprob, range(len(token_logprob))),
@@ -1101,7 +1107,7 @@ def _logprobs_or_none(all_tokens: List[int], all_token_strs: List[str], all_logp
for token in all_tokens
]
all_logprobs = [
-Llama.logits_to_logprobs(row.tolist()) for row in self._scores
+Llama.logits_to_logprobs(row).tolist() for row in self._scores
][token_offset:]
logprobs_or_none = _logprobs_or_none(
all_tokens, all_token_strs, all_logprobs, text_offset
@@ -1426,7 +1432,6 @@ def __getstate__(self):
yarn_beta_slow=self.context_params.yarn_beta_slow,
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
mul_mat_q=self.context_params.mul_mat_q,
-f16_kv=self.context_params.f16_kv,
logits_all=self.context_params.logits_all,
embedding=self.context_params.embedding,
# Sampling Params
@@ -1469,7 +1474,6 @@ def __setstate__(self, state):
yarn_beta_slow=state["yarn_beta_slow"],
yarn_orig_ctx=state["yarn_orig_ctx"],
mul_mat_q=state["mul_mat_q"],
-f16_kv=state["f16_kv"],
logits_all=state["logits_all"],
embedding=state["embedding"],
# Sampling Params
@@ -1557,10 +1561,22 @@ def token_nl(self) -> int:
return self._model.token_nl()

@staticmethod
-def logits_to_logprobs(logits: List[float]) -> List[float]:
-exps = [math.exp(float(x)) for x in logits]
-sum_exps = sum(exps)
-return [math.log(x / sum_exps) for x in exps]
+def logits_to_logprobs(
+logits: Union[npt.NDArray[np.single], List], axis: int = -1
+) -> npt.NDArray[np.single]:
+# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html
+logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True)
+if logits_maxs.ndim > 0:
+logits_maxs[~np.isfinite(logits_maxs)] = 0
+elif not np.isfinite(logits_maxs):
+logits_maxs = 0
+subtract_maxs = np.subtract(logits, logits_maxs, dtype=np.single)
+exp = np.exp(subtract_maxs)
+# Suppress warnings about log of zero
+with np.errstate(divide="ignore"):
+summed = np.sum(exp, axis=axis, keepdims=True)
+out = np.log(summed)
+return subtract_maxs - out

@staticmethod
def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
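
The new logits_to_logprobs is a numerically stable log-softmax over numpy arrays, which is what lets it accept either a single row of logits or a whole (n_tokens, n_vocab) score matrix without converting to Python lists. A small standalone check of the same formula (this re-implements it rather than calling the library):

```python
import numpy as np


def log_softmax(logits, axis: int = -1) -> np.ndarray:
    """Stable log-softmax: shift by the max before exponentiating."""
    x = np.asarray(logits, dtype=np.single)
    shifted = x - np.amax(x, axis=axis, keepdims=True)
    log_sum = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_sum


# A single row of logits ...
row = np.array([1.0, 2.0, 3.0], dtype=np.single)
print(log_softmax(row))

# ... or a whole score matrix at once; each row exponentiates back to a distribution.
scores = np.random.randn(4, 8).astype(np.single)
assert np.allclose(np.exp(log_softmax(scores)).sum(axis=-1), 1.0, atol=1e-5)
```

Subtracting the per-row maximum keeps np.exp from overflowing on large logits, which is the reason for moving away from the earlier math.exp implementation.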

llama_cpp/llama_chat_format.py

Lines changed: 32 additions & 0 deletions
@@ -423,6 +423,21 @@ def format_alpaca(
_prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
return ChatFormatterResponse(prompt=_prompt)

+@register_chat_format("qwen")
+def format_qwen(
+messages: List[llama_types.ChatCompletionRequestMessage],
+**kwargs: Any,
+) -> ChatFormatterResponse:
+_roles = dict(user="<|im_start|>user", assistant="<|im_start|>assistant")
+system_message="You are a helpful assistant."
+system_template="<|im_start|>system\n{system_message}"
+system_message=system_template.format(system_message=system_message)
+_messages = _map_roles(messages, _roles)
+_messages.append((_roles["assistant"], None))
+_sep = "<|im_end|>"
+_prompt = _format_chatml(system_message, _messages, _sep)
+_sep2 = "<|endoftext|>"
+return ChatFormatterResponse(prompt=_prompt,stop=_sep2)

@register_chat_format("vicuna")
def format(
@@ -637,6 +652,23 @@ def format_zephyr(
_prompt = _format_chatml(system_message, _messages, _sep)
return ChatFormatterResponse(prompt=_prompt, stop=_sep)

+
+@register_chat_format("pygmalion")
+def format_pygmalion(
+messages: List[llama_types.ChatCompletionRequestMessage],
+**kwargs: Any,
+) -> ChatFormatterResponse:
+system_template = """<|system|>{system_message}"""
+system_message = _get_system_message(messages)
+system_message = system_template.format(system_message=system_message)
+_roles = dict(user="<|user|>", assistant="<|model|>")
+_sep = "\n"
+_messages = _map_roles(messages, _roles)
+_messages.append((_roles["assistant"], None))
+_prompt = _format_chatml(system_message, _messages, _sep)
+return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+
+
@register_chat_format("chatml")
def format_chatml(
messages: List[llama_types.ChatCompletionRequestMessage],
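
Both new formatters register themselves by name, so after this change they can be selected through the chat_format argument of the high-level API. A hedged usage sketch (the model path and file name are placeholders):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/qwen-chat.Q4_K_M.gguf",  # placeholder path
    chat_format="qwen",                           # or "pygmalion" for the other new format
)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ],
)
print(response["choices"][0]["message"]["content"])
```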
