diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f6355893..b4f49c904 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.6]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@f7cd13301c2a88f97073fd119072b4cc92c08df1
+- fix(server): streaming resource lock by @gjpower in #1879
+
 ## [0.3.5]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@26a8406ba9198eb6fdd8329fa717555b4f77f05f
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index d7d9be881..8c6118fb4 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.5"
+__version__ = "0.3.6"
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 0481cdbcd..16c6b9d73 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -222,6 +222,7 @@
 # LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
 # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
 # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -251,6 +252,7 @@
 LLAMA_VOCAB_PRE_TYPE_EXAONE = 25
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
+LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -1090,9 +1092,10 @@ def llama_backend_free():
     ...
 
 
-# LLAMA_API struct llama_model * llama_load_model_from_file(
+# DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
 #                          const char * path_model,
-#                          struct llama_model_params params);
+#                          struct llama_model_params params),
+#         "use llama_model_load_from_file instead");
 @ctypes_function(
     "llama_load_model_from_file",
     [ctypes.c_char_p, llama_model_params],
     llama_model_p_ctypes,
 )
 def llama_load_model_from_file(
@@ -1104,6 +1107,20 @@ def llama_load_model_from_file(
     ...
 
 
+# LLAMA_API struct llama_model * llama_model_load_from_file(
+#                          const char * path_model,
+#                          struct llama_model_params params);
+@ctypes_function(
+    "llama_model_load_from_file",
+    [ctypes.c_char_p, llama_model_params],
+    llama_model_p_ctypes,
+)
+def llama_model_load_from_file(
+    path_model: bytes, params: llama_model_params, /
+) -> Optional[llama_model_p]:
+    ...
+
+
 # LLAMA_API void llama_free_model(struct llama_model * model);
 @ctypes_function(
     "llama_free_model",
@@ -1114,6 +1131,16 @@ def llama_free_model(model: llama_model_p, /):
     ...
 
 
+# LLAMA_API void llama_model_free(struct llama_model * model);
+@ctypes_function(
+    "llama_model_free",
+    [llama_model_p_ctypes],
+    None,
+)
+def llama_model_free(model: llama_model_p, /):
+    ...
+
+
 # LLAMA_API struct llama_context * llama_new_context_with_model(
 #                          struct llama_model * model,
 #                          struct llama_context_params params);
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index b6db453b8..5120f2416 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -7,7 +7,7 @@
 
 from anyio import Lock
 from functools import partial
-from typing import Iterator, List, Optional, Union, Dict
+from typing import List, Optional, Union, Dict
 
 import llama_cpp
 
@@ -155,34 +155,71 @@ def create_app(
     return app
 
 
+def prepare_request_resources(
+    body: CreateCompletionRequest | CreateChatCompletionRequest,
+    llama_proxy: LlamaProxy,
+    body_model: str | None,
+    kwargs,
+) -> llama_cpp.Llama:
+    if llama_proxy is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Service is not available",
+        )
+    llama = llama_proxy(body_model)
+    if body.logit_bias is not None:
+        kwargs["logit_bias"] = (
+            _logit_bias_tokens_to_input_ids(llama, body.logit_bias)
+            if body.logit_bias_type == "tokens"
+            else body.logit_bias
+        )
+
+    if body.grammar is not None:
+        kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
+
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+    return llama
+
+
 async def get_event_publisher(
     request: Request,
     inner_send_chan: MemoryObjectSendStream[typing.Any],
-    iterator: Iterator[typing.Any],
-    on_complete: typing.Optional[typing.Callable[[], typing.Awaitable[None]]] = None,
+    body: CreateCompletionRequest | CreateChatCompletionRequest,
+    body_model: str | None,
+    llama_call,
+    kwargs,
 ):
     server_settings = next(get_server_settings())
     interrupt_requests = (
         server_settings.interrupt_requests if server_settings else False
     )
-    async with inner_send_chan:
-        try:
-            async for chunk in iterate_in_threadpool(iterator):
-                await inner_send_chan.send(dict(data=json.dumps(chunk)))
-                if await request.is_disconnected():
-                    raise anyio.get_cancelled_exc_class()()
-                if interrupt_requests and llama_outer_lock.locked():
-                    await inner_send_chan.send(dict(data="[DONE]"))
-                    raise anyio.get_cancelled_exc_class()()
-            await inner_send_chan.send(dict(data="[DONE]"))
-        except anyio.get_cancelled_exc_class() as e:
-            print("disconnected")
-            with anyio.move_on_after(1, shield=True):
-                print(f"Disconnected from client (via refresh/close) {request.client}")
-            raise e
-        finally:
-            if on_complete:
-                await on_complete()
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+        async with inner_send_chan:
+            try:
+                iterator = await run_in_threadpool(llama_call, llama, **kwargs)
+                async for chunk in iterate_in_threadpool(iterator):
+                    await inner_send_chan.send(dict(data=json.dumps(chunk)))
+                    if await request.is_disconnected():
+                        raise anyio.get_cancelled_exc_class()()
+                    if interrupt_requests and llama_outer_lock.locked():
+                        await inner_send_chan.send(dict(data="[DONE]"))
+                        raise anyio.get_cancelled_exc_class()()
+                await inner_send_chan.send(dict(data="[DONE]"))
+            except anyio.get_cancelled_exc_class() as e:
+                print("disconnected")
+                with anyio.move_on_after(1, shield=True):
+                    print(
+                        f"Disconnected from client (via refresh/close) {request.client}"
+                    )
+                raise e
 
 
 def _logit_bias_tokens_to_input_ids(
@@ -267,18 +304,11 @@ async def create_completion(
     request: Request,
     body: CreateCompletionRequest,
 ) -> llama_cpp.Completion:
-    exit_stack = contextlib.AsyncExitStack()
-    llama_proxy = await exit_stack.enter_async_context(contextlib.asynccontextmanager(get_llama_proxy)())
-    if llama_proxy is None:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail="Service is not available",
-        )
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
 
-    llama = llama_proxy(
+    body_model = (
         body.model
         if request.url.path != "/v1/engines/copilot-codex/completions"
         else "copilot-codex"
@@ -293,44 +323,8 @@ async def create_completion(
     }
     kwargs = body.model_dump(exclude=exclude)
 
-    if body.logit_bias is not None:
-        kwargs["logit_bias"] = (
-            _logit_bias_tokens_to_input_ids(llama, body.logit_bias)
-            if body.logit_bias_type == "tokens"
-            else body.logit_bias
-        )
-
-    if body.grammar is not None:
-        kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
-
-    if body.min_tokens > 0:
-        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
-            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
-        )
-        if "logits_processor" not in kwargs:
-            kwargs["logits_processor"] = _min_tokens_logits_processor
-        else:
-            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
-
-    try:
-        iterator_or_completion: Union[
-            llama_cpp.CreateCompletionResponse,
-            Iterator[llama_cpp.CreateCompletionStreamResponse],
-        ] = await run_in_threadpool(llama, **kwargs)
-    except Exception as err:
-        await exit_stack.aclose()
-        raise err
-
-    if isinstance(iterator_or_completion, Iterator):
-        # EAFP: It's easier to ask for forgiveness than permission
-        first_response = await run_in_threadpool(next, iterator_or_completion)
-
-        # If no exception was raised from first_response, we can assume that
-        # the iterator is valid and we can use it to stream the response.
-        def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
-            yield first_response
-            yield from iterator_or_completion
-
+    # handle streaming request
+    if kwargs.get("stream", False):
         send_chan, recv_chan = anyio.create_memory_object_stream(10)
         return EventSourceResponse(
             recv_chan,
@@ -338,15 +332,29 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
                 get_event_publisher,
                 request=request,
                 inner_send_chan=send_chan,
-                iterator=iterator(),
-                on_complete=exit_stack.aclose,
+                body=body,
+                body_model=body_model,
+                llama_call=llama_cpp.Llama.__call__,
+                kwargs=kwargs,
             ),
             sep="\n",
             ping_message_factory=_ping_message_factory,
         )
-    else:
-        await exit_stack.aclose()
-        return iterator_or_completion
+
+    # handle regular request
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+
+        if await request.is_disconnected():
+            print(
+                f"Disconnected from client (via refresh/close) before llm invoked {request.client}"
+            )
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Client closed request",
+            )
+
+        return await run_in_threadpool(llama, **kwargs)
 
 
 @router.post(
@@ -474,13 +482,8 @@ async def create_chat_completion(
     # where the dependency is cleaned up before a StreamingResponse
     # is complete.
     # https://github.com/tiangolo/fastapi/issues/11143
-    exit_stack = contextlib.AsyncExitStack()
-    llama_proxy = await exit_stack.enter_async_context(contextlib.asynccontextmanager(get_llama_proxy)())
-    if llama_proxy is None:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail="Service is not available",
-        )
+
+    body_model = body.model
+
     exclude = {
         "n",
         "logit_bias_type",
@@ -488,44 +491,9 @@
         "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
-    llama = llama_proxy(body.model)
-    if body.logit_bias is not None:
-        kwargs["logit_bias"] = (
-            _logit_bias_tokens_to_input_ids(llama, body.logit_bias)
-            if body.logit_bias_type == "tokens"
-            else body.logit_bias
-        )
-
-    if body.grammar is not None:
-        kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
-
-    if body.min_tokens > 0:
-        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
-            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
-        )
-        if "logits_processor" not in kwargs:
-            kwargs["logits_processor"] = _min_tokens_logits_processor
-        else:
-            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
-
-    try:
-        iterator_or_completion: Union[
-            llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
-        ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
-    except Exception as err:
-        await exit_stack.aclose()
-        raise err
-
-    if isinstance(iterator_or_completion, Iterator):
-        # EAFP: It's easier to ask for forgiveness than permission
-        first_response = await run_in_threadpool(next, iterator_or_completion)
-
-        # If no exception was raised from first_response, we can assume that
-        # the iterator is valid and we can use it to stream the response.
-        def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
-            yield first_response
-            yield from iterator_or_completion
+    # handle streaming request
+    if kwargs.get("stream", False):
         send_chan, recv_chan = anyio.create_memory_object_stream(10)
         return EventSourceResponse(
             recv_chan,
@@ -533,15 +501,29 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
                 get_event_publisher,
                 request=request,
                 inner_send_chan=send_chan,
-                iterator=iterator(),
-                on_complete=exit_stack.aclose,
+                body=body,
+                body_model=body_model,
+                llama_call=llama_cpp.Llama.create_chat_completion,
+                kwargs=kwargs,
             ),
             sep="\n",
             ping_message_factory=_ping_message_factory,
         )
-    else:
-        await exit_stack.aclose()
-        return iterator_or_completion
+
+    # handle regular request
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+
+        if await request.is_disconnected():
+            print(
+                f"Disconnected from client (via refresh/close) before llm invoked {request.client}"
+            )
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Client closed request",
+            )
+
+        return await run_in_threadpool(llama.create_chat_completion, **kwargs)
 
 
 @router.get(
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 716bd6dec..f7cd13301 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 716bd6dec3e044e5c325386b5b0483392b24cefe
+Subproject commit f7cd13301c2a88f97073fd119072b4cc92c08df1
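
Reviewer note: a minimal sketch of how the new low-level bindings added to llama_cpp/llama_cpp.py above could be exercised, assuming a GGUF model at the hypothetical path shown. llama_backend_init, llama_backend_free, and llama_model_default_params are pre-existing bindings in this module; llama_model_load_from_file / llama_model_free are the new pair replacing the now-deprecated llama_load_model_from_file / llama_free_model.

import llama_cpp

# Initialize the ggml/llama.cpp backend once per process.
llama_cpp.llama_backend_init()

params = llama_cpp.llama_model_default_params()
# Hypothetical model path, for illustration only.
model = llama_cpp.llama_model_load_from_file(b"/models/example.gguf", params)
if model is None:
    raise RuntimeError("failed to load model")

# Release via the new binding rather than the deprecated llama_free_model.
llama_cpp.llama_model_free(model)
llama_cpp.llama_backend_free()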