From 3c402c579fd9ef19cba51ae8ad32b41829fbcc0b Mon Sep 17 00:00:00 2001
From: Daniel Thuerck
Date: Fri, 3 May 2024 07:37:43 +0200
Subject: [PATCH] Propagate flash attn to model load.

---
 llama_cpp/server/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index e102fadbd..f00292410 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -242,6 +242,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         logits_all=settings.logits_all,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
+        flash_attn=settings.flash_attn,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params
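
Note: as a minimal sketch of what the patched loader effectively does once settings.flash_attn is forwarded, the snippet below constructs a Llama instance directly with the same flag. The model path is a placeholder, not part of the patch; flash_attn and offload_kqv are existing constructor parameters of llama_cpp.Llama in recent releases.

    import llama_cpp

    # Hedged sketch: equivalent direct construction after the patch,
    # i.e. the server loader now passes flash_attn through to Llama().
    llm = llama_cpp.Llama(
        model_path="./models/model.gguf",  # placeholder path, assumption
        offload_kqv=True,
        flash_attn=True,  # honored by the server loader after this change
    )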