From b06a954bbc5ee62e5274e5badb63bbf38788741d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Mon, 19 May 2025 13:38:36 -0400
Subject: [PATCH] llama_encode : only force non-causal attention for enc-dec
 models

---
 src/llama-context.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a3b84a6a82e74..1653776a8e40b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -732,10 +732,12 @@ int llama_context::encode(llama_batch & inp_batch) {
 
     const auto causal_attn_org = cparams.causal_attn;
 
-    // always use non-causal attention for encoder graphs
-    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
-    //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
-    cparams.causal_attn = false;
+    if (model.arch == LLM_ARCH_T5) {
+        // always use non-causal attention for encoder graphs
+        // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+        //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+        cparams.causal_attn = false;
+    }
 
     auto * gf = graph_init();
     auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);