llama : do not allocate KV cache for "vocab_only == true" (#682) · unbounded/llama.cpp@81040f1 · GitHub

Commit 81040f1

llama : do not allocate KV cache for "vocab_only == true" (ggml-org#682)
Fixes sanitizer CI
1 parent c4f89d8 commit 81040f1

File tree

1 file changed: +1 -1 lines changed


llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -1608,7 +1608,7 @@ struct llama_context * llama_init_from_file(
     }
 
     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
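
Not part of the commit, but for context, a minimal sketch of the caller side this change affects: a program that loads a model with vocab_only set in llama_context_params (for example, to use only the tokenizer) will now skip the self-attention KV cache allocation inside llama_init_from_file. The model path and the exact sequence of calls below are illustrative assumptions based on the llama.h C API of this period, not code from the commit.

#include <stdio.h>
#include "llama.h"

int main(void) {
    // Illustrative only: request a vocab-only load (e.g. for tokenization),
    // which after this commit no longer allocates the KV cache.
    struct llama_context_params params = llama_context_default_params();
    params.vocab_only = true;

    // "models/7B/ggml-model.bin" is a placeholder path, not from the commit.
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    printf("n_vocab = %d\n", llama_n_vocab(ctx));

    llama_free(ctx);
    return 0;
}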

0 commit comments
