Merge remote-tracking branch 'origin/master' into tool-diffs · ochafik/llama.cpp@40951c8

Commit 40951c8

Author: ochafik (committed)
Merge remote-tracking branch 'origin/master' into tool-diffs
2 parents: f0d5df2 + b775345

File tree

16 files changed: +377 -204 lines changed

.github/workflows/winget.yml

Lines changed: 42 additions & 0 deletions
```diff
@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --submit \
+            ggml.llamacpp
```

src/llama-hparams.cpp

Lines changed: 17 additions & 1 deletion
```diff
@@ -2,6 +2,22 @@
 
 #include "ggml.h"
 
+void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+    }
+}
+
+bool llama_hparams::is_swa_any() const {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (swa_layers[il]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 uint32_t llama_hparams::n_head(uint32_t il) const {
     if (il < n_layer) {
         return n_head_arr[il];
@@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
 
 bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
-        return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
+        return swa_layers[il];
     }
 
     GGML_ABORT("fatal error");
```
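To make the new mapping concrete, here is a minimal standalone sketch (not part of the commit) that reproduces the `set_swa_pattern()` condition for `n_pattern = 3` and prints the resulting layer layout; the layer count and array bound are made-up stand-ins for the real hparams fields:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint32_t n_layer = 7;          // stand-in for hparams.n_layer
    std::array<bool, n_layer> swa_layers{};  // stand-in for hparams.swa_layers

    const uint32_t n_pattern = 3;            // every 3rd layer is dense
    for (uint32_t il = 0; il < n_layer; ++il) {
        // same condition as llama_hparams::set_swa_pattern()
        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
    }

    for (uint32_t il = 0; il < n_layer; ++il) {
        printf("il == %u: %s\n", il, swa_layers[il] ? "swa" : "dense");
    }
    // output: swa, swa, dense, swa, swa, dense, swa -- matching the
    // example layout documented in llama-hparams.h
    return 0;
}
```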

src/llama-hparams.h

Lines changed: 23 additions & 14 deletions
```diff
@@ -102,20 +102,12 @@ struct llama_hparams {
 
     // Sliding Window Attention (SWA)
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
-
-    uint32_t n_swa = 0;         // the size of the sliding window (0 - no SWA)
-    uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. non-SWA)
-                                // by default n == 1, all layers are dense
-                                // note that if n_swa_pattern == 0, all layers are SWA
-                                // example: n_swa_pattern = 3
-                                //   il == 0: swa
-                                //   il == 1: swa
-                                //   il == 2: dense
-                                //   il == 3: swa
-                                //   il == 4: swa
-                                //   il == 5: dense
-                                //   il == 6: swa
-                                //   etc ...
+    // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa = 0;
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // by default, all layers are dense
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -153,6 +145,23 @@ struct llama_hparams {
     enum llama_rope_type         rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
+    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // note that if n_pattern == 0, all layers are SWA
+    //           if n_pattern == 1, all layers are dense
+    // example: n_pattern = 3
+    //   il == 0: swa
+    //   il == 1: swa
+    //   il == 2: dense
+    //   il == 3: swa
+    //   il == 4: swa
+    //   il == 5: dense
+    //   il == 6: swa
+    //   etc ...
+    void set_swa_pattern(uint32_t n_pattern);
+
+    // return true if one of the layers is SWA
+    bool is_swa_any() const;
+
     uint32_t n_head(uint32_t il = 0) const;
 
     uint32_t n_head_kv(uint32_t il = 0) const;
```
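As a side note, `is_swa_any()` is a plain linear scan over the new array; an equivalent formulation with `std::any_of` is sketched below. The bound 512 stands in for `LLAMA_MAX_LAYERS`, and this is not the code the commit adds:

```cpp
#include <algorithm>
#include <array>
#include <cstdint>

// sketch: same answer as llama_hparams::is_swa_any(), via std::any_of
static bool is_swa_any(const std::array<bool, 512> & swa_layers, uint32_t n_layer) {
    return std::any_of(swa_layers.begin(), swa_layers.begin() + n_layer,
                       [](bool swa) { return swa; });
}
```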

src/llama-model.cpp

Lines changed: 12 additions & 9 deletions
```diff
@@ -463,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
 
+    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -574,7 +577,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
                 hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
+                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
 
                 switch (hparams.n_expert) {
                     case 16: type = LLM_TYPE_17B_16E; break;
@@ -863,7 +866,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 
                     hparams.n_swa = 0;
-                    hparams.n_swa_pattern = 1;
+                    hparams.set_swa_pattern(1);
                 }
             } break;
         case LLM_ARCH_PHIMOE:
@@ -935,7 +938,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.n_swa = 4096; // default value of gemma 2
-                hparams.n_swa_pattern = 2;
+                hparams.set_swa_pattern(2);
                 hparams.attn_soft_cap = true;
 
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -953,7 +956,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GEMMA3:
             {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.n_swa_pattern = 6;
+                hparams.set_swa_pattern(6);
 
                 hparams.rope_freq_base_train_swa  = 10000.0f;
                 hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1038,7 +1041,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_COHERE2:
             {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.n_swa_pattern = 4;
+                hparams.set_swa_pattern(4);
 
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -4320,7 +4323,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: n_head_kv     = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
     LLAMA_LOG_INFO("%s: n_rot         = %u\n", __func__, hparams.n_rot);
     LLAMA_LOG_INFO("%s: n_swa         = %u\n", __func__, hparams.n_swa);
-    LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
+    LLAMA_LOG_INFO("%s: is_swa_any    = %u\n", __func__, hparams.is_swa_any());
     LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
     LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa         = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -13216,7 +13219,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
             if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                GGML_ASSERT(hparams.n_swa_pattern != 1);
+                GGML_ASSERT(hparams.is_swa_any());
 
                 res = new llama_kv_cache_unified_iswa(
                         *this,
@@ -13230,7 +13233,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         cparams.n_batch,
                         padding);
             } else {
-                GGML_ASSERT(hparams.n_swa_pattern == 1);
+                GGML_ASSERT(!hparams.is_swa_any());
 
                 res = new llama_kv_cache_unified(
                         *this,
```
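Beyond the mechanical rename, the per-layer array lets a loader describe layouts that the old periodic scalar could not, which is why the asserts in `create_memory()` now ask `is_swa_any()` instead of comparing `n_swa_pattern` against 1. A self-contained sketch of a hypothetical irregular layout (the struct is a reduced stand-in, not the real `llama_hparams`):

```cpp
#include <array>
#include <cstdint>

// reduced stand-in for llama_hparams, just for this sketch
struct hparams_sketch {
    uint32_t n_layer = 8;
    std::array<bool, 8> swa_layers{};

    bool is_swa_any() const {
        for (uint32_t il = 0; il < n_layer; ++il) {
            if (swa_layers[il]) { return true; }
        }
        return false;
    }
};

int main() {
    hparams_sketch hp;
    // hypothetical: only the first half of the layers is SWA -- a layout
    // no single n_swa_pattern value could have described
    for (uint32_t il = 0; il < hp.n_layer; ++il) {
        hp.swa_layers[il] = il < hp.n_layer / 2;
    }
    // the KV-cache selection only needs this yes/no answer
    return hp.is_swa_any() ? 0 : 1;
}
```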

tools/mtmd/mtmd-helper.cpp

Lines changed: 2 additions & 22 deletions
```diff
@@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
         auto chunk = mtmd_input_chunks_get(chunks, i);
-        auto chunk_type = mtmd_input_chunk_get_type(chunk);
-        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens_text;
-            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
-            n_tokens += n_tokens_text;
-        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
-            n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
+        n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
     }
     return n_tokens;
 }
@@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
     llama_pos n_pos = 0;
     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
         auto chunk = mtmd_input_chunks_get(chunks, i);
-        auto chunk_type = mtmd_input_chunk_get_type(chunk);
-        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens_text;
-            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
-            n_pos += n_tokens_text;
-        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
-            n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
+        n_pos += mtmd_input_chunk_get_n_pos(chunk);
     }
     return n_pos;
 }
```
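The change collapses the per-call-site type dispatch into the `mtmd_input_chunk_get_n_tokens()` / `mtmd_input_chunk_get_n_pos()` accessors. The shape of that refactor, reduced to a self-contained sketch with illustrative names (not the real mtmd API):

```cpp
#include <cstddef>
#include <vector>

// illustrative stand-in for an mtmd input chunk
struct chunk_sketch {
    enum { TEXT, IMAGE } type;
    size_t n_text_tokens  = 0;
    size_t n_image_tokens = 0;

    // the accessor owns the type dispatch once, instead of every caller
    size_t get_n_tokens() const {
        return type == TEXT ? n_text_tokens : n_image_tokens;
    }
};

// after the refactor, helpers collapse to a single accumulation loop
static size_t total_tokens(const std::vector<chunk_sketch> & chunks) {
    size_t n = 0;
    for (const auto & c : chunks) {
        n += c.get_n_tokens();
    }
    return n;
}
```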

tools/mtmd/mtmd.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -751,6 +751,10 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
     return bitmap->data.data();
 }
 
+size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+    return bitmap->data.size();
+}
+
 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
     return bitmap->is_audio;
 }
```

tools/mtmd/mtmd.h

Lines changed: 7 additions & 5 deletions
```diff
@@ -119,11 +119,12 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // the data is in float format (PCM F32)
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
-MTMD_API uint32_t              mtmd_bitmap_get_nx  (const mtmd_bitmap * bitmap);
-MTMD_API uint32_t              mtmd_bitmap_get_ny  (const mtmd_bitmap * bitmap);
-MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
-MTMD_API bool                  mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap);
-MTMD_API void                  mtmd_bitmap_free    (mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
+MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
+MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
@@ -322,6 +323,7 @@ struct bitmap {
     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
     std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
 };
```
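The new `mtmd_bitmap_get_n_bytes()` matters because the payload size is only `nx*ny*3` for RGB images; audio bitmaps carry PCM F32 samples, so callers hashing the buffer (as server.cpp does below) need the real byte count. Here is a sketch of an FNV-1a hash over `data()`/`n_bytes()`; it mirrors the idea behind the server's `fnv_hash()` but is not its actual implementation:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>

// sketch: 64-bit FNV-1a over a bitmap payload of n_bytes bytes
static std::string fnv1a_hex(const unsigned char * data, size_t n_bytes) {
    uint64_t h = 0xcbf29ce484222325ull;   // FNV offset basis
    for (size_t i = 0; i < n_bytes; ++i) {
        h ^= data[i];
        h *= 0x100000001b3ull;            // FNV prime
    }
    char buf[17];
    snprintf(buf, sizeof(buf), "%016llx", (unsigned long long) h);
    return std::string(buf);
}
```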

tools/server/public/index.html.gz

528 Bytes
Binary file not shown.

tools/server/server.cpp

Lines changed: 22 additions & 17 deletions
```diff
@@ -1885,6 +1885,7 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     common_chat_templates_ptr chat_templates;
+    oaicompat_parser_options oai_parser_opt;
 
     ~server_context() {
         mtmd_free(mctx);
@@ -2080,6 +2081,15 @@ struct server_context {
         }
 
         metrics.init();
+
+        oai_parser_opt = {
+            /* use_jinja             */ params_base.use_jinja,
+            /* prefill_assistant     */ params_base.prefill_assistant,
+            /* reasoning_format      */ params_base.reasoning_format,
+            /* common_chat_templates */ chat_templates.get(),
+            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
+            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
+        };
     }
 
     server_slot * get_slot_by_id(int id) {
@@ -4089,7 +4099,10 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model.path },
-            { "modalities",                  json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
+            { "modalities",                  json{
+                {"vision", ctx_server.oai_parser_opt.allow_image},
+                {"audio",  ctx_server.oai_parser_opt.allow_audio},
+            } },
             { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
             { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
             { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4180,10 +4193,10 @@ int main(int argc, char ** argv) {
                 for (auto & file : files) {
                     mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
                     if (!bmp.ptr) {
-                        throw std::runtime_error("Failed to load image");
+                        throw std::runtime_error("Failed to load image or audio file");
                     }
                     // calculate bitmap hash (for KV caching)
-                    std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+                    std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
                     bmp.set_id(hash.c_str());
                     bitmaps.entries.push_back(std::move(bmp));
                 }
@@ -4415,7 +4428,7 @@ int main(int argc, char ** argv) {
                 OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
     };
 
-    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         LOG_DBG("request: %s\n", req.body.c_str());
         if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
@@ -4424,13 +4437,9 @@ int main(int argc, char ** argv) {
 
         auto body = json::parse(req.body);
         std::vector<raw_buffer> files;
-        json data = oaicompat_completion_params_parse(
+        json data = oaicompat_chat_params_parse(
             body,
-            params.use_jinja,
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
             files);
 
         handle_completions_impl(
@@ -4443,16 +4452,12 @@ int main(int argc, char ** argv) {
     };
 
     // same with handle_chat_completions, but without inference part
-    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
         std::vector<raw_buffer> files; // dummy, unused
-        json data = oaicompat_completion_params_parse(
+        json data = oaicompat_chat_params_parse(
             body,
-            params.use_jinja,
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
             files);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
```
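Bundling the five loose parser arguments into a single `oai_parser_opt` built once at init also means new capabilities (like the audio flag) reach every call site without touching each handler's signature. A reduced sketch of the options-struct pattern, with illustrative names rather than the real server types:

```cpp
#include <string>

// illustrative reduction of the options-struct refactor in server.cpp
struct parser_options_sketch {
    bool        use_jinja   = false;
    bool        allow_image = false;
    bool        allow_audio = false;
    std::string chat_template;
};

struct server_sketch {
    parser_options_sketch opts; // built once in init(), reused by every handler

    void init(bool has_vision, bool has_audio) {
        opts.use_jinja     = true;
        opts.allow_image   = has_vision;
        opts.allow_audio   = has_audio;
        opts.chat_template = "chatml";
    }

    // handlers take the whole bundle instead of five loose parameters
    void handle_chat_completions() { /* parse(body, opts, files); */ }
};
```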

tools/server/tests/unit/test_vision_api.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -30,6 +30,7 @@ def create_server():
         ("What is this:\n", "malformed", False, None),
         ("What is this:\n", "https://google.com/404", False, None), # non-existent image
         ("What is this:\n", "https://ggml.ai", False, None), # non-image data
+        # TODO @ngxson : test with multiple images, no images and with audio
     ]
 )
 def test_vision_chat_completion(prompt, image_url, success, re_content):
```
