@@ -86,6 +86,7 @@ mtmd_context_params mtmd_context_params_default() {
86
86
params.n_threads = 4 ;
87
87
params.verbosity = GGML_LOG_LEVEL_INFO;
88
88
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
89
+ params.media_marker = MTMD_DEFAULT_MEDIA_MARKER;
89
90
return params;
90
91
}
91
92
@@ -96,7 +97,7 @@ struct mtmd_context {
96
97
97
98
bool print_timings;
98
99
int n_threads;
99
- std::string image_marker ;
100
+ std::string media_marker ;
100
101
bool has_vision;
101
102
bool has_audio;
102
103
@@ -127,8 +128,12 @@ struct mtmd_context {
127
128
text_model (text_model),
128
129
print_timings (ctx_params.print_timings),
129
130
n_threads (ctx_params.n_threads),
130
- image_marker (ctx_params.image_marker )
131
+ media_marker (ctx_params.media_marker )
131
132
{
133
+ if (std::string (ctx_params.image_marker ) != MTMD_DEFAULT_IMAGE_MARKER) {
134
+ throw std::runtime_error (" custom image_marker is not supported anymore, use media_marker instead" );
135
+ }
136
+
132
137
clip_context_params ctx_clip_params;
133
138
ctx_clip_params.use_gpu = ctx_params.use_gpu ;
134
139
ctx_clip_params.verbosity = ctx_params.verbosity ;
@@ -269,48 +274,51 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
269
274
auto vocab = llama_model_get_vocab (ctx->text_model );
270
275
271
276
std::string prompt_modified (text->text );
272
- std::string marker_modified (ctx->image_marker );
277
+ std::string marker_modified (ctx->media_marker );
273
278
projector_type proj_type = clip_get_projector_type (ctx->ctx_clip );
274
279
280
+ // for compatibility, we convert image marker to media marker
281
+ string_replace_all (prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker );
282
+
275
283
// a bit hacky here, but works for now
276
284
// for some models, we need to add prefix and suffix to the image embeddings
277
285
if (clip_is_gemma3 (ctx->ctx_clip )) {
278
286
// gemma 3
279
287
// <start_of_image> ... (image embeddings) ... <end_of_image>
280
- marker_modified = " <start_of_image>" + ctx->image_marker + " <end_of_image>" ;
281
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
288
+ marker_modified = " <start_of_image>" + ctx->media_marker + " <end_of_image>" ;
289
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
282
290
283
291
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
284
292
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
285
- marker_modified = " <fake_token_around_image><global-img>" + ctx->image_marker + " <fake_token_around_image>" ;
286
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
293
+ marker_modified = " <fake_token_around_image><global-img>" + ctx->media_marker + " <fake_token_around_image>" ;
294
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
287
295
288
296
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
289
297
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
290
- marker_modified = ctx->image_marker + " [IMG_END]" ;
291
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
298
+ marker_modified = ctx->media_marker + " [IMG_END]" ;
299
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
292
300
293
301
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
294
302
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
295
- marker_modified = " <|vision_start|>" + ctx->image_marker + " <|vision_end|>" ;
296
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
303
+ marker_modified = " <|vision_start|>" + ctx->media_marker + " <|vision_end|>" ;
304
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
297
305
298
306
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
299
307
// (more details in mtmd_context constructor)
300
- marker_modified = " <|image_start|>" + ctx->image_marker + " <|image_end|>" ;
301
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
308
+ marker_modified = " <|image_start|>" + ctx->media_marker + " <|image_end|>" ;
309
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
302
310
303
311
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
304
312
// <img> ... (image embeddings) ... </img>
305
- marker_modified = " <img>" + ctx->image_marker + " </img>" ;
306
- string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
313
+ marker_modified = " <img>" + ctx->media_marker + " </img>" ;
314
+ string_replace_all (prompt_modified, ctx->media_marker , marker_modified);
307
315
308
316
}
309
317
310
318
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
311
319
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
312
320
313
- std::vector<std::string> parts = string_split_str (prompt_modified, ctx->image_marker );
321
+ std::vector<std::string> parts = string_split_str (prompt_modified, ctx->media_marker );
314
322
output->entries .clear ();
315
323
output->entries .reserve (parts.size ());
316
324
@@ -820,6 +828,15 @@ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
820
828
}
821
829
}
822
830
831
+ const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk) { // returns the id string of an image or audio chunk, nullptr for any other chunk type; chunk is dereferenced without a null check
832
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { // image chunk: the id is read from tokens_image
833
+ return chunk->tokens_image ->id .c_str (); // NOTE(review): pointer comes from the id member's c_str(); presumably valid only while the chunk's tokens_image is alive and unmodified - confirm
834
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { // audio chunk: the id is read from tokens_audio
835
+ return chunk->tokens_audio ->id .c_str (); // NOTE(review): same lifetime assumption as the image branch - confirm
836
+ }
837
+ return nullptr ; // chunk is neither image nor audio, so there is no id to report
838
+ }
839
+
823
840
mtmd_input_chunk * mtmd_input_chunk_copy (const mtmd_input_chunk * chunk) {
824
841
mtmd_input_chunk * copy = new mtmd_input_chunk{
825
842
chunk->type ,
0 commit comments