mtmd : add ultravox audio input by ngxson · Pull Request #13623 · ggml-org/llama.cpp · GitHub
[go: up one dir, main page]

Skip to content

mtmd : add ultravox audio input #13623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
May 22, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4fa0c27
convert ok, load ok
ngxson May 4, 2025
8b73116
warmup ok
ngxson May 4, 2025
4ac7940
test
ngxson May 4, 2025
4282465
still does not work?
ngxson May 4, 2025
45cdb7f
fix padding
ngxson May 4, 2025
f3605b9
temporary give up
ngxson May 4, 2025
1804fa2
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 18, 2025
bc708b4
fix merge conflict
ngxson May 18, 2025
de20afd
build_ultravox()
ngxson May 18, 2025
bbe4940
rm test
ngxson May 19, 2025
4d44460
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 19, 2025
8d7d75a
fix merge conflict
ngxson May 19, 2025
dce799d
add necessary mtmd APIs
ngxson May 19, 2025
f151854
first working version (only 4s of audio)
ngxson May 19, 2025
9a0dcb6
will this monster compile?
ngxson May 19, 2025
1a90395
fix compile
ngxson May 19, 2025
4a8c092
please compile
ngxson May 19, 2025
6f23ad1
fPIC
ngxson May 19, 2025
cf38b47
fix windows
ngxson May 19, 2025
cf4f5d2
various fixes
ngxson May 19, 2025
3bbb26b
clean up audio_helpers
ngxson May 20, 2025
3ce96d7
fix conversion
ngxson May 20, 2025
cf9613f
add some debug stuff
ngxson May 20, 2025
23d0d7f
long audio input ok
ngxson May 21, 2025
7033aa1
adapt the api
ngxson May 21, 2025
e7c8a2e
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 21, 2025
111c820
add --audio arg
ngxson May 21, 2025
e6416b0
final touch UX
ngxson May 21, 2025
36a1abb
add miniaudio to readme
ngxson May 21, 2025
544f4f1
fix typo
ngxson May 21, 2025
7602ee4
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 22, 2025
9afb3af
refactor kv metadata
ngxson May 22, 2025
107790a
mtmd_default_marker()
ngxson May 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
adapt the api
  • Loading branch information
ngxson committed May 21, 2025
commit 7033aa16a78c894b061efaf9a7b6bccb5e55f26d
6 changes: 3 additions & 3 deletions tools/mtmd/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ int main(int argc, char ** argv) {

if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find("<__image__>") == std::string::npos) {
params.prompt += " <__image__>";
if (params.prompt.find(MTMD_DEFAULT_MEDIA_MARKER) == std::string::npos) {
params.prompt += MTMD_DEFAULT_MEDIA_MARKER;
}
common_chat_msg msg;
msg.role = "user";
Expand Down Expand Up @@ -341,7 +341,7 @@ int main(int argc, char ** argv) {
std::string image = line.substr(7);
if (ctx.load_image(image)) {
LOG("Image %s loaded\n", image.c_str());
content += "<__image__>";
content += MTMD_DEFAULT_MEDIA_MARKER;
}
// else, error is already printed by libmtmd
continue;
Expand Down
49 changes: 33 additions & 16 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ mtmd_context_params mtmd_context_params_default() {
params.n_threads = 4;
params.verbosity = GGML_LOG_LEVEL_INFO;
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
params.media_marker = MTMD_DEFAULT_MEDIA_MARKER;
return params;
}

Expand All @@ -96,7 +97,7 @@ struct mtmd_context {

bool print_timings;
int n_threads;
std::string image_marker;
std::string media_marker;
bool has_vision;
bool has_audio;

Expand Down Expand Up @@ -127,8 +128,12 @@ struct mtmd_context {
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
image_marker (ctx_params.image_marker)
media_marker (ctx_params.media_marker)
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
}

clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
Expand Down Expand Up @@ -269,48 +274,51 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
auto vocab = llama_model_get_vocab(ctx->text_model);

std::string prompt_modified(text->text);
std::string marker_modified(ctx->image_marker);
std::string marker_modified(ctx->media_marker);
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);

// for compatibility, we convert image marker to media marker
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);

// a bit hacky here, but works for now
// for some models, we need to add prefix and suffix to the image embeddings
if (clip_is_gemma3(ctx->ctx_clip)) {
// gemma 3
// <start_of_image> ... (image embeddings) ... <end_of_image>
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
marker_modified = ctx->image_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = ctx->media_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
marker_modified = "<img>" + ctx->image_marker + "</img>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<img>" + ctx->media_marker + "</img>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

}

// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
// for glm-edge, BOI and EOI token's embeddings are not present in the text model

std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
output->entries.clear();
output->entries.reserve(parts.size());

Expand Down Expand Up @@ -820,6 +828,15 @@ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
}
}

// Look up the ID string attached to a media chunk.
//   - image chunk: the id of chunk->tokens_image
//   - audio chunk: the id of chunk->tokens_audio
//   - any other chunk type: nullptr
// The returned pointer comes from c_str() on the chunk's own id member,
// so it is not a copy.
const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
    switch (chunk->type) {
        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
            return chunk->tokens_image->id.c_str();
        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
            return chunk->tokens_audio->id.c_str();
        default:
            // no ID is stored for the remaining chunk types
            return nullptr;
    }
}

mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
mtmd_input_chunk * copy = new mtmd_input_chunk{
chunk->type,
Expand Down
27 changes: 18 additions & 9 deletions tools/mtmd/mtmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
# define MTMD_API
#endif

#define MTMD_DEFAULT_MEDIA_MARKER "<__media__>"

// deprecated marker, use MTMD_DEFAULT_MEDIA_MARKER instead
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have such constants in llama.h and ggml.h, but we eventually have to start moving those behind API calls. It's more future-proof.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea! I added it in 107790a

#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"

#ifdef __cplusplus
Expand Down Expand Up @@ -80,7 +83,8 @@ struct mtmd_context_params {
bool print_timings;
int n_threads;
enum ggml_log_level verbosity;
const char * image_marker;
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
};

MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
Expand Down Expand Up @@ -112,6 +116,7 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
// the data is in RGBRGBRGB... format
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
Expand Down Expand Up @@ -141,8 +146,11 @@ MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chu
MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
// returns nullptr for ID on text chunk
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);

// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
// you can move the chunk ownership to your own code by copying it
Expand All @@ -162,20 +170,21 @@ MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * i
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate

// tokenize an input text prompt and an image
// the prompt must have the input image marker (default: "<__image__>") in it
// the marker will be replaced with the image tokens
// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input media marker (default: "<__media__>") in it
// the default marker is defined by MTMD_DEFAULT_MEDIA_MARKER
// the marker will be replaced with the image/audio chunk
// for example:
// "here is an image: <__image__>\ndescribe it in detail."
// "here is an image: <__media__>\ndescribe it in detail."
// this will give 3 chunks:
// 1. "here is an image: <start_of_image>"
// 2. (image tokens)
// 2. (image/audio tokens)
// 3. "<end_of_image>\ndescribe it in detail."
// number of bitmaps must be equal to the number of image markers in the prompt
// number of bitmaps must be equal to the number of markers in the prompt
// this function is thread-safe (shared ctx)
// return values:
// 0 on success
// 1 on number of images not matching the number of markers
// 1 on number of bitmaps not matching the number of markers
// 2 on image preprocessing error
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunks * output,
Expand Down
2 changes: 1 addition & 1 deletion tools/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ static json oaicompat_completion_params_parse(

// replace this chunk with a marker
p["type"] = "text";
p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
p["text"] = MTMD_DEFAULT_MEDIA_MARKER;
p.erase("image_url");
}
}
Expand Down
0