Expose Llava as a shared library for downstream projects by damian0815 · Pull Request #3613 · ggml-org/llama.cpp

Merged (34 commits), Nov 6, 2023

Changes from 1 commit

Commits (34):
0209d39 wip llava python bindings compatibility (damian0815, Oct 13, 2023)
3c10d9f add external llava API (damian0815, Oct 13, 2023)
770dc9d add base64 in-prompt image support (damian0815, Oct 13, 2023)
8224ca5 wip refactor image loading (damian0815, Oct 14, 2023)
c693208 refactor image load out of llava init (damian0815, Oct 14, 2023)
0889117 cleanup (damian0815, Oct 14, 2023)
f83c060 further cleanup; move llava-cli into its own file and rename (damian0815, Oct 14, 2023)
e2cd07c move base64.hpp into common/ (damian0815, Oct 14, 2023)
f8eddcf collapse clip and llava libraries (damian0815, Oct 14, 2023)
b9f533b move llava into its own subdir (damian0815, Oct 14, 2023)
f21af51 wip (damian0815, Oct 14, 2023)
708928c fix bug where base64 string was not removed from the prompt (damian0815, Oct 14, 2023)
09edb7e get libllava to output in the right place (damian0815, Oct 14, 2023)
2847ecf expose llava methods in libllama.dylib (damian0815, Oct 14, 2023)
e3261ff cleanup memory usage around clip_image_* (damian0815, Oct 14, 2023)
d64891b cleanup and refactor *again* (damian0815, Oct 15, 2023)
5a91551 update headerdoc (damian0815, Oct 15, 2023)
e84003b Move llava back to examples (monatis, Nov 2, 2023)
8037034 build with cmake, not tested (WIP) (monatis, Nov 2, 2023)
52143f7 Editorconfig (monatis, Nov 5, 2023)
c6b8844 Merge branch 'master' into llava-lib (monatis, Nov 5, 2023)
32bf7bf Editorconfig (monatis, Nov 5, 2023)
53dca51 Build with make (monatis, Nov 5, 2023)
b927772 Build with make (monatis, Nov 5, 2023)
01f06e2 Fix cyclical depts on Windows (monatis, Nov 5, 2023)
ad97e0e attempt to fix build on Windows (monatis, Nov 5, 2023)
71ea278 Merge branch 'master' into llava-lib (monatis, Nov 5, 2023)
1f8c866 attempt to fix build on Windows (monatis, Nov 6, 2023)
d6be69f Upd TODOs (monatis, Nov 6, 2023)
5b8b9ef attempt to fix build on Windows+CUDA (monatis, Nov 6, 2023)
b9bacc7 Revert changes in cmake (monatis, Nov 6, 2023)
9f03ac7 Fix according to review comments (monatis, Nov 6, 2023)
22f43fc Support building as a shared library (monatis, Nov 6, 2023)
3548029 address review comments (cebtenzzre, Nov 6, 2023)
refactor image load out of llava init
damian0815 committed Oct 14, 2023
commit c6932085febdd3f4794bf058e39afbe5dee6d952
examples/llava/llava-utils.h (2 additions, 0 deletions)

@@ -5,6 +5,8 @@
#include "common.h"
#include "llama.h"

#include "base64.hpp"

#include <cstdio>
#include <cstdlib>
#include <vector>
Expand Down
examples/llava/llava.cpp (40 additions, 43 deletions)

@@ -15,7 +15,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos, float * t_img_enc_ms) {
static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
auto ctx_clip = ctx_llava->ctx_clip;
clip_image_f32 img_res;
if (!clip_image_preprocess(ctx_clip, img, &img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
@@ -26,44 +27,56 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos, float * t_img_enc_ms) {
     *n_img_pos = clip_n_patches(ctx_clip);
     *n_img_embd = clip_n_mmproj_embd(ctx_clip);
 
+    // make sure that the correct mmproj was used, i.e., compare apples to apples
+    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
+    if (*n_img_embd != n_llama_embd) {
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, *n_img_embd, n_llama_embd);
+
+        return false;
+    }
+
     const int64_t t_img_enc_start_us = ggml_time_us();
     if (!clip_image_encode(ctx_clip, n_threads, &img_res, image_embd)) {
         fprintf(stderr, "Unable to encode image\n");
 
         return false;
     }
     const int64_t t_img_enc_end_us = ggml_time_us();
-    *t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
 
+    {
+        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    }
+
     return true;
 }
 
-bool llava_build_img_embed(struct llava_context * ctx_llava, const clip_image_u8 * img) {
+static bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
 
     auto ctx_clip = ctx_llava->ctx_clip;
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
     if (!image_embd) {
         fprintf(stderr, "Unable to allocate memory for image embeddings\n");
         free(image_embd);
         return false;
     }
 
+    int n_image_pos;
     int n_img_embd;
-    int n_img_pos;
-    float t_img_enc_ms;
-    if (!encode_image_with_clip(ctx_clip, params->n_threads, &img, image_embd, &n_img_embd, &n_img_pos, &t_img_enc_ms)) {
+    if (!encode_image_with_clip(ctx_llava, n_threads, img, image_embd, &n_img_embd, &n_image_pos)) {
         fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
 
-    ctx_llava->image_embd = image_embd;
-    return true;
+    *image_embd_out = image_embd;
+    *n_image_pos_out = n_image_pos;
+    return true;
 }
 
 
 struct llava_context * llava_init(gpt_params * params) {
[Review thread on llava_init]
Collaborator: Image loading and inference parts should be stripped out of this function.
Contributor Author: done

     const char * clip_path = params->mmproj.c_str();
     const char * img_path = params->image.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
@@ -94,55 +107,36 @@ struct llava_context * llava_init(gpt_params * params) {
         return NULL;
     }
 
-    // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
-    if (n_img_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
-
-        llama_free(ctx_llama);
-        llama_free_model(model);
-        llama_backend_free();
-        free(image_embd);
-
-        return NULL;
-    }
-
-    {
-        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
-    }
-
-
     auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
     ctx_llava->model = model;
-    ctx_llava->image_embd = image_embd;
-    ctx_llava->n_img_pos = n_img_pos;
     return ctx_llava;
 
 }
 
 void llava_free(struct llava_context * ctx_llava) {
     if (ctx_llava->ctx_clip) {
-        clip_free(ctx_clip);
+        clip_free(ctx_llava->ctx_clip);
         ctx_llava->ctx_clip = NULL;
     }
 
     llama_free(ctx_llava->ctx_llama);
     llama_free_model(ctx_llava->model);
     llama_backend_free();
-    free(ctx_llava->image_embd);
 }
 
-void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt) {
+static void llava_process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
 
     // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
     // GG: are we sure that there should be a trailing whitespace at the end of this string?
     eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
-    eval_image_embd(ctx_llava->ctx_llama, ctx_llava->image_embd, ctx_llava->n_img_pos, params->n_batch, &n_past);
+    eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);

@@ -186,31 +180,34 @@ int main(int argc, char ** argv) {

     // load and preprocess the image
     clip_image_u8 img;
     auto prompt = params.prompt;
     if (prompt_contains_image(prompt)) {
-        if (img_path) {
+        if (!params.image.empty()) {
             printf("using base64 encoded image instead of command line image path\n");
         }
         if (!get_image_from_prompt(prompt, &img)) {
             fprintf(stderr, "%s: can't load image from prompt\n", __func__);
-            clip_free(ctx_clip);
-            return NULL;
+            llava_free(ctx_llava);
+            return 1;
         }
         prompt = remove_image_from_prompt(prompt);
     } else {
-        if (!clip_image_load_from_file(img_path, &img)) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
-            clip_free(ctx_clip);
-            return NULL;
+        if (!clip_image_load_from_file(params.image.c_str(), &img)) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params.image.c_str());
+            llava_free(ctx_llava);
+            return 1;
         }
     }
-    llava_build_img_embed(ctx_llava, &img);
+    float * image_embd;
+    int n_image_pos;
+    llava_build_img_embed(ctx_llava, params.n_threads, &img, &image_embd, &n_image_pos);
 
     // process the prompt
     // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    llava_process_prompt(ctx_llava, &params, params.prompt.c_str());
+    llava_process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
 
     llama_print_timings(ctx_llava->ctx_llama);
 
+    free(image_embd);
     llava_free(ctx_llava);
     return 0;
 }
examples/llava/llava.h (3 additions, 3 deletions)

@@ -14,14 +14,14 @@ struct llava_context {
     struct llama_context * ctx_llama = NULL;
     struct llama_model * model = NULL;
 
-    int n_img_pos = 0;
-    float * image_embd = NULL;
+    // int n_img_pos = 0;
+    // float * image_embd = NULL;
 };
 
 struct llava_context * llava_init(gpt_params * params);
 void llava_free(struct llava_context * ctx_llava);
 
-void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);
+//void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);
 
 
 #ifdef __cplusplus
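To make the payoff of this refactor concrete, here is a minimal sketch of how a downstream project could drive libllava once these entry points are exported. It mirrors the flow of the new main() above; note that at this commit llava_build_img_embed and llava_process_prompt are still static in llava.cpp, so their availability to external callers is an assumption that later commits in this PR address.

// Hypothetical downstream consumer of libllava; a sketch, not part of the diff.
// Assumes llava_build_img_embed / llava_process_prompt are exported (they are
// static at this commit) and that common.h provides gpt_params / gpt_params_parse.
#include "clip.h"   // clip_image_u8, clip_image_load_from_file
#include "common.h" // gpt_params, gpt_params_parse
#include "llava.h"  // llava_init, llava_free

#include <cstdio>
#include <cstdlib>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // set up the llama and clip contexts; no image work happens here anymore
    struct llava_context * ctx_llava = llava_init(&params);
    if (!ctx_llava) {
        return 1;
    }

    // the caller now owns image loading ...
    clip_image_u8 img;
    if (!clip_image_load_from_file(params.image.c_str(), &img)) {
        fprintf(stderr, "is %s really an image file?\n", params.image.c_str());
        llava_free(ctx_llava);
        return 1;
    }

    // ... and the embedding buffer produced from the image
    float * image_embd = NULL;
    int n_image_pos = 0;
    if (!llava_build_img_embed(ctx_llava, params.n_threads, &img, &image_embd, &n_image_pos)) {
        llava_free(ctx_llava);
        return 1;
    }

    // evaluate "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
    llava_process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());

    free(image_embd);  // the embedding is freed by the caller, not by llava_free
    llava_free(ctx_llava);
    return 0;
}

The design point from the review thread is visible in the sketch: llava_init no longer owns an image or its embedding, so a host application can reuse one context to embed several images and control each buffer's lifetime itself.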