Expose Llava as a shared library for downstream projects by damian0815 · Pull Request #3613 · ggml-org/llama.cpp · GitHub

Expose Llava as a shared library for downstream projects #3613


Merged
34 commits merged on Nov 6, 2023
Changes from 1 commit

Commits (34)
0209d39
wip llava python bindings compatibility
damian0815 Oct 13, 2023
3c10d9f
add external llava API
damian0815 Oct 13, 2023
770dc9d
add base64 in-prompt image support
damian0815 Oct 13, 2023
8224ca5
wip refactor image loading
damian0815 Oct 14, 2023
c693208
refactor image load out of llava init
damian0815 Oct 14, 2023
0889117
cleanup
damian0815 Oct 14, 2023
f83c060
further cleanup; move llava-cli into its own file and rename
damian0815 Oct 14, 2023
e2cd07c
move base64.hpp into common/
damian0815 Oct 14, 2023
f8eddcf
collapse clip and llava libraries
damian0815 Oct 14, 2023
b9f533b
move llava into its own subdir
damian0815 Oct 14, 2023
f21af51
wip
damian0815 Oct 14, 2023
708928c
fix bug where base64 string was not removed from the prompt
damian0815 Oct 14, 2023
09edb7e
get libllava to output in the right place
damian0815 Oct 14, 2023
2847ecf
expose llava methods in libllama.dylib
damian0815 Oct 14, 2023
e3261ff
cleanup memory usage around clip_image_*
damian0815 Oct 14, 2023
d64891b
cleanup and refactor *again*
damian0815 Oct 15, 2023
5a91551
update headerdoc
damian0815 Oct 15, 2023
e84003b
Move llava back to examples
monatis Nov 2, 2023
8037034
build with cmake, not tested (WIP)
monatis Nov 2, 2023
52143f7
Editorconfig
monatis Nov 5, 2023
c6b8844
Merge branch 'master' into llava-lib
monatis Nov 5, 2023
32bf7bf
Editorconfig
monatis Nov 5, 2023
53dca51
Build with make
monatis Nov 5, 2023
b927772
Build with make
monatis Nov 5, 2023
01f06e2
Fix cyclical depts on Windows
monatis Nov 5, 2023
ad97e0e
attempt to fix build on Windows
monatis Nov 5, 2023
71ea278
Merge branch 'master' into llava-lib
monatis Nov 5, 2023
1f8c866
attempt to fix build on Windows
monatis Nov 6, 2023
d6be69f
Upd TODOs
monatis Nov 6, 2023
5b8b9ef
attempt to fix build on Windows+CUDA
monatis Nov 6, 2023
b9bacc7
Revert changes in cmake
monatis Nov 6, 2023
9f03ac7
Fix according to review comments
monatis Nov 6, 2023
22f43fc
Support building as a shared library
monatis Nov 6, 2023
3548029
address review comments
cebtenzzre Nov 6, 2023
add external llava API
damian0815 committed Oct 13, 2023
commit 3c10d9f3de3e79bdba18f3745e3cc56b0aa45e67
121 changes: 75 additions & 46 deletions examples/llava/llava.cpp
@@ -2,6 +2,7 @@
#include "llava-utils.h"
#include "common.h"
#include "llama.h"
#include "llava.h"

#include <cstdio>
#include <cstdlib>
@@ -34,27 +35,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
return true;
}

int main(int argc, char ** argv) {
ggml_time_init();

gpt_params params;

if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}

if (params.mmproj.empty() || params.image.empty()) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
struct llava_context * llava_init(gpt_params * params) {
Collaborator:

Image loading and inference parts should be stripped off this function.

Contributor Author:

done


const char * clip_path = params.mmproj.c_str();
const char * img_path = params.image.c_str();
const char * clip_path = params->mmproj.c_str();
const char * img_path = params->image.c_str();

if (params.prompt.empty()) {
params.prompt = "describe the image in detail.";
if (params->prompt.empty()) {
params->prompt = "describe the image in detail.";
}

auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
@@ -65,47 +52,48 @@ int main(int argc, char ** argv) {
if (!clip_image_load_from_file(img_path, &img)) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
clip_free(ctx_clip);
return 1;
return NULL;
}

float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
return 1;
return NULL;
}

int n_img_embd;
int n_img_pos;
float t_img_enc_ms;
if (!encode_image_with_clip(ctx_clip, params.n_threads, &img, image_embd, &n_img_embd, &n_img_pos, &t_img_enc_ms)) {
if (!encode_image_with_clip(ctx_clip, params->n_threads, &img, image_embd, &n_img_embd, &n_img_pos, &t_img_enc_ms)) {
fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
clip_free(ctx_clip);
return 1;
return NULL;
}

// we get the embeddings, free up the memory required for CLIP
clip_free(ctx_clip);
ctx_clip = NULL;

llama_backend_init(params.numa);
llama_backend_init(params->numa);

llama_model_params model_params = llama_model_default_params();
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
return NULL;
}

llama_context_params ctx_params = llama_context_default_params();

ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
ctx_params.n_threads = params->n_threads;
ctx_params.n_threads_batch = params->n_threads_batch == -1 ? params->n_threads : params->n_threads_batch;

llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
return NULL;
}

// make sure that the correct mmproj was used, i.e., compare apples to apples
@@ -118,28 +106,49 @@ int main(int argc, char ** argv) {
llama_backend_free();
free(image_embd);

return 1;
return NULL;
}

// process the prompt
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
{
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
}


auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
ctx_llava->image_embd = image_embd;
ctx_llava->n_img_pos = n_img_pos;
return ctx_llava;

}

void llava_free(struct llava_context * ctx_llava) {
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
free(ctx_llava->image_embd);
}

void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt) {
Collaborator:

Let's avoid such god-like functions for now: they kill hackability in the early stages of development and bring no benefit over the functions in llava-utils.h. Better to have single-responsibility functions as much as possible to enhance development speed and flexibility.

Contributor Author:

👍 sure

int n_past = 0;

const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

// GG: are we sure that the should be a trailing whitespace at the end of this string?
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
eval_image_embd(ctx_llava->ctx_llama, ctx_llava->image_embd, ctx_llava->n_img_pos, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);

// generate the response

printf("\n");

for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_llama, params, &n_past);
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
if (strcmp(tmp, "</s>") == 0) break;

printf("%s", tmp);
@@ -148,16 +157,36 @@ int main(int argc, char ** argv) {

printf("\n");

{
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
}


int main(int argc, char ** argv) {
ggml_time_init();

gpt_params params;

if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
if (params.mmproj.empty() || params.image.empty()) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}

llama_print_timings(ctx_llama);
auto ctx_llava = llava_init(&params);
if (ctx_llava == NULL) {
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
return 1;
}

llama_free(ctx_llama);
llama_free_model(model);
llama_backend_free();
free(image_embd);
// process the prompt
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
llava_process_prompt(ctx_llava, &params, params.prompt.c_str());

llama_print_timings(ctx_llava->ctx_llama);

llava_free(ctx_llava);
return 0;
}
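
As an aside on the review comment above: one way the monolithic llava_process_prompt() could later be decomposed into single-responsibility helpers is sketched below. This is only an illustration, not part of the PR: the helper names (llava_eval_system_prompt, llava_eval_user_prompt, llava_generate_response) are hypothetical, and the sketch assumes eval_string, eval_image_embd and sample from llava-utils.h keep the signatures used in this commit.

// Hypothetical decomposition of llava_process_prompt() into smaller helpers,
// intended to live in llava.cpp alongside the code above. The llava_* names
// below are illustrative only.

static void llava_eval_system_prompt(struct llava_context * ctx_llava, gpt_params * params, int * n_past) {
    eval_string(ctx_llava->ctx_llama,
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ",
        params->n_batch, n_past);
}

static void llava_eval_user_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt, int * n_past) {
    // image embeddings come first, then the textual part of the user turn
    eval_image_embd(ctx_llava->ctx_llama, ctx_llava->image_embd, ctx_llava->n_img_pos, params->n_batch, n_past);
    eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, n_past);
    eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, n_past);
}

static void llava_generate_response(struct llava_context * ctx_llava, gpt_params * params, int * n_past) {
    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
    printf("\n");
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_llava->ctx_llama, *params, n_past);
        if (strcmp(tmp, "</s>") == 0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
}

With a split like this, llava_process_prompt() would reduce to calling the three helpers in order with a single shared n_past counter.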
31 changes: 31 additions & 0 deletions examples/llava/llava.h
@@ -0,0 +1,31 @@
#ifndef LLAVA_H
#define LLAVA_H

#include "ggml.h"

struct clip_ctx;

#ifdef __cplusplus
extern "C" {
#endif

struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;

int n_img_pos = 0;
float * image_embd = NULL;
};

struct llava_context * llava_init(gpt_params * params);
void llava_free(struct llava_context * ctx_llava);

void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);


#ifdef __cplusplus
}
#endif

#endif
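
Since the point of the PR is to make these entry points usable from downstream projects, a minimal consumer of the API as declared above might look like the sketch below. This is an assumption-laden example rather than part of the PR: it presumes the consumer is built against common.h (for gpt_params and gpt_params_parse) and linked against the llava and llama libraries, with the same command-line flags the main() in llava.cpp above relies on.

// Minimal downstream consumer of the llava API declared above (a sketch, not part of the PR).
// Error handling is kept to the bare minimum.
#include "common.h"
#include "llava.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    if (params.mmproj.empty() || params.image.empty()) {
        fprintf(stderr, "usage: %s -m <model> --mmproj <clip-model> --image <image> [-p <prompt>]\n", argv[0]);
        return 1;
    }

    // loads the CLIP and llama models and encodes the image given on the command line
    struct llava_context * ctx_llava = llava_init(&params);
    if (ctx_llava == NULL) {
        fprintf(stderr, "failed to init llava\n");
        return 1;
    }

    // runs the chat-formatted prompt against the pre-encoded image and prints the answer
    llava_process_prompt(ctx_llava, &params, params.prompt.c_str());

    llava_free(ctx_llava);
    return 0;
}

One caveat worth noting: the struct definition above uses C++ default member initializers, so in this form the header is C++-only even though the functions are declared extern "C".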