llama : ggml-backend integration by slaren · Pull Request #4766 · ggml-org/llama.cpp · GitHub

llama : ggml-backend integration #4766

Merged: 39 commits, merged on Jan 12, 2024

Changes from 1 commit

Commits (39)
33f0761
llama : ggml-backend integration
slaren Dec 28, 2023
6483328
ggml-backend : add names to buffers
slaren Jan 5, 2024
a1ab35c
fix unmap after loading
slaren Jan 5, 2024
1fa7ee2
batched-bench : add tensor_split param
ggerganov Jan 5, 2024
863ef45
llama : check for null tensor_split
slaren Jan 5, 2024
d107459
ggml-backend : increase GGML_MAX_BACKENDS
slaren Jan 5, 2024
ece0b0d
improve graph splitting, partial fix for --no-kv-offload
slaren Jan 5, 2024
2f2c367
cuda : add ggml-backend split buffer support
slaren Jan 6, 2024
72b74f3
cuda : do not create buffer types for devices that don't exist (fixes…
slaren Jan 6, 2024
f77c72f
ggml : fix null backend dereference (#4807)
ggerganov Jan 7, 2024
7c16cf1
test-backend-ops : check buffer allocation failures
slaren Jan 7, 2024
87c8207
Merge remote-tracking branch 'origin/master' into sl/backend-sched
slaren Jan 7, 2024
5e879c9
llama : add cparam (split_mode) and command line argument (--split-mo…
slaren Jan 7, 2024
ac145fd
ggml : fix mul_mat_id work size
slaren Jan 8, 2024
444b975
llama : rewrite session kv load/set without graphs
slaren Jan 8, 2024
d41cef9
minor
slaren Jan 8, 2024
5a62db3
llama : only initialize used backends, free backends on context free
slaren Jan 8, 2024
4813e17
llama : abort ctx if cuda backend init fails
slaren Jan 8, 2024
11583c1
llama : rewrite lora with ggml-backend and compute on CPU
slaren Jan 8, 2024
4ed5f62
llama : only map to a backend buffer the region of the file mapping c…
slaren Jan 8, 2024
fa76201
opencl : add ggml-backend buffer type
slaren Jan 9, 2024
2e7814a
Merge remote-tracking branch 'origin/master' into sl/backend-sched
slaren Jan 9, 2024
5d2dffc
cuda : only use batched_cublas with batched mat muls (fixes fp16 tg p…
slaren Jan 10, 2024
3cb1c1f
Merge remote-tracking branch 'origin/master' into sl/backend-sched
slaren Jan 10, 2024
07a1b05
llama : on Metal, by default offload the full model
ggerganov Jan 10, 2024
3cd0cbb
metal : page align the data ptr (#4854)
ggerganov Jan 10, 2024
74066f8
Apply suggestions from code review
slaren Jan 10, 2024
c522c11
cuda : fix split buffer free
slaren Jan 10, 2024
9d4ba6e
address review comments
slaren Jan 11, 2024
d83c084
llama-bench : add split-mode parameter
slaren Jan 11, 2024
6dcc42b
fix whitespace
slaren Jan 11, 2024
42aa835
opencl : fix double initialization
slaren Jan 11, 2024
c3681af
Merge remote-tracking branch 'origin/master' into sl/backend-sched
slaren Jan 11, 2024
c486719
server : add --split-mode parameter
slaren Jan 11, 2024
23c14ef
use async copy and compute to improve multi-gpu performance
slaren Jan 11, 2024
e73009e
use async memcpys to copy the graph outputs to the CPU
slaren Jan 12, 2024
1e7694e
fix opencl
slaren Jan 12, 2024
458674c
Merge remote-tracking branch 'origin/master' into sl/backend-sched
slaren Jan 12, 2024
53ae0dd
use a host buffer for the cpu compute buffer for faster copies to the…
slaren Jan 12, 2024
llama : rewrite lora with ggml-backend and compute on CPU
ggml-ci
slaren committed Jan 8, 2024
commit 11583c14624b8fe4f7d47eb7cab0d4b84d8ae1f3
45 changes: 25 additions & 20 deletions ggml-backend.c
@@ -948,11 +948,7 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
// TODO: merge passes
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
// reset state
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
// reset splits
sched->n_splits = 0;

struct ggml_init_params params = {
@@ -961,11 +957,13 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
/* .no_alloc = */ true
};

if (sched->ctx != NULL) {
ggml_free(sched->ctx);
}
ggml_free(sched->ctx);

sched->ctx = ggml_init(params);
if (sched->ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
GGML_ASSERT(false);
}

// pass 1: assign backends to ops with allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
@@ -1309,13 +1307,23 @@ static void sched_reset(ggml_backend_sched_t sched) {
for (int i = 0; i < sched->n_backends; i++) {
ggml_tallocr_reset(sched->tallocs[i]);
}
// reset state for the next run
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
}

ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
memset(sched, 0, sizeof(struct ggml_backend_sched));
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);

sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
@@ -1340,19 +1348,15 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
ggml_tallocr_free(sched->tallocs[i]);
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
free(sched->hash_set.keys);
free(sched->node_talloc);
free(sched->node_copies);
free(sched);
}

void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
// initialize hash tables
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
sched->hash_set.size = hash_size;
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once

sched_split_graph(sched, measure_graph);
sched_alloc_splits(sched);
@@ -1368,7 +1372,8 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
}

void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

sched_split_graph(sched, graph);
}

@@ -1385,17 +1390,17 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {

ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return sched->tallocs[backend_index];
}

ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
}

void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
// FIXME: node_allocr is cleared when splitting the graph, so all user assignments are lost
// to avoid this, we need to clear node_allocr after compute rather than before split
int backend_index = sched_backend_prio(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
node_allocr(node) = sched->tallocs[backend_index];
2 changes: 1 addition & 1 deletion ggml-backend.h
@@ -149,7 +149,7 @@ extern "C" {
typedef struct ggml_backend_sched * ggml_backend_sched_t;

// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
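For context, a minimal sketch of how a caller might use the scheduler API after this change, sizing the new graph_size argument from the expected graph. The two backend handles, the measure graph, the use of GGML_DEFAULT_GRAPH_SIZE, and the mention of ggml_backend_sched_graph_compute are illustrative assumptions, not code from this PR.

// Sketch: creating a backend scheduler with the new graph_size parameter.
#include "ggml.h"
#include "ggml-backend.h"

void example_sched_setup(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend,
                         struct ggml_cgraph * measure_graph) {
    // order defines priority: GPU backend preferred, CPU backend as fallback
    ggml_backend_t backends[2] = { gpu_backend, cpu_backend };

    // the scheduler now pre-allocates its hash tables from graph_size
    // instead of sizing them from the measure graph later
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 2, GGML_DEFAULT_GRAPH_SIZE);

    // reserve backend buffers from a worst-case (measure) graph
    ggml_backend_sched_init_measure(sched, measure_graph);

    // ... per evaluation, split and compute graphs with the scheduler
    //     (e.g. ggml_backend_sched_graph_compute(sched, graph)) ...

    ggml_backend_sched_free(sched);
}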
44 changes: 42 additions & 2 deletions ggml-cuda.cu
@@ -9720,16 +9720,56 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
}
}

static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));

ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

const int64_t ne0 = tensor->ne[0];
const size_t nb1 = tensor->nb[1];
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

for (int id = 0; id < g_device_count; ++id) {
int64_t row_low, row_high;
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

int64_t nrows_split = row_high - row_low;
if (nrows_split == 0) {
continue;
}

const size_t offset_split = row_low*nb1;
size_t size = ggml_nbytes_split(tensor, nrows_split);
const size_t original_size = size;

// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
if (ne0 % MATRIX_ROW_PADDING != 0) {
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}

char * buf_host = (char *)data + offset_split;
//CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
}
}

static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
UNUSED(buffer);
UNUSED(value);
}

static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
/* .get_tensor = */ NULL,
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
/* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL,
/* .clear = */ NULL,
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
};

// cuda split buffer type
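Usage note: because split tensors are partitioned by rows across devices, the new get_tensor path only supports reading the whole tensor at once (offset 0, full size). A brief sketch of how such a read might look through the generic backend-buffer API; the tensor handle and host buffer are illustrative.

// Sketch: reading a row-split tensor back from the devices into host memory.
#include <stdlib.h>
#include "ggml.h"
#include "ggml-backend.h"

void read_back_split_tensor(struct ggml_tensor * t /* allocated in a CUDA split buffer */) {
    void * host_data = malloc(ggml_nbytes(t));
    // split buffers require whole-tensor reads: offset 0, size == ggml_nbytes(t);
    // this dispatches to ggml_backend_cuda_split_buffer_get_tensor above
    ggml_backend_tensor_get(t, host_data, 0, ggml_nbytes(t));
    // ... use host_data ...
    free(host_data);
}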
2 changes: 2 additions & 0 deletions ggml-impl.h
@@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

struct ggml_hash_set ggml_hash_set_new(size_t size);

bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
19 changes: 18 additions & 1 deletion ggml.c
@@ -4343,6 +4343,23 @@ struct ggml_tensor * ggml_cpy_inplace(
return ggml_cpy_impl(ctx, a, b, true);
}

struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_type type) {
bool is_node = false;

struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
ggml_format_name(result, "%s (copy)", a->name);

result->op = GGML_OP_CPY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = result;

return result;
}

// ggml_cont

static struct ggml_tensor * ggml_cont_impl(
@@ -14835,7 +14852,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
return i;
}

static struct ggml_hash_set ggml_hash_set_new(size_t size) {
struct ggml_hash_set ggml_hash_set_new(size_t size) {
size = ggml_hash_size(size);
struct ggml_hash_set result;
result.size = size;
5 changes: 5 additions & 0 deletions ggml.h
@@ -1165,6 +1165,11 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_type type);

// make contiguous
GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx,
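For illustration, a small sketch of the new ggml_cast op declared above: unlike ggml_cpy it does not need a pre-made destination tensor, and instead creates one of the requested type with the same shape. The context and input tensor are assumed to come from the caller.

// Sketch: cast an F32 tensor to F16 inside a graph (shapes come from the caller).
#include "ggml.h"

static struct ggml_tensor * cast_to_f16(struct ggml_context * ctx, struct ggml_tensor * x_f32) {
    // ggml_cast allocates the destination tensor with the same shape as x_f32
    // and records a GGML_OP_CPY into it
    struct ggml_tensor * x_f16 = ggml_cast(ctx, x_f32, GGML_TYPE_F16);
    return x_f16;
}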