From c254b21307bb4d30f0c0f6dc82111f6607f5c61c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 14:50:41 +0300 Subject: [PATCH 01/15] metal : add memory pool for temp allocs (wip) [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 83 ++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9f1c6c6ccc09f..ba981333faa80 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -470,6 +470,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte struct ggml_backend_metal_context { id queue; + id heap; dispatch_queue_t d_queue; @@ -693,6 +694,19 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); + // allocate tmp heap with fixed size for testing + // TODO: figure out how to dynamically resize it + { + MTLHeapDescriptor *heapDescriptor = [[MTLHeapDescriptor alloc] init]; + heapDescriptor.storageMode = MTLStorageModePrivate; + heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; + heapDescriptor.size = 32*1024*1024; + + ctx->heap = [device newHeapWithDescriptor:heapDescriptor]; + + [heapDescriptor release]; + } + // load library if (ctx_dev->mtl_library == nil) { ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); @@ -1136,6 +1150,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); [ctx->queue release]; + [ctx->heap release]; dispatch_release(ctx->d_queue); @@ -1439,7 +1454,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex static void ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + id heap) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2111,26 +2127,65 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { + // cpy to tmp buffer in MTLHeap + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + id id_src0h = [heap newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src0h offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / 
ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src0h offset:0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src0h offset:0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4992,7 +5047,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + ggml_metal_encode_node(backend, idx, encoder, ctx->heap); if (should_capture) { [encoder popDebugGroup]; From 2341e7c688882ffc480a47dabb87db9cec620fc5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 16:02:59 +0300 Subject: [PATCH 02/15] cont : free buffers from the heap --- ggml/src/ggml-metal/ggml-metal.m | 53 +++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index ba981333faa80..46b7f16934cbf 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,6 +19,9 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 +// max number of buffers that can be allocated on the heap per command buffer +#define GGML_METAL_MAX_HEAP_BUFFERS 64 + #ifndef TARGET_OS_VISION #define TARGET_OS_VISION 0 #endif @@ -468,9 +471,18 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +struct ggml_backend_metal_heap { + int n; + + id obj; + id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; +}; + struct ggml_backend_metal_context { id queue; - id heap; + + // TODO: create heap per command buffer + struct ggml_backend_metal_heap heap; dispatch_queue_t d_queue; @@ -702,7 +714,12 @@ @implementation GGMLMetalClass heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; heapDescriptor.size = 32*1024*1024; - ctx->heap = [device newHeapWithDescriptor:heapDescriptor]; + ctx->heap.n = 0; + + ctx->heap.obj = [device newHeapWithDescriptor:heapDescriptor]; + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { + ctx->heap.bufs[i] = nil; + } [heapDescriptor release]; } @@ -1149,8 +1166,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - [ctx->heap release]; + [ctx->queue release]; + [ctx->heap.obj release]; dispatch_release(ctx->d_queue); @@ -1455,7 +1472,7 @@ static void ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - id heap) { + struct ggml_backend_metal_heap * heap) { struct ggml_backend_metal_context * ctx = 
backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2148,7 +2165,11 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = [heap newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + + // save a reference to the heap-allocated buffer + // TODO: simplify and check for available resources + heap->bufs[heap->n++] = id_src0h; if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; @@ -4621,6 +4642,8 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[n_cb]; [command_buffer waitUntilCompleted]; + // TODO: free main cb heap + MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); @@ -4636,6 +4659,22 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[i]; [command_buffer waitUntilCompleted]; + // free buffers from the heap + { + size_t size_allocated = [ctx->heap.obj currentAllocatedSize]; + size_t size_used = [ctx->heap.obj usedSize]; + GGML_LOG_INFO("%s: command buffer %d, allocated = %zu, used = %zu, n = %d\n", __func__, i, size_allocated, size_used, ctx->heap.n); + + for (int j = 0; j < ctx->heap.n; ++j) { + id buf = ctx->heap.bufs[j]; + [buf release]; + + ctx->heap.bufs[j] = nil; + } + + ctx->heap.n = 0; + } + MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); @@ -5047,7 +5086,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); if (should_capture) { [encoder popDebugGroup]; From 37450314b552e7c7a46954d4527059444cf9cf18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 16:55:01 +0300 Subject: [PATCH 03/15] cont : resize heap [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 164 ++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 46b7f16934cbf..f48c64605f2e0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -471,18 +471,67 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; -struct ggml_backend_metal_heap { +// TODO: use MTLHeapTypePlacement and reset offset after every node +struct ggml_metal_heap { int n; + int fail; + + size_t need; id obj; id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; }; +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->n = 0; + heap->fail = 0; + heap->need = 0; + + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; i++) { + if (heap->bufs[i]) { + [heap->bufs[i] release]; + heap->bufs[i] = nil; + continue; + } + + break; + } +} + +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { + const size_t size_aligned = GGML_PAD(size, alignment); + + heap->need += size_aligned; + + if 
(!heap->fail && heap->need > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + heap->fail = 1; + } + + if (!heap->fail && heap->n >= GGML_METAL_MAX_HEAP_BUFFERS) { + heap->fail = 2; + } + + if (heap->fail) { + return nil; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + if (!buf) { + heap->fail = 3; + return nil; + } + + heap->bufs[heap->n++] = buf; + + return buf; +} + struct ggml_backend_metal_context { + id device; id queue; // TODO: create heap per command buffer - struct ggml_backend_metal_heap heap; + struct ggml_metal_heap heap; dispatch_queue_t d_queue; @@ -696,9 +745,11 @@ @implementation GGMLMetalClass struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -707,21 +758,22 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // allocate tmp heap with fixed size for testing - // TODO: figure out how to dynamically resize it + // TODO: factor into a function { - MTLHeapDescriptor *heapDescriptor = [[MTLHeapDescriptor alloc] init]; - heapDescriptor.storageMode = MTLStorageModePrivate; - heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; - heapDescriptor.size = 32*1024*1024; + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = 1024*1024; ctx->heap.n = 0; - ctx->heap.obj = [device newHeapWithDescriptor:heapDescriptor]; + ctx->heap.obj = [device newHeapWithDescriptor:desc]; for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { ctx->heap.bufs[i] = nil; } - [heapDescriptor release]; + [desc release]; } // load library @@ -1472,7 +1524,7 @@ static void ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_backend_metal_heap * heap) { + struct ggml_metal_heap * heap) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2165,11 +2217,16 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + //id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; - // save a reference to the heap-allocated buffer - // TODO: simplify and check for available resources - heap->bufs[heap->n++] = id_src0h; + //// save a reference to the heap-allocated buffer + //// TODO: simplify and check for available resources + //heap->bufs[heap->n++] = id_src0h; + + id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); + if (!id_src0h) { + break; + } if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; @@ -4659,21 +4716,8 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[i]; [command_buffer waitUntilCompleted]; - // free buffers from the heap - { - size_t size_allocated = [ctx->heap.obj currentAllocatedSize]; - size_t size_used = 
[ctx->heap.obj usedSize]; - GGML_LOG_INFO("%s: command buffer %d, allocated = %zu, used = %zu, n = %d\n", __func__, i, size_allocated, size_used, ctx->heap.n); - - for (int j = 0; j < ctx->heap.n; ++j) { - id buf = ctx->heap.bufs[j]; - [buf release]; - - ctx->heap.bufs[j] = nil; - } - - ctx->heap.n = 0; - } + // TODO: per command buffer heap + ggml_metal_heap_reset(&ctx->heap); MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { @@ -5069,31 +5113,59 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; - int node_start = 0; - int node_end = n_nodes_0; + int n_try = 3; - if (cb_idx < n_cb_l) { - node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); - node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); - } + while (n_try-- > 0) { + id encoder = [command_buffer computeCommandEncoder]; - const bool should_capture = ctx->capture_next_compute; + int node_start = 0; + int node_end = n_nodes_0; - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); } - ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); + const bool should_capture = ctx->capture_next_compute; + + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); - if (should_capture) { - [encoder popDebugGroup]; + if (should_capture) { + [encoder popDebugGroup]; + } } - } - [encoder endEncoding]; + [encoder endEncoding]; + + if (ctx->heap.fail == 0) { + break; + } + + // increase heap size + [ctx->heap.obj release]; + + { + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = ctx->heap.need; + + GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, ctx->heap.need); + + ctx->heap.obj = [ctx->device newHeapWithDescriptor:desc]; + + [desc release]; + } + + ggml_metal_heap_reset(&ctx->heap); + } if (cb_idx < 2 || ctx->abort_callback == NULL) { [command_buffer commit]; From 9433c504c09b9e887886e82a7b237a92d420634a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 14:49:49 +0300 Subject: [PATCH 04/15] cont : refactor heap [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 125 +++++++++++++++++++------------ 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f48c64605f2e0..238e5d86ba609 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -471,17 +471,55 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; -// TODO: use MTLHeapTypePlacement and reset offset after every node struct ggml_metal_heap { int n; int 
fail; size_t need; + id device; id obj; id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; }; +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = size; + + heap->device = device; + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { + heap->bufs[i] = nil; + } + + [desc release]; + + return heap; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + [heap->obj release]; + + free(heap); +} + static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { heap->n = 0; heap->fail = 0; @@ -498,6 +536,33 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { } } +static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { + if (heap == nil) { + return false; + } + + [heap->obj release]; + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = size; + + heap->obj = [heap->device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + return false; + } + + [desc release]; + + ggml_metal_heap_reset(heap); + + return true; +} + static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { const size_t size_aligned = GGML_PAD(size, alignment); @@ -531,7 +596,7 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { id queue; // TODO: create heap per command buffer - struct ggml_metal_heap heap; + struct ggml_metal_heap * heap; dispatch_queue_t d_queue; @@ -757,24 +822,7 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - // allocate tmp heap with fixed size for testing - // TODO: factor into a function - { - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement - desc.size = 1024*1024; - - ctx->heap.n = 0; - - ctx->heap.obj = [device newHeapWithDescriptor:desc]; - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { - ctx->heap.bufs[i] = nil; - } - - [desc release]; - } + ctx->heap = ggml_metal_heap_init(device, 1024*1024); // load library if (ctx_dev->mtl_library == nil) { @@ -1218,8 +1266,9 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - [ctx->heap.obj release]; + [ctx->queue release]; + + ggml_metal_heap_free(ctx->heap); dispatch_release(ctx->d_queue); @@ -2217,12 +2266,6 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - //id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; - - //// save a reference to the heap-allocated buffer - //// TODO: simplify and check for available 
resources - //heap->bufs[heap->n++] = id_src0h; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); if (!id_src0h) { break; @@ -4717,7 +4760,7 @@ static enum ggml_status ggml_metal_graph_compute( [command_buffer waitUntilCompleted]; // TODO: per command buffer heap - ggml_metal_heap_reset(&ctx->heap); + ggml_metal_heap_reset(ctx->heap); MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { @@ -5134,7 +5177,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, ctx->heap); if (should_capture) { [encoder popDebugGroup]; @@ -5143,28 +5186,18 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder endEncoding]; - if (ctx->heap.fail == 0) { + if (ctx->heap->fail == 0) { break; } - // increase heap size - [ctx->heap.obj release]; + const size_t need = ctx->heap->need; - { - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement - desc.size = ctx->heap.need; - - GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, ctx->heap.need); - - ctx->heap.obj = [ctx->device newHeapWithDescriptor:desc]; + GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - [desc release]; + if (!ggml_metal_heap_resize(ctx->heap, need)) { + GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + break; } - - ggml_metal_heap_reset(&ctx->heap); } if (cb_idx < 2 || ctx->abort_callback == NULL) { From 2804db781201055947f812f59fe53a0a1b22c824 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 14:56:47 +0300 Subject: [PATCH 05/15] cont : heap for each cmd buffer [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 80 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 238e5d86ba609..cb9523506c1e5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -591,13 +591,16 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { return buf; } +struct ggml_metal_command_buffer { + id obj; + + struct ggml_metal_heap * heap; +}; + struct ggml_backend_metal_context { id device; id queue; - // TODO: create heap per command buffer - struct ggml_metal_heap * heap; - dispatch_queue_t d_queue; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; @@ -620,7 +623,8 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + //id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -822,8 +826,6 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - ctx->heap = ggml_metal_heap_init(device, 1024*1024); - // load library if (ctx_dev->mtl_library == nil) { ctx_dev->mtl_library = 
ggml_metal_load_library(device, ctx_dev->use_bfloat); @@ -877,7 +879,11 @@ @implementation GGMLMetalClass ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + // create 1MB heaps per command buffer + // these can be resized during compute when necessary + ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 1024*1024); } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1268,7 +1274,11 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; - ggml_metal_heap_free(ctx->heap); + //ggml_metal_heap_free(ctx->heap); + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + [ctx->cmd_bufs[i].obj release]; + ggml_metal_heap_free(ctx->cmd_bufs[i].heap); + } dispatch_release(ctx->d_queue); @@ -4712,25 +4722,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4739,16 +4749,16 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - // TODO: free main cb heap + ggml_metal_heap_reset(ctx->cmd_bufs[n_cb].heap); - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4756,23 +4766,22 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - // TODO: per command buffer heap - ggml_metal_heap_reset(ctx->heap); + ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == 
MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5155,12 +5164,13 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; int n_try = 3; while (n_try-- > 0) { - id encoder = [command_buffer computeCommandEncoder]; + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5177,7 +5187,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, heap); if (should_capture) { [encoder popDebugGroup]; @@ -5186,22 +5196,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder endEncoding]; - if (ctx->heap->fail == 0) { + if (heap->fail == 0) { break; } - const size_t need = ctx->heap->need; + const size_t need = heap->need; GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - if (!ggml_metal_heap_resize(ctx->heap, need)) { + if (!ggml_metal_heap_resize(heap, need)) { GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); break; } } if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } From e1dc4df76ff345265f1aa6ad5977bd14e2940973 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 15:27:39 +0300 Subject: [PATCH 06/15] cont : fix free --- ggml/src/ggml-metal/ggml-metal.m | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index cb9523506c1e5..f0b40e84efb7b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -883,7 +883,7 @@ @implementation GGMLMetalClass // create 1MB heaps per command buffer // these can be resized during compute when necessary - ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 1024*1024); + ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1274,9 +1274,9 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; - //ggml_metal_heap_free(ctx->heap); for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - [ctx->cmd_bufs[i].obj release]; + // ctx->cmd_bufs[i].obj is auto released + ggml_metal_heap_free(ctx->cmd_bufs[i].heap); } @@ -5167,7 +5167,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { id cmd_buf = ctx->cmd_bufs[cb_idx].obj; struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; - int n_try = 3; + int n_try = 2; while (n_try-- > 0) { id encoder = [cmd_buf computeCommandEncoder]; @@ -5200,6 +5200,21 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { break; } + if (heap->fail == 2) { + GGML_LOG_ERROR("%s: MTLHeap ran out of buffers, max = 
%d\n", __func__, heap->n); + break; + } + + if (heap->fail == 3) { + GGML_LOG_ERROR("%s: MTLHeap failed to allocate buffer\n", __func__); + break; + } + + if (n_try == 0) { + GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); + break; + } + const size_t need = heap->need; GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); From c77ccf0bf5f95184a3e39c2142364d9c7352c00e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 16:27:12 +0300 Subject: [PATCH 07/15] wip --- ggml/src/ggml-metal/ggml-metal.m | 131 ++++++++++++++++++------------- 1 file changed, 78 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f0b40e84efb7b..3faa137810e2d 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1579,7 +1579,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, @@ -1599,7 +1599,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1610,7 +1610,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -2214,6 +2214,8 @@ static void ggml_metal_encode_node( { GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + int nth = 32; // SIMD width id pipeline = nil; @@ -2278,7 +2280,9 @@ static void ggml_metal_encode_node( id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); if (!id_src0h) { - break; + //GGML_LOG_ERROR("%s: failed to allocate buffer for cpy, size = %zu, need = %zu, max available = %zu\n", + // __func__, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:32]); + return false; } if (src0->type == GGML_TYPE_F16) { @@ -4669,6 +4673,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4683,6 +4689,8 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; + int n_try = 64; + // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes @@ -4690,6 +4698,7 @@ static enum ggml_status ggml_metal_graph_compute( // // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 + while (n_try-- > 0) { @autoreleasepool { ctx->gf = gf; @@ -4752,8 +4761,6 @@ static enum ggml_status ggml_metal_graph_compute( id cmd_buf = ctx->cmd_bufs[n_cb].obj; [cmd_buf waitUntilCompleted]; - ggml_metal_heap_reset(ctx->cmd_bufs[n_cb].heap); - MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); @@ -4769,8 +4776,6 @@ static enum ggml_status ggml_metal_graph_compute( id cmd_buf = ctx->cmd_bufs[i].obj; [cmd_buf waitUntilCompleted]; - ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: 
command buffer %d failed with status %lu\n", __func__, i, status); @@ -4805,6 +4810,54 @@ static enum ggml_status ggml_metal_graph_compute( } } + bool retry = false; + + // check heap statuses + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + const size_t need = 4*heap->need; + + //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]); + + if (heap->fail == 0) { + ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); + + continue; + } + + if (heap->fail == 2) { + GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n); + return GGML_STATUS_ALLOC_FAILED; + } + + if (heap->fail == 3) { + GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n); + return GGML_STATUS_ALLOC_FAILED; + } + + //GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need); + + if (!ggml_metal_heap_resize(heap, need)) { + GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + return GGML_STATUS_ALLOC_FAILED; + } + + retry = true; + } + + if (!retry) { + break; + } + + //printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n"); + + if (n_try == 0) { + GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); + return GGML_STATUS_ALLOC_FAILED; + } + } + return GGML_STATUS_SUCCESS; } @@ -5167,64 +5220,36 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { id cmd_buf = ctx->cmd_bufs[cb_idx].obj; struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; - int n_try = 2; - - while (n_try-- > 0) { - id encoder = [cmd_buf computeCommandEncoder]; - - int node_start = 0; - int node_end = n_nodes_0; - - if (cb_idx < n_cb_l) { - node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); - node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); - } - - const bool should_capture = ctx->capture_next_compute; - - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } - - ggml_metal_encode_node(backend, idx, encoder, heap); + id encoder = [cmd_buf computeCommandEncoder]; - if (should_capture) { - [encoder popDebugGroup]; - } - } + int node_start = 0; + int node_end = n_nodes_0; - [encoder endEncoding]; + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? 
n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); + } - if (heap->fail == 0) { - break; - } + const bool should_capture = ctx->capture_next_compute; - if (heap->fail == 2) { - GGML_LOG_ERROR("%s: MTLHeap ran out of buffers, max = %d\n", __func__, heap->n); - break; + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - if (heap->fail == 3) { - GGML_LOG_ERROR("%s: MTLHeap failed to allocate buffer\n", __func__); - break; - } + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap); - if (n_try == 0) { - GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); - break; + if (should_capture) { + [encoder popDebugGroup]; } - const size_t need = heap->need; - - GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + if (!res) { break; } } + [encoder endEncoding]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf commit]; } From c2c0f0f7d875a871d51365e034563ea29466cdb0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 16:55:05 +0300 Subject: [PATCH 08/15] cont : fix alignment [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3faa137810e2d..e9fc033eb85e5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -568,7 +568,7 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { heap->need += size_aligned; - if (!heap->fail && heap->need > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { heap->fail = 1; } @@ -2278,11 +2278,13 @@ static bool ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); + id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 64*1024); if (!id_src0h) { - //GGML_LOG_ERROR("%s: failed to allocate buffer for cpy, size = %zu, need = %zu, max available = %zu\n", - // __func__, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:32]); - return false; + //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", + // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + return true; + } else { + //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); } if (src0->type == GGML_TYPE_F16) { @@ -4689,7 +4691,7 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; - int n_try = 64; + int n_try = 2; // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread @@ -4816,7 +4818,7 @@ static enum ggml_status ggml_metal_graph_compute( for (int i = 0; i <= n_cb; ++i) { struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - const size_t need = 4*heap->need; + const size_t need = heap->need; //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, 
need, heap->fail, [heap->obj currentAllocatedSize]); From cbb617edc6d981f3a8495f91a1b81739f808757e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 18:42:36 +0300 Subject: [PATCH 09/15] cont : not working .. [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 57 +++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e9fc033eb85e5..9aca9d953a8a5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -563,7 +563,9 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { return true; } -static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) { + const size_t alignment = 1024*1024; + const size_t size_aligned = GGML_PAD(size, alignment); heap->need += size_aligned; @@ -1583,7 +1585,8 @@ static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_metal_heap * heap) { + struct ggml_metal_heap * heap, + bool no_compute) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1621,6 +1624,28 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + id h_src0 = nil; + switch (dst->op) { + case GGML_OP_SOFT_MAX: + { + h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0)); + if (!h_src0) { + //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", + // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + return false; + } else { + //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); + } + } break; + default: + { + } break; + } + + if (no_compute) { + return true; + } + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -2278,15 +2303,6 @@ static bool ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 64*1024); - if (!id_src0h) { - //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", - // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); - return true; - } else { - //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); - } - if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; } else { @@ -2294,7 +2310,7 @@ static bool ggml_metal_encode_node( } [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src0h offset:0 atIndex:2]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); @@ -2315,11 +2331,11 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0h offset:0 atIndex:0]; + [encoder setBuffer:h_src0 offset:0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0h offset:0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:1]; } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&args length:sizeof(args) atIndex:3]; @@ -4732,6 +4748,12 @@ static enum ggml_status ggml_metal_graph_compute( } } + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + // the main thread commits the first few commands immediately // cmd_buf[n_cb] { @@ -4824,6 +4846,7 @@ static enum ggml_status ggml_metal_graph_compute( if (heap->fail == 0) { ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); + [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; continue; } @@ -5234,19 +5257,21 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + bool no_compute = false; + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap); + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute); if (should_capture) { [encoder popDebugGroup]; } if (!res) { - break; + no_compute = true; } } From 91d5dc5a2be410478f77fcbcf04ec4e1a3196ffc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Apr 2025 12:27:15 +0300 Subject: [PATCH 10/15] cont : heap allocation now works [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 116 ++++++++++++------------------- 1 file changed, 46 insertions(+), 70 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9aca9d953a8a5..149999cb30723 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { [desc release]; + //GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]); + ggml_metal_heap_reset(heap); return true; } -static id 
ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) { - const size_t alignment = 1024*1024; +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) { + // note: this is probably more than needed, but just in case + const size_t alignment = 1024; const size_t size_aligned = GGML_PAD(size, alignment); + //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail); + heap->need += size_aligned; + if (no_alloc) { + return nil; + } + if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { heap->fail = 1; } @@ -883,7 +892,7 @@ @implementation GGMLMetalClass for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - // create 1MB heaps per command buffer + // create initial small heaps per command buffer // these can be resized during compute when necessary ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); } @@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + const bool no_alloc = no_compute; + + // heap buffers for temporary data id h_src0 = nil; + switch (dst->op) { case GGML_OP_SOFT_MAX: { - h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0)); - if (!h_src0) { - //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", - // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); + if (!no_alloc && !h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", + __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); return false; - } else { - //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); } } break; default: @@ -4707,8 +4718,6 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; - int n_try = 2; - // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes @@ -4716,7 +4725,6 @@ static enum ggml_status ggml_metal_graph_compute( // // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 - while (n_try-- > 0) { @autoreleasepool { ctx->gf = gf; @@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute( } } - bool retry = false; - - // check heap statuses - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - const size_t need = heap->need; - - //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]); - - if (heap->fail == 0) { - ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; - - continue; - } - - if (heap->fail == 2) { - GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n); - return GGML_STATUS_ALLOC_FAILED; - } - - if (heap->fail == 3) { - 
GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n); - return GGML_STATUS_ALLOC_FAILED; - } - - //GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need); - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); - return GGML_STATUS_ALLOC_FAILED; - } - - retry = true; - } - - if (!retry) { - break; - } - - //printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n"); - - if (n_try == 0) { - GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); - return GGML_STATUS_ALLOC_FAILED; - } - } - return GGML_STATUS_SUCCESS; } @@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - bool no_compute = false; + ggml_metal_heap_reset(heap); for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } + ggml_metal_encode_node(backend, idx, encoder, heap, true); + } + + bool can_compute = true; - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute); + if (heap->need > [heap->obj size]) { + const size_t need = heap->need; - if (should_capture) { - [encoder popDebugGroup]; + if (!ggml_metal_heap_resize(heap, need)) { + GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need); + can_compute = false; } + } + + if (can_compute) { + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false); - if (!res) { - no_compute = true; + if (should_capture) { + [encoder popDebugGroup]; + } + + if (!res) { + break; + } } } From 455691c52fd793141c8d2466660c417ae9d4dec7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Apr 2025 13:32:36 +0300 Subject: [PATCH 11/15] cont : use MTLHeapTypePlacement ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 85 ++++++++++++++++---------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 149999cb30723..42519b7e6768b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,9 +19,6 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 -// max number of buffers that can be allocated on the heap per command buffer -#define GGML_METAL_MAX_HEAP_BUFFERS 64 - #ifndef TARGET_OS_VISION #define TARGET_OS_VISION 0 #endif @@ -472,14 +469,15 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte }; struct ggml_metal_heap { - int n; int fail; + size_t offs; size_t need; id device; id obj; - id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; + + NSMutableArray * bufs; }; static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { @@ -488,7 +486,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; desc.storageMode = MTLStorageModePrivate; desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.type = MTLHeapTypePlacement; desc.size = size; heap->device 
= device; @@ -501,39 +499,35 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte return false; } - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { - heap->bufs[i] = nil; - } - [desc release]; + heap->bufs = [[NSMutableArray alloc] init]; + return heap; } +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->fail = 0; + heap->offs = 0; + heap->need = 0; + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; +} + static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { if (heap == nil) { return; } - [heap->obj release]; - - free(heap); -} - -static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->n = 0; - heap->fail = 0; - heap->need = 0; + ggml_metal_heap_reset(heap); - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; i++) { - if (heap->bufs[i]) { - [heap->bufs[i] release]; - heap->bufs[i] = nil; - continue; - } + [heap->obj release]; + [heap->bufs release]; - break; - } + free(heap); } static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { @@ -546,7 +540,7 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; desc.storageMode = MTLStorageModePrivate; desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.type = MTLHeapTypePlacement; desc.size = size; heap->obj = [heap->device newHeapWithDescriptor:desc]; @@ -571,33 +565,32 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { const size_t size_aligned = GGML_PAD(size, alignment); - //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail); + heap->offs += size_aligned; + heap->need = MAX(heap->need, heap->offs + size_aligned); - heap->need += size_aligned; + //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, offs = %zu, need = %zu\n", __func__, size, size_aligned, offs, heap->offs, heap->need); if (no_alloc) { return nil; } - if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + if (!heap->fail && heap->offs + size_aligned > [heap->obj size]) { heap->fail = 1; } - if (!heap->fail && heap->n >= GGML_METAL_MAX_HEAP_BUFFERS) { - heap->fail = 2; - } - if (heap->fail) { return nil; } - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; if (!buf) { heap->fail = 3; return nil; } - heap->bufs[heap->n++] = buf; + [heap->bufs addObject:buf]; + + //GGML_LOG_INFO("%s: allocated buffer, size = %zu, offs = %zu, heap size = %zu, heap used = %zu\n", __func__, size_aligned, offs, [heap->obj size], [heap->obj usedSize]); return buf; } @@ -634,7 +627,6 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - //id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true @@ -1638,13 +1630,16 @@ static bool ggml_metal_encode_node( // heap buffers for temporary data id h_src0 = nil; + // always allocate buffers from the start of the heap for the current node + heap->offs = 0; + switch (dst->op) { case GGML_OP_SOFT_MAX: { 
h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); if (!no_alloc && !h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", - __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); + GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, offs = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", + __func__, idx, ggml_nbytes(src0), heap->offs, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); return false; } } break; @@ -2250,8 +2245,6 @@ static bool ggml_metal_encode_node( { GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - int nth = 32; // SIMD width id pipeline = nil; @@ -4836,6 +4829,12 @@ static enum ggml_status ggml_metal_graph_compute( [next_buffer commit]; } + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; + } + if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; @@ -5233,6 +5232,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { } } + //GGML_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXX\n"); + if (can_compute) { for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { From 6f413276faa48260f006dada7deb9237e01778b5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 14 Apr 2025 13:48:10 +0300 Subject: [PATCH 12/15] metal : use dynamic MTLHeap allocations ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 331 +++++++++++++++++-------------- 1 file changed, 185 insertions(+), 146 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 42519b7e6768b..a95ad00d10780 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -469,13 +469,13 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte }; struct ggml_metal_heap { - int fail; + int n_unused; // number of times the heap was unused + + int64_t n_alloc; size_t offs; - size_t need; - id device; - id obj; + id obj; NSMutableArray * bufs; }; @@ -489,7 +489,9 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte desc.type = MTLHeapTypePlacement; desc.size = size; - heap->device = device; + heap->n_unused = 0; + heap->n_alloc = 0; + heap->obj = [device newHeapWithDescriptor:desc]; if (!heap->obj) { GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); @@ -507,14 +509,20 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte } static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->fail = 0; heap->offs = 0; - heap->need = 0; + + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } for (id buf in heap->bufs) { [buf release]; } [heap->bufs removeAllObjects]; + + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; } static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { @@ -530,75 +538,10 @@ static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { free(heap); } -static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { - if (heap == nil) { - return false; - } - 
- [heap->obj release]; - - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypePlacement; - desc.size = size; - - heap->obj = [heap->device newHeapWithDescriptor:desc]; - if (!heap->obj) { - GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); - - return false; - } - - [desc release]; - - //GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]); - - ggml_metal_heap_reset(heap); - - return true; -} - -static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) { - // note: this is probably more than needed, but just in case - const size_t alignment = 1024; - - const size_t size_aligned = GGML_PAD(size, alignment); - - heap->offs += size_aligned; - heap->need = MAX(heap->need, heap->offs + size_aligned); - - //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, offs = %zu, need = %zu\n", __func__, size, size_aligned, offs, heap->offs, heap->need); - - if (no_alloc) { - return nil; - } - - if (!heap->fail && heap->offs + size_aligned > [heap->obj size]) { - heap->fail = 1; - } - - if (heap->fail) { - return nil; - } - - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (!buf) { - heap->fail = 3; - return nil; - } - - [heap->bufs addObject:buf]; - - //GGML_LOG_INFO("%s: allocated buffer, size = %zu, offs = %zu, heap size = %zu, heap used = %zu\n", __func__, size_aligned, offs, [heap->obj size], [heap->obj usedSize]); - - return buf; -} - struct ggml_metal_command_buffer { id obj; - struct ggml_metal_heap * heap; + struct ggml_metal_mem_pool * mem_pool; }; struct ggml_backend_metal_context { @@ -634,6 +577,154 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void * abort_callback_data; }; +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr + +@end + +struct ggml_metal_mem_pool { + id device; + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu\n", __func__, [mem_pool->heaps count]); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", 
__func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + heap_ptr.data = heap; + + GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + ggml_metal_heap_reset(heap_ptr.data); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + + return buf; +} + // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file @@ -884,9 +975,8 @@ @implementation GGMLMetalClass for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - // create initial small heaps per command buffer - // these can be resized during compute when necessary - ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1280,7 +1370,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { // ctx->cmd_bufs[i].obj 
is auto released - ggml_metal_heap_free(ctx->cmd_bufs[i].heap); + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } dispatch_release(ctx->d_queue); @@ -1586,8 +1676,7 @@ static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_metal_heap * heap, - bool no_compute) { + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1625,32 +1714,7 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } - const bool no_alloc = no_compute; - - // heap buffers for temporary data - id h_src0 = nil; - - // always allocate buffers from the start of the heap for the current node - heap->offs = 0; - - switch (dst->op) { - case GGML_OP_SOFT_MAX: - { - h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); - if (!no_alloc && !h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, offs = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", - __func__, idx, ggml_nbytes(src0), heap->offs, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); - return false; - } - } break; - default: - { - } break; - } - - if (no_compute) { - return true; - } + ggml_metal_mem_pool_clear(mem_pool); const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; @@ -2288,6 +2352,12 @@ static bool ggml_metal_encode_node( // cpy to tmp buffer in MTLHeap + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -4749,12 +4819,6 @@ static enum ggml_status ggml_metal_graph_compute( } } - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - } - // the main thread commits the first few commands immediately // cmd_buf[n_cb] { @@ -4829,12 +4893,6 @@ static enum ggml_status ggml_metal_graph_compute( [next_buffer commit]; } - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; - } - if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; @@ -5201,7 +5259,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id cmd_buf = ctx->cmd_bufs[cb_idx].obj; - struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; id encoder = [cmd_buf computeCommandEncoder]; @@ -5215,40 +5272,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - ggml_metal_heap_reset(heap); + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); for (int idx = node_start; idx < node_end; ++idx) { - ggml_metal_encode_node(backend, idx, encoder, heap, true); - } - - bool can_compute = true; - - if (heap->need > [heap->obj size]) { - const size_t need = heap->need; - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need); - can_compute = false; + if (should_capture) { + 
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - } - - //GGML_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXX\n"); - - if (can_compute) { - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); - if (should_capture) { - [encoder popDebugGroup]; - } + if (should_capture) { + [encoder popDebugGroup]; + } - if (!res) { - break; - } + if (!res) { + break; } } From 69f7b0920ddcfe16c468b471dfb0c3c3939f99ae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Apr 2025 14:59:16 +0300 Subject: [PATCH 13/15] metal : add comments --- ggml/src/ggml-metal/ggml-metal.m | 116 +++++++++++++++++++------------ 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a95ad00d10780..b2e810f10c094 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -468,16 +468,21 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + struct ggml_metal_heap { int n_unused; // number of times the heap was unused - int64_t n_alloc; + int64_t n_alloc; // total number of buffer allocations in this heap across all computes + // current offset in the heap - we reset this after each node in order to reuse the memory size_t offs; id obj; - NSMutableArray * bufs; + NSMutableArray * bufs; // the currently allocated MTLBuffer objects in this heap }; static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { @@ -511,6 +516,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { heap->offs = 0; + // count how many graph computes the heap ended up being unused if ([heap->bufs count] > 0) { heap->n_unused = 0; } else { @@ -522,6 +528,8 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { } [heap->bufs removeAllObjects]; + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; } @@ -538,45 +546,6 @@ static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { free(heap); } -struct ggml_metal_command_buffer { - id obj; - - struct ggml_metal_mem_pool * mem_pool; -}; - -struct ggml_backend_metal_context { - id device; - id queue; - - dispatch_queue_t d_queue; - - struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; - - // capture state - bool capture_next_compute; - bool capture_started; - - id capture_scope; - - // command buffer state - int n_cb; // number of extra threads used to submit the command buffers - int n_nodes_0; // number of nodes submitted by the main thread - int n_nodes_1; // remaining number of nodes submitted by the n_cb threads - int n_nodes_per_cb; - - struct ggml_cgraph * gf; - - // the callback given to the thread pool - void (^encode_async)(size_t ith); - - // n_cb command buffers + 1 used by the main thread - struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; - - // abort ggml_metal_graph_compute if callback 
returns true - ggml_abort_callback abort_callback; - void * abort_callback_data; -}; - @interface ggml_metal_heap_ptr : NSObject @property (nonatomic, assign) struct ggml_metal_heap * data; @@ -584,12 +553,17 @@ @interface ggml_metal_heap_ptr : NSObject @end @implementation ggml_metal_heap_ptr - @end +// +// ggml_metal_mem_pool +// + struct ggml_metal_mem_pool { id device; + int n_heaps; // total number of heaps ever created (including those that were removed) + NSMutableArray * heaps; NSMutableArray * heaps_to_remove; }; @@ -597,6 +571,8 @@ @implementation ggml_metal_heap_ptr static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + mem_pool->n_heaps = 0; + mem_pool->heaps = [[NSMutableArray alloc] init]; mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; @@ -604,7 +580,7 @@ @implementation ggml_metal_heap_ptr } static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { - GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu\n", __func__, [mem_pool->heaps count]); + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); size_t size_all = 0; size_t size_cur = 0; @@ -679,6 +655,9 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { struct ggml_metal_heap * heap = ptr.data; if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc if ([heap->bufs count] == 0) { [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; } @@ -702,11 +681,15 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); - heap_ptr.data = heap; + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } - GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); - ggml_metal_heap_reset(heap_ptr.data); + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; @@ -721,10 +704,51 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { [heap->bufs addObject:buf]; [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; return buf; } +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + +struct ggml_backend_metal_context { + id device; + id queue; + + dispatch_queue_t d_queue; + + struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + id capture_scope; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted 
by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph * gf; + + // the callback given to the thread pool + void (^encode_async)(size_t ith); + + // n_cb command buffers + 1 used by the main thread + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + + // abort ggml_metal_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void * abort_callback_data; +}; + // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file From e5b7f7e2b54a2c879854842b7cf17e8ac3468d1e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Apr 2025 15:01:13 +0300 Subject: [PATCH 14/15] metal : disable softmax use of mem_pool ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index b2e810f10c094..027044f1ccb6f 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -2374,6 +2374,8 @@ static bool ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 // cpy to tmp buffer in MTLHeap id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); @@ -2382,6 +2384,8 @@ static bool ggml_metal_encode_node( return false; } + offs_src0 = 0; + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -2415,6 +2419,9 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; +#else + id h_src0 = id_src0; +#endif // softmax ggml_metal_kargs_soft_max args = { @@ -2429,11 +2436,11 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:h_src0 offset:0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:h_src0 offset:0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&args length:sizeof(args) atIndex:3]; From dc4c04816ee95181ce3c8426f91e10ca4c29fe7c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Apr 2025 16:15:19 +0300 Subject: [PATCH 15/15] metal : final touches --- ggml/src/ggml-metal/ggml-metal.m | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 027044f1ccb6f..da5857542752c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -473,16 +473,19 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte // struct ggml_metal_heap { - int n_unused; // number of times the heap was unused + // number of times the heap was unused + int n_unused; - int64_t n_alloc; // total number of buffer allocations in this heap across all computes + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; // current offset in the heap - we reset this after each node in order to reuse the memory size_t offs; + // the currently allocated MTLBuffer objects in this heap id obj; - NSMutableArray * bufs; // the currently allocated MTLBuffer objects in this 
heap
+    NSMutableArray * bufs;
 };
 
 static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
@@ -586,7 +589,7 @@ static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
     size_t size_cur = 0;
 
     for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
-        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
+        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
         GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc);
         GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused);
         GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);