From c254b21307bb4d30f0c0f6dc82111f6607f5c61c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 14:50:41 +0300 Subject: [PATCH 01/15] metal : add memory pool for temp allocs (wip) [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 83 ++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9f1c6c6ccc09f..ba981333faa80 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -470,6 +470,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte struct ggml_backend_metal_context { id queue; + id heap; dispatch_queue_t d_queue; @@ -693,6 +694,19 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); + // allocate tmp heap with fixed size for testing + // TODO: figure out how to dynamically resize it + { + MTLHeapDescriptor *heapDescriptor = [[MTLHeapDescriptor alloc] init]; + heapDescriptor.storageMode = MTLStorageModePrivate; + heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; + heapDescriptor.size = 32*1024*1024; + + ctx->heap = [device newHeapWithDescriptor:heapDescriptor]; + + [heapDescriptor release]; + } + // load library if (ctx_dev->mtl_library == nil) { ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); @@ -1136,6 +1150,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); [ctx->queue release]; + [ctx->heap release]; dispatch_release(ctx->d_queue); @@ -1439,7 +1454,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex static void ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + id heap) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2111,26 +2127,65 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { + // cpy to tmp buffer in MTLHeap + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + id id_src0h = [heap newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src0h offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / 
ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src0h offset:0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src0h offset:0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4992,7 +5047,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + ggml_metal_encode_node(backend, idx, encoder, ctx->heap); if (should_capture) { [encoder popDebugGroup]; From 2341e7c688882ffc480a47dabb87db9cec620fc5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 16:02:59 +0300 Subject: [PATCH 02/15] cont : free buffers from the heap --- ggml/src/ggml-metal/ggml-metal.m | 53 +++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index ba981333faa80..46b7f16934cbf 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,6 +19,9 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 +// max number of buffers that can be allocated on the heap per command buffer +#define GGML_METAL_MAX_HEAP_BUFFERS 64 + #ifndef TARGET_OS_VISION #define TARGET_OS_VISION 0 #endif @@ -468,9 +471,18 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +struct ggml_backend_metal_heap { + int n; + + id obj; + id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; +}; + struct ggml_backend_metal_context { id queue; - id heap; + + // TODO: create heap per command buffer + struct ggml_backend_metal_heap heap; dispatch_queue_t d_queue; @@ -702,7 +714,12 @@ @implementation GGMLMetalClass heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; heapDescriptor.size = 32*1024*1024; - ctx->heap = [device newHeapWithDescriptor:heapDescriptor]; + ctx->heap.n = 0; + + ctx->heap.obj = [device newHeapWithDescriptor:heapDescriptor]; + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { + ctx->heap.bufs[i] = nil; + } [heapDescriptor release]; } @@ -1149,8 +1166,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - [ctx->heap release]; + [ctx->queue release]; + [ctx->heap.obj release]; dispatch_release(ctx->d_queue); @@ -1455,7 +1472,7 @@ static void ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - id heap) { + struct ggml_backend_metal_heap * heap) { struct ggml_backend_metal_context * ctx = 
backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2148,7 +2165,11 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = [heap newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + + // save a reference to the heap-allocated buffer + // TODO: simplify and check for available resources + heap->bufs[heap->n++] = id_src0h; if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; @@ -4621,6 +4642,8 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[n_cb]; [command_buffer waitUntilCompleted]; + // TODO: free main cb heap + MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); @@ -4636,6 +4659,22 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[i]; [command_buffer waitUntilCompleted]; + // free buffers from the heap + { + size_t size_allocated = [ctx->heap.obj currentAllocatedSize]; + size_t size_used = [ctx->heap.obj usedSize]; + GGML_LOG_INFO("%s: command buffer %d, allocated = %zu, used = %zu, n = %d\n", __func__, i, size_allocated, size_used, ctx->heap.n); + + for (int j = 0; j < ctx->heap.n; ++j) { + id buf = ctx->heap.bufs[j]; + [buf release]; + + ctx->heap.bufs[j] = nil; + } + + ctx->heap.n = 0; + } + MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); @@ -5047,7 +5086,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); if (should_capture) { [encoder popDebugGroup]; From 37450314b552e7c7a46954d4527059444cf9cf18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Apr 2025 16:55:01 +0300 Subject: [PATCH 03/15] cont : resize heap [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 164 ++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 46b7f16934cbf..f48c64605f2e0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -471,18 +471,67 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; -struct ggml_backend_metal_heap { +// TODO: use MTLHeapTypePlacement and reset offset after every node +struct ggml_metal_heap { int n; + int fail; + + size_t need; id obj; id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; }; +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->n = 0; + heap->fail = 0; + heap->need = 0; + + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; i++) { + if (heap->bufs[i]) { + [heap->bufs[i] release]; + heap->bufs[i] = nil; + continue; + } + + break; + } +} + +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { + const size_t size_aligned = GGML_PAD(size, alignment); + + heap->need += size_aligned; + + if 
(!heap->fail && heap->need > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + heap->fail = 1; + } + + if (!heap->fail && heap->n >= GGML_METAL_MAX_HEAP_BUFFERS) { + heap->fail = 2; + } + + if (heap->fail) { + return nil; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + if (!buf) { + heap->fail = 3; + return nil; + } + + heap->bufs[heap->n++] = buf; + + return buf; +} + struct ggml_backend_metal_context { + id device; id queue; // TODO: create heap per command buffer - struct ggml_backend_metal_heap heap; + struct ggml_metal_heap heap; dispatch_queue_t d_queue; @@ -696,9 +745,11 @@ @implementation GGMLMetalClass struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -707,21 +758,22 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // allocate tmp heap with fixed size for testing - // TODO: figure out how to dynamically resize it + // TODO: factor into a function { - MTLHeapDescriptor *heapDescriptor = [[MTLHeapDescriptor alloc] init]; - heapDescriptor.storageMode = MTLStorageModePrivate; - heapDescriptor.cpuCacheMode = MTLCPUCacheModeDefaultCache; - heapDescriptor.size = 32*1024*1024; + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = 1024*1024; ctx->heap.n = 0; - ctx->heap.obj = [device newHeapWithDescriptor:heapDescriptor]; + ctx->heap.obj = [device newHeapWithDescriptor:desc]; for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { ctx->heap.bufs[i] = nil; } - [heapDescriptor release]; + [desc release]; } // load library @@ -1472,7 +1524,7 @@ static void ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_backend_metal_heap * heap) { + struct ggml_metal_heap * heap) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2165,11 +2217,16 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; + //id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; - // save a reference to the heap-allocated buffer - // TODO: simplify and check for available resources - heap->bufs[heap->n++] = id_src0h; + //// save a reference to the heap-allocated buffer + //// TODO: simplify and check for available resources + //heap->bufs[heap->n++] = id_src0h; + + id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); + if (!id_src0h) { + break; + } if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; @@ -4659,21 +4716,8 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = ctx->command_buffers[i]; [command_buffer waitUntilCompleted]; - // free buffers from the heap - { - size_t size_allocated = [ctx->heap.obj currentAllocatedSize]; - size_t size_used = 
[ctx->heap.obj usedSize]; - GGML_LOG_INFO("%s: command buffer %d, allocated = %zu, used = %zu, n = %d\n", __func__, i, size_allocated, size_used, ctx->heap.n); - - for (int j = 0; j < ctx->heap.n; ++j) { - id buf = ctx->heap.bufs[j]; - [buf release]; - - ctx->heap.bufs[j] = nil; - } - - ctx->heap.n = 0; - } + // TODO: per command buffer heap + ggml_metal_heap_reset(&ctx->heap); MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { @@ -5069,31 +5113,59 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; - int node_start = 0; - int node_end = n_nodes_0; + int n_try = 3; - if (cb_idx < n_cb_l) { - node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); - node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); - } + while (n_try-- > 0) { + id encoder = [command_buffer computeCommandEncoder]; - const bool should_capture = ctx->capture_next_compute; + int node_start = 0; + int node_end = n_nodes_0; - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); } - ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); + const bool should_capture = ctx->capture_next_compute; + + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); - if (should_capture) { - [encoder popDebugGroup]; + if (should_capture) { + [encoder popDebugGroup]; + } } - } - [encoder endEncoding]; + [encoder endEncoding]; + + if (ctx->heap.fail == 0) { + break; + } + + // increase heap size + [ctx->heap.obj release]; + + { + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = ctx->heap.need; + + GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, ctx->heap.need); + + ctx->heap.obj = [ctx->device newHeapWithDescriptor:desc]; + + [desc release]; + } + + ggml_metal_heap_reset(&ctx->heap); + } if (cb_idx < 2 || ctx->abort_callback == NULL) { [command_buffer commit]; From 9433c504c09b9e887886e82a7b237a92d420634a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 14:49:49 +0300 Subject: [PATCH 04/15] cont : refactor heap [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 125 +++++++++++++++++++------------ 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f48c64605f2e0..238e5d86ba609 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -471,17 +471,55 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; -// TODO: use MTLHeapTypePlacement and reset offset after every node struct ggml_metal_heap { int n; int 
fail; size_t need; + id device; id obj; id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; }; +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = size; + + heap->device = device; + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { + heap->bufs[i] = nil; + } + + [desc release]; + + return heap; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + [heap->obj release]; + + free(heap); +} + static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { heap->n = 0; heap->fail = 0; @@ -498,6 +536,33 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { } } +static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { + if (heap == nil) { + return false; + } + + [heap->obj release]; + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.size = size; + + heap->obj = [heap->device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + return false; + } + + [desc release]; + + ggml_metal_heap_reset(heap); + + return true; +} + static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { const size_t size_aligned = GGML_PAD(size, alignment); @@ -531,7 +596,7 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { id queue; // TODO: create heap per command buffer - struct ggml_metal_heap heap; + struct ggml_metal_heap * heap; dispatch_queue_t d_queue; @@ -757,24 +822,7 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - // allocate tmp heap with fixed size for testing - // TODO: factor into a function - { - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement - desc.size = 1024*1024; - - ctx->heap.n = 0; - - ctx->heap.obj = [device newHeapWithDescriptor:desc]; - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { - ctx->heap.bufs[i] = nil; - } - - [desc release]; - } + ctx->heap = ggml_metal_heap_init(device, 1024*1024); // load library if (ctx_dev->mtl_library == nil) { @@ -1218,8 +1266,9 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - [ctx->heap.obj release]; + [ctx->queue release]; + + ggml_metal_heap_free(ctx->heap); dispatch_release(ctx->d_queue); @@ -2217,12 +2266,6 @@ static void ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - //id id_src0h = [heap->obj newBufferWithLength:ggml_nbytes(src0) options:MTLResourceStorageModePrivate]; - - //// save a reference to the heap-allocated buffer - //// TODO: simplify and check for available 
resources - //heap->bufs[heap->n++] = id_src0h; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); if (!id_src0h) { break; @@ -4717,7 +4760,7 @@ static enum ggml_status ggml_metal_graph_compute( [command_buffer waitUntilCompleted]; // TODO: per command buffer heap - ggml_metal_heap_reset(&ctx->heap); + ggml_metal_heap_reset(ctx->heap); MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { @@ -5134,7 +5177,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, &ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, ctx->heap); if (should_capture) { [encoder popDebugGroup]; @@ -5143,28 +5186,18 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder endEncoding]; - if (ctx->heap.fail == 0) { + if (ctx->heap->fail == 0) { break; } - // increase heap size - [ctx->heap.obj release]; + const size_t need = ctx->heap->need; - { - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement - desc.size = ctx->heap.need; - - GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, ctx->heap.need); - - ctx->heap.obj = [ctx->device newHeapWithDescriptor:desc]; + GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - [desc release]; + if (!ggml_metal_heap_resize(ctx->heap, need)) { + GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + break; } - - ggml_metal_heap_reset(&ctx->heap); } if (cb_idx < 2 || ctx->abort_callback == NULL) { From 2804db781201055947f812f59fe53a0a1b22c824 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 14:56:47 +0300 Subject: [PATCH 05/15] cont : heap for each cmd buffer [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 80 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 238e5d86ba609..cb9523506c1e5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -591,13 +591,16 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { return buf; } +struct ggml_metal_command_buffer { + id obj; + + struct ggml_metal_heap * heap; +}; + struct ggml_backend_metal_context { id device; id queue; - // TODO: create heap per command buffer - struct ggml_metal_heap * heap; - dispatch_queue_t d_queue; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; @@ -620,7 +623,8 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + //id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -822,8 +826,6 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - ctx->heap = ggml_metal_heap_init(device, 1024*1024); - // load library if (ctx_dev->mtl_library == nil) { ctx_dev->mtl_library = 
ggml_metal_load_library(device, ctx_dev->use_bfloat); @@ -877,7 +879,11 @@ @implementation GGMLMetalClass ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + // create 1MB heaps per command buffer + // these can be resized during compute when necessary + ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 1024*1024); } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1268,7 +1274,11 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; - ggml_metal_heap_free(ctx->heap); + //ggml_metal_heap_free(ctx->heap); + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + [ctx->cmd_bufs[i].obj release]; + ggml_metal_heap_free(ctx->cmd_bufs[i].heap); + } dispatch_release(ctx->d_queue); @@ -4712,25 +4722,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4739,16 +4749,16 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - // TODO: free main cb heap + ggml_metal_heap_reset(ctx->cmd_bufs[n_cb].heap); - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4756,23 +4766,22 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - // TODO: per command buffer heap - ggml_metal_heap_reset(ctx->heap); + ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == 
MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5155,12 +5164,13 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; int n_try = 3; while (n_try-- > 0) { - id encoder = [command_buffer computeCommandEncoder]; + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5177,7 +5187,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder, ctx->heap); + ggml_metal_encode_node(backend, idx, encoder, heap); if (should_capture) { [encoder popDebugGroup]; @@ -5186,22 +5196,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder endEncoding]; - if (ctx->heap->fail == 0) { + if (heap->fail == 0) { break; } - const size_t need = ctx->heap->need; + const size_t need = heap->need; GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - if (!ggml_metal_heap_resize(ctx->heap, need)) { + if (!ggml_metal_heap_resize(heap, need)) { GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); break; } } if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } From e1dc4df76ff345265f1aa6ad5977bd14e2940973 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 15:27:39 +0300 Subject: [PATCH 06/15] cont : fix free --- ggml/src/ggml-metal/ggml-metal.m | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index cb9523506c1e5..f0b40e84efb7b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -883,7 +883,7 @@ @implementation GGMLMetalClass // create 1MB heaps per command buffer // these can be resized during compute when necessary - ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 1024*1024); + ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1274,9 +1274,9 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; - //ggml_metal_heap_free(ctx->heap); for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - [ctx->cmd_bufs[i].obj release]; + // ctx->cmd_bufs[i].obj is auto released + ggml_metal_heap_free(ctx->cmd_bufs[i].heap); } @@ -5167,7 +5167,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { id cmd_buf = ctx->cmd_bufs[cb_idx].obj; struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; - int n_try = 3; + int n_try = 2; while (n_try-- > 0) { id encoder = [cmd_buf computeCommandEncoder]; @@ -5200,6 +5200,21 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { break; } + if (heap->fail == 2) { + GGML_LOG_ERROR("%s: MTLHeap ran out of buffers, max = 
%d\n", __func__, heap->n); + break; + } + + if (heap->fail == 3) { + GGML_LOG_ERROR("%s: MTLHeap failed to allocate buffer\n", __func__); + break; + } + + if (n_try == 0) { + GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); + break; + } + const size_t need = heap->need; GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); From c77ccf0bf5f95184a3e39c2142364d9c7352c00e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 16:27:12 +0300 Subject: [PATCH 07/15] wip --- ggml/src/ggml-metal/ggml-metal.m | 131 ++++++++++++++++++------------- 1 file changed, 78 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f0b40e84efb7b..3faa137810e2d 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1579,7 +1579,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, @@ -1599,7 +1599,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1610,7 +1610,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -2214,6 +2214,8 @@ static void ggml_metal_encode_node( { GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + int nth = 32; // SIMD width id pipeline = nil; @@ -2278,7 +2280,9 @@ static void ggml_metal_encode_node( id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); if (!id_src0h) { - break; + //GGML_LOG_ERROR("%s: failed to allocate buffer for cpy, size = %zu, need = %zu, max available = %zu\n", + // __func__, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:32]); + return false; } if (src0->type == GGML_TYPE_F16) { @@ -4669,6 +4673,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4683,6 +4689,8 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; + int n_try = 64; + // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes @@ -4690,6 +4698,7 @@ static enum ggml_status ggml_metal_graph_compute( // // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 + while (n_try-- > 0) { @autoreleasepool { ctx->gf = gf; @@ -4752,8 +4761,6 @@ static enum ggml_status ggml_metal_graph_compute( id cmd_buf = ctx->cmd_bufs[n_cb].obj; [cmd_buf waitUntilCompleted]; - ggml_metal_heap_reset(ctx->cmd_bufs[n_cb].heap); - MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); @@ -4769,8 +4776,6 @@ static enum ggml_status ggml_metal_graph_compute( id cmd_buf = ctx->cmd_bufs[i].obj; [cmd_buf waitUntilCompleted]; - ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: 
command buffer %d failed with status %lu\n", __func__, i, status); @@ -4805,6 +4810,54 @@ static enum ggml_status ggml_metal_graph_compute( } } + bool retry = false; + + // check heap statuses + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + const size_t need = 4*heap->need; + + //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]); + + if (heap->fail == 0) { + ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); + + continue; + } + + if (heap->fail == 2) { + GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n); + return GGML_STATUS_ALLOC_FAILED; + } + + if (heap->fail == 3) { + GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n); + return GGML_STATUS_ALLOC_FAILED; + } + + //GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need); + + if (!ggml_metal_heap_resize(heap, need)) { + GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + return GGML_STATUS_ALLOC_FAILED; + } + + retry = true; + } + + if (!retry) { + break; + } + + //printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n"); + + if (n_try == 0) { + GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); + return GGML_STATUS_ALLOC_FAILED; + } + } + return GGML_STATUS_SUCCESS; } @@ -5167,64 +5220,36 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { id cmd_buf = ctx->cmd_bufs[cb_idx].obj; struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; - int n_try = 2; - - while (n_try-- > 0) { - id encoder = [cmd_buf computeCommandEncoder]; - - int node_start = 0; - int node_end = n_nodes_0; - - if (cb_idx < n_cb_l) { - node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); - node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); - } - - const bool should_capture = ctx->capture_next_compute; - - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } - - ggml_metal_encode_node(backend, idx, encoder, heap); + id encoder = [cmd_buf computeCommandEncoder]; - if (should_capture) { - [encoder popDebugGroup]; - } - } + int node_start = 0; + int node_end = n_nodes_0; - [encoder endEncoding]; + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? 
n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); + } - if (heap->fail == 0) { - break; - } + const bool should_capture = ctx->capture_next_compute; - if (heap->fail == 2) { - GGML_LOG_ERROR("%s: MTLHeap ran out of buffers, max = %d\n", __func__, heap->n); - break; + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - if (heap->fail == 3) { - GGML_LOG_ERROR("%s: MTLHeap failed to allocate buffer\n", __func__); - break; - } + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap); - if (n_try == 0) { - GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); - break; + if (should_capture) { + [encoder popDebugGroup]; } - const size_t need = heap->need; - - GGML_LOG_INFO("%s: increasing heap size to %zu\n", __func__, need); - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); + if (!res) { break; } } + [encoder endEncoding]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf commit]; } From c2c0f0f7d875a871d51365e034563ea29466cdb0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 16:55:05 +0300 Subject: [PATCH 08/15] cont : fix alignment [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3faa137810e2d..e9fc033eb85e5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -568,7 +568,7 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { heap->need += size_aligned; - if (!heap->fail && heap->need > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { heap->fail = 1; } @@ -2278,11 +2278,13 @@ static bool ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 32); + id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 64*1024); if (!id_src0h) { - //GGML_LOG_ERROR("%s: failed to allocate buffer for cpy, size = %zu, need = %zu, max available = %zu\n", - // __func__, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:32]); - return false; + //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", + // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + return true; + } else { + //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); } if (src0->type == GGML_TYPE_F16) { @@ -4689,7 +4691,7 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; - int n_try = 64; + int n_try = 2; // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread @@ -4816,7 +4818,7 @@ static enum ggml_status ggml_metal_graph_compute( for (int i = 0; i <= n_cb; ++i) { struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - const size_t need = 4*heap->need; + const size_t need = heap->need; //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, 
need, heap->fail, [heap->obj currentAllocatedSize]); From cbb617edc6d981f3a8495f91a1b81739f808757e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Apr 2025 18:42:36 +0300 Subject: [PATCH 09/15] cont : not working .. [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 57 +++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e9fc033eb85e5..9aca9d953a8a5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -563,7 +563,9 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { return true; } -static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, size_t alignment) { +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) { + const size_t alignment = 1024*1024; + const size_t size_aligned = GGML_PAD(size, alignment); heap->need += size_aligned; @@ -1583,7 +1585,8 @@ static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_metal_heap * heap) { + struct ggml_metal_heap * heap, + bool no_compute) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1621,6 +1624,28 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + id h_src0 = nil; + switch (dst->op) { + case GGML_OP_SOFT_MAX: + { + h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0)); + if (!h_src0) { + //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", + // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + return false; + } else { + //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); + } + } break; + default: + { + } break; + } + + if (no_compute) { + return true; + } + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -2278,15 +2303,6 @@ static bool ggml_metal_encode_node( /*.nb3 =*/ nb03, }; - id id_src0h = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), 64*1024); - if (!id_src0h) { - //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", - // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); - return true; - } else { - //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); - } - if (src0->type == GGML_TYPE_F16) { [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; } else { @@ -2294,7 +2310,7 @@ static bool ggml_metal_encode_node( } [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src0h offset:0 atIndex:2]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); @@ -2315,11 +2331,11 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0h offset:0 atIndex:0]; + [encoder setBuffer:h_src0 offset:0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0h offset:0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:1]; } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&args length:sizeof(args) atIndex:3]; @@ -4732,6 +4748,12 @@ static enum ggml_status ggml_metal_graph_compute( } } + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + // the main thread commits the first few commands immediately // cmd_buf[n_cb] { @@ -4824,6 +4846,7 @@ static enum ggml_status ggml_metal_graph_compute( if (heap->fail == 0) { ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); + [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; continue; } @@ -5234,19 +5257,21 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + bool no_compute = false; + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap); + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute); if (should_capture) { [encoder popDebugGroup]; } if (!res) { - break; + no_compute = true; } } From 91d5dc5a2be410478f77fcbcf04ec4e1a3196ffc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Apr 2025 12:27:15 +0300 Subject: [PATCH 10/15] cont : heap allocation now works [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 116 ++++++++++++------------------- 1 file changed, 46 insertions(+), 70 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9aca9d953a8a5..149999cb30723 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { [desc release]; + //GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]); + ggml_metal_heap_reset(heap); return true; } -static id 
ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) { - const size_t alignment = 1024*1024; +static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) { + // note: this is probably more than needed, but just in case + const size_t alignment = 1024; const size_t size_aligned = GGML_PAD(size, alignment); + //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail); + heap->need += size_aligned; + if (no_alloc) { + return nil; + } + if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { heap->fail = 1; } @@ -883,7 +892,7 @@ @implementation GGMLMetalClass for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - // create 1MB heaps per command buffer + // create initial small heaps per command buffer // these can be resized during compute when necessary ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); } @@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + const bool no_alloc = no_compute; + + // heap buffers for temporary data id h_src0 = nil; + switch (dst->op) { case GGML_OP_SOFT_MAX: { - h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0)); - if (!h_src0) { - //GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n", - // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]); + h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); + if (!no_alloc && !h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", + __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); return false; - } else { - //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0)); } } break; default: @@ -4707,8 +4718,6 @@ static enum ggml_status ggml_metal_graph_compute( // number of threads in addition to the main thread const int n_cb = ctx->n_cb; - int n_try = 2; - // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes @@ -4716,7 +4725,6 @@ static enum ggml_status ggml_metal_graph_compute( // // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 - while (n_try-- > 0) { @autoreleasepool { ctx->gf = gf; @@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute( } } - bool retry = false; - - // check heap statuses - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - const size_t need = heap->need; - - //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]); - - if (heap->fail == 0) { - ggml_metal_heap_reset(ctx->cmd_bufs[i].heap); - [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; - - continue; - } - - if (heap->fail == 2) { - GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n); - return GGML_STATUS_ALLOC_FAILED; - } - - if (heap->fail == 3) { - 
GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n); - return GGML_STATUS_ALLOC_FAILED; - } - - //GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need); - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need); - return GGML_STATUS_ALLOC_FAILED; - } - - retry = true; - } - - if (!retry) { - break; - } - - //printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n"); - - if (n_try == 0) { - GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__); - return GGML_STATUS_ALLOC_FAILED; - } - } - return GGML_STATUS_SUCCESS; } @@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - bool no_compute = false; + ggml_metal_heap_reset(heap); for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } + ggml_metal_encode_node(backend, idx, encoder, heap, true); + } + + bool can_compute = true; - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute); + if (heap->need > [heap->obj size]) { + const size_t need = heap->need; - if (should_capture) { - [encoder popDebugGroup]; + if (!ggml_metal_heap_resize(heap, need)) { + GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need); + can_compute = false; } + } + + if (can_compute) { + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false); - if (!res) { - no_compute = true; + if (should_capture) { + [encoder popDebugGroup]; + } + + if (!res) { + break; + } } } From 455691c52fd793141c8d2466660c417ae9d4dec7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Apr 2025 13:32:36 +0300 Subject: [PATCH 11/15] cont : use MTLHeapTypePlacement ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 85 ++++++++++++++++---------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 149999cb30723..42519b7e6768b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,9 +19,6 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 -// max number of buffers that can be allocated on the heap per command buffer -#define GGML_METAL_MAX_HEAP_BUFFERS 64 - #ifndef TARGET_OS_VISION #define TARGET_OS_VISION 0 #endif @@ -472,14 +469,15 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte }; struct ggml_metal_heap { - int n; int fail; + size_t offs; size_t need; id device; id obj; - id bufs[GGML_METAL_MAX_HEAP_BUFFERS]; + + NSMutableArray * bufs; }; static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { @@ -488,7 +486,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; desc.storageMode = MTLStorageModePrivate; desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.type = MTLHeapTypePlacement; desc.size = size; heap->device 
= device; @@ -501,39 +499,35 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte return false; } - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; ++i) { - heap->bufs[i] = nil; - } - [desc release]; + heap->bufs = [[NSMutableArray alloc] init]; + return heap; } +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->fail = 0; + heap->offs = 0; + heap->need = 0; + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; +} + static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { if (heap == nil) { return; } - [heap->obj release]; - - free(heap); -} - -static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->n = 0; - heap->fail = 0; - heap->need = 0; + ggml_metal_heap_reset(heap); - for (int i = 0; i < GGML_METAL_MAX_HEAP_BUFFERS; i++) { - if (heap->bufs[i]) { - [heap->bufs[i] release]; - heap->bufs[i] = nil; - continue; - } + [heap->obj release]; + [heap->bufs release]; - break; - } + free(heap); } static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { @@ -546,7 +540,7 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; desc.storageMode = MTLStorageModePrivate; desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypeAutomatic; // TODO: use MTLHeapTypePlacement + desc.type = MTLHeapTypePlacement; desc.size = size; heap->obj = [heap->device newHeapWithDescriptor:desc]; @@ -571,33 +565,32 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { const size_t size_aligned = GGML_PAD(size, alignment); - //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail); + heap->offs += size_aligned; + heap->need = MAX(heap->need, heap->offs + size_aligned); - heap->need += size_aligned; + //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, offs = %zu, need = %zu\n", __func__, size, size_aligned, offs, heap->offs, heap->need); if (no_alloc) { return nil; } - if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) { + if (!heap->fail && heap->offs + size_aligned > [heap->obj size]) { heap->fail = 1; } - if (!heap->fail && heap->n >= GGML_METAL_MAX_HEAP_BUFFERS) { - heap->fail = 2; - } - if (heap->fail) { return nil; } - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; if (!buf) { heap->fail = 3; return nil; } - heap->bufs[heap->n++] = buf; + [heap->bufs addObject:buf]; + + //GGML_LOG_INFO("%s: allocated buffer, size = %zu, offs = %zu, heap size = %zu, heap used = %zu\n", __func__, size_aligned, offs, [heap->obj size], [heap->obj usedSize]); return buf; } @@ -634,7 +627,6 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - //id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true @@ -1638,13 +1630,16 @@ static bool ggml_metal_encode_node( // heap buffers for temporary data id h_src0 = nil; + // always allocate buffers from the start of the heap for the current node + heap->offs = 0; + switch (dst->op) { case GGML_OP_SOFT_MAX: { 
h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); if (!no_alloc && !h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", - __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); + GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, offs = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", + __func__, idx, ggml_nbytes(src0), heap->offs, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); return false; } } break; @@ -2250,8 +2245,6 @@ static bool ggml_metal_encode_node( { GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - int nth = 32; // SIMD width id pipeline = nil; @@ -4836,6 +4829,12 @@ static enum ggml_status ggml_metal_graph_compute( [next_buffer commit]; } + for (int i = 0; i <= n_cb; ++i) { + struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; + + [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; + } + if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; @@ -5233,6 +5232,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { } } + //GGML_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXX\n"); + if (can_compute) { for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { From 6f413276faa48260f006dada7deb9237e01778b5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 14 Apr 2025 13:48:10 +0300 Subject: [PATCH 12/15] metal : use dynamic MTLHeap allocations ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 331 +++++++++++++++++-------------- 1 file changed, 185 insertions(+), 146 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 42519b7e6768b..a95ad00d10780 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -469,13 +469,13 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte }; struct ggml_metal_heap { - int fail; + int n_unused; // number of times the heap was unused + + int64_t n_alloc; size_t offs; - size_t need; - id device; - id obj; + id obj; NSMutableArray * bufs; }; @@ -489,7 +489,9 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte desc.type = MTLHeapTypePlacement; desc.size = size; - heap->device = device; + heap->n_unused = 0; + heap->n_alloc = 0; + heap->obj = [device newHeapWithDescriptor:desc]; if (!heap->obj) { GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); @@ -507,14 +509,20 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte } static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->fail = 0; heap->offs = 0; - heap->need = 0; + + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } for (id buf in heap->bufs) { [buf release]; } [heap->bufs removeAllObjects]; + + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; } static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { @@ -530,75 +538,10 @@ static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { free(heap); } -static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { - if (heap == nil) { - return false; - } - 
- [heap->obj release]; - - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypePlacement; - desc.size = size; - - heap->obj = [heap->device newHeapWithDescriptor:desc]; - if (!heap->obj) { - GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); - - return false; - } - - [desc release]; - - //GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]); - - ggml_metal_heap_reset(heap); - - return true; -} - -static id ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) { - // note: this is probably more than needed, but just in case - const size_t alignment = 1024; - - const size_t size_aligned = GGML_PAD(size, alignment); - - heap->offs += size_aligned; - heap->need = MAX(heap->need, heap->offs + size_aligned); - - //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, offs = %zu, need = %zu\n", __func__, size, size_aligned, offs, heap->offs, heap->need); - - if (no_alloc) { - return nil; - } - - if (!heap->fail && heap->offs + size_aligned > [heap->obj size]) { - heap->fail = 1; - } - - if (heap->fail) { - return nil; - } - - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (!buf) { - heap->fail = 3; - return nil; - } - - [heap->bufs addObject:buf]; - - //GGML_LOG_INFO("%s: allocated buffer, size = %zu, offs = %zu, heap size = %zu, heap used = %zu\n", __func__, size_aligned, offs, [heap->obj size], [heap->obj usedSize]); - - return buf; -} - struct ggml_metal_command_buffer { id obj; - struct ggml_metal_heap * heap; + struct ggml_metal_mem_pool * mem_pool; }; struct ggml_backend_metal_context { @@ -634,6 +577,154 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) { void * abort_callback_data; }; +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr + +@end + +struct ggml_metal_mem_pool { + id device; + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu\n", __func__, [mem_pool->heaps count]); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", 
__func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + heap_ptr.data = heap; + + GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + ggml_metal_heap_reset(heap_ptr.data); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + + return buf; +} + // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file @@ -884,9 +975,8 @@ @implementation GGMLMetalClass for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - // create initial small heaps per command buffer - // these can be resized during compute when necessary - ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32); + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1280,7 +1370,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { // ctx->cmd_bufs[i].obj 
is auto released - ggml_metal_heap_free(ctx->cmd_bufs[i].heap); + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } dispatch_release(ctx->d_queue); @@ -1586,8 +1676,7 @@ static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, - struct ggml_metal_heap * heap, - bool no_compute) { + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1625,32 +1714,7 @@ static bool ggml_metal_encode_node( GGML_ABORT("unsupported op"); } - const bool no_alloc = no_compute; - - // heap buffers for temporary data - id h_src0 = nil; - - // always allocate buffers from the start of the heap for the current node - heap->offs = 0; - - switch (dst->op) { - case GGML_OP_SOFT_MAX: - { - h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc); - if (!no_alloc && !h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, offs = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n", - __func__, idx, ggml_nbytes(src0), heap->offs, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail); - return false; - } - } break; - default: - { - } break; - } - - if (no_compute) { - return true; - } + ggml_metal_mem_pool_clear(mem_pool); const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; @@ -2288,6 +2352,12 @@ static bool ggml_metal_encode_node( // cpy to tmp buffer in MTLHeap + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -4749,12 +4819,6 @@ static enum ggml_status ggml_metal_graph_compute( } } - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - } - // the main thread commits the first few commands immediately // cmd_buf[n_cb] { @@ -4829,12 +4893,6 @@ static enum ggml_status ggml_metal_graph_compute( [next_buffer commit]; } - for (int i = 0; i <= n_cb; ++i) { - struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap; - - [heap->obj setPurgeableState:MTLPurgeableStateEmpty]; - } - if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; @@ -5201,7 +5259,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id cmd_buf = ctx->cmd_bufs[cb_idx].obj; - struct ggml_metal_heap * heap = ctx->cmd_bufs[cb_idx].heap; id encoder = [cmd_buf computeCommandEncoder]; @@ -5215,40 +5272,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - ggml_metal_heap_reset(heap); + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); for (int idx = node_start; idx < node_end; ++idx) { - ggml_metal_encode_node(backend, idx, encoder, heap, true); - } - - bool can_compute = true; - - if (heap->need > [heap->obj size]) { - const size_t need = heap->need; - - if (!ggml_metal_heap_resize(heap, need)) { - GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need); - can_compute = false; + if (should_capture) { + 
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - } - - //GGML_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXX\n"); - - if (can_compute) { - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; - } - const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); - if (should_capture) { - [encoder popDebugGroup]; - } + if (should_capture) { + [encoder popDebugGroup]; + } - if (!res) { - break; - } + if (!res) { + break; } } From 69f7b0920ddcfe16c468b471dfb0c3c3939f99ae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Apr 2025 14:59:16 +0300 Subject: [PATCH 13/15] metal : add comments --- ggml/src/ggml-metal/ggml-metal.m | 116 +++++++++++++++++++------------ 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a95ad00d10780..b2e810f10c094 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -468,16 +468,21 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + struct ggml_metal_heap { int n_unused; // number of times the heap was unused - int64_t n_alloc; + int64_t n_alloc; // total number of buffer allocations in this heap across all computes + // current offset in the heap - we reset this after each node in order to reuse the memory size_t offs; id obj; - NSMutableArray * bufs; + NSMutableArray * bufs; // the currently allocated MTLBuffer objects in this heap }; static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { @@ -511,6 +516,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { heap->offs = 0; + // count how many graph computes the heap ended up being unused if ([heap->bufs count] > 0) { heap->n_unused = 0; } else { @@ -522,6 +528,8 @@ static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { } [heap->bufs removeAllObjects]; + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; } @@ -538,45 +546,6 @@ static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { free(heap); } -struct ggml_metal_command_buffer { - id obj; - - struct ggml_metal_mem_pool * mem_pool; -}; - -struct ggml_backend_metal_context { - id device; - id queue; - - dispatch_queue_t d_queue; - - struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; - - // capture state - bool capture_next_compute; - bool capture_started; - - id capture_scope; - - // command buffer state - int n_cb; // number of extra threads used to submit the command buffers - int n_nodes_0; // number of nodes submitted by the main thread - int n_nodes_1; // remaining number of nodes submitted by the n_cb threads - int n_nodes_per_cb; - - struct ggml_cgraph * gf; - - // the callback given to the thread pool - void (^encode_async)(size_t ith); - - // n_cb command buffers + 1 used by the main thread - struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; - - // abort ggml_metal_graph_compute if callback 
returns true - ggml_abort_callback abort_callback; - void * abort_callback_data; -}; - @interface ggml_metal_heap_ptr : NSObject @property (nonatomic, assign) struct ggml_metal_heap * data; @@ -584,12 +553,17 @@ @interface ggml_metal_heap_ptr : NSObject @end @implementation ggml_metal_heap_ptr - @end +// +// ggml_metal_mem_pool +// + struct ggml_metal_mem_pool { id device; + int n_heaps; // total number of heaps ever created (including those that were removed) + NSMutableArray * heaps; NSMutableArray * heaps_to_remove; }; @@ -597,6 +571,8 @@ @implementation ggml_metal_heap_ptr static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + mem_pool->n_heaps = 0; + mem_pool->heaps = [[NSMutableArray alloc] init]; mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; @@ -604,7 +580,7 @@ @implementation ggml_metal_heap_ptr } static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { - GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu\n", __func__, [mem_pool->heaps count]); + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); size_t size_all = 0; size_t size_cur = 0; @@ -679,6 +655,9 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { struct ggml_metal_heap * heap = ptr.data; if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc if ([heap->bufs count] == 0) { [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; } @@ -702,11 +681,15 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); - heap_ptr.data = heap; + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } - GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); - ggml_metal_heap_reset(heap_ptr.data); + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; @@ -721,10 +704,51 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { [heap->bufs addObject:buf]; [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; return buf; } +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + +struct ggml_backend_metal_context { + id device; + id queue; + + dispatch_queue_t d_queue; + + struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + id capture_scope; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted 
by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph * gf; + + // the callback given to the thread pool + void (^encode_async)(size_t ith); + + // n_cb command buffers + 1 used by the main thread + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + + // abort ggml_metal_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void * abort_callback_data; +}; + // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file From e5b7f7e2b54a2c879854842b7cf17e8ac3468d1e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Apr 2025 15:01:13 +0300 Subject: [PATCH 14/15] metal : disable softmax use of mem_pool ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index b2e810f10c094..027044f1ccb6f 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -2374,6 +2374,8 @@ static bool ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 // cpy to tmp buffer in MTLHeap id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); @@ -2382,6 +2384,8 @@ static bool ggml_metal_encode_node( return false; } + offs_src0 = 0; + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -2415,6 +2419,9 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; +#else + id h_src0 = id_src0; +#endif // softmax ggml_metal_kargs_soft_max args = { @@ -2429,11 +2436,11 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:h_src0 offset:0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:h_src0 offset:0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&args length:sizeof(args) atIndex:3]; From dc4c04816ee95181ce3c8426f91e10ca4c29fe7c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Apr 2025 16:15:19 +0300 Subject: [PATCH 15/15] metal : final touches --- ggml/src/ggml-metal/ggml-metal.m | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 027044f1ccb6f..da5857542752c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -473,16 +473,19 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte // struct ggml_metal_heap { - int n_unused; // number of times the heap was unused + // number of times the heap was unused + int n_unused; - int64_t n_alloc; // total number of buffer allocations in this heap across all computes + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; // current offset in the heap - we reset this after each node in order to reuse the memory size_t offs; + // the currently allocated MTLBuffer objects in this heap id obj; - NSMutableArray * bufs; // the currently allocated MTLBuffer objects in this 
heap
+    NSMutableArray * bufs;
 };
 
 static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
@@ -586,7 +589,7 @@ static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
     size_t size_cur = 0;
 
     for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
-        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
+        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
         GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc);
         GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused);
         GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);