19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
+ // compile residency set support only when building for macOS with an SDK >= 15.0 (runtime support is checked separately via @available)
23
+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24
+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25
+ #endif
26
+
22
27
#define UNUSED (x ) (void )(x)
23
28
24
29
// globals
39
44
40
45
bool has_simdgroup_reduction;
41
46
bool has_simdgroup_mm;
47
+ bool has_residency_sets;
42
48
bool has_bfloat;
43
49
bool use_bfloat;
44
50
48
54
/* .mtl_device_ref_count =*/ 0 ,
49
55
/* .has_simdgroup_reduction =*/ false ,
50
56
/* .has_simdgroup_mm =*/ false ,
57
+ /* .has_residency_sets =*/ false ,
51
58
/* .has_bfloat =*/ false ,
52
59
/* .use_bfloat =*/ false ,
53
60
/* .name =*/ " " ,
65
72
66
73
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily: MTLGPUFamilyApple7];
67
74
75
+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
76
+ ctx->has_residency_sets = true ;
77
+ #endif
78
+
68
79
ctx->has_bfloat = [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
69
80
ctx->has_bfloat |= [ctx->mtl_device supportsFamily: MTLGPUFamilyApple6];
70
81
@@ -483,6 +494,11 @@ @implementation GGMLMetalClass
483
494
GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484
495
485
496
ctx->queue = [device newCommandQueue ];
497
+ if (ctx->queue == nil ) {
498
+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
499
+ return NULL ;
500
+ }
501
+
486
502
ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487
503
488
504
id <MTLLibrary > metal_library;
@@ -649,6 +665,7 @@ @implementation GGMLMetalClass
649
665
650
666
GGML_LOG_INFO (" %s : simdgroup reduction = %s \n " , __func__, ctx_dev->has_simdgroup_reduction ? " true" : " false" );
651
667
GGML_LOG_INFO (" %s : simdgroup matrix mul. = %s \n " , __func__, ctx_dev->has_simdgroup_mm ? " true" : " false" );
668
+ GGML_LOG_INFO (" %s : has residency sets = %s \n " , __func__, ctx_dev->has_residency_sets ? " true" : " false" );
652
669
GGML_LOG_INFO (" %s : has bfloat = %s \n " , __func__, ctx_dev->has_bfloat ? " true" : " false" );
653
670
GGML_LOG_INFO (" %s : use bfloat = %s \n " , __func__, ctx_dev->use_bfloat ? " true" : " false" );
654
671
GGML_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx_dev->mtl_device .hasUnifiedMemory ? " true" : " false" );
@@ -1035,8 +1052,60 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1052
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1053
int n_buffers;
1037
1054
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1055
+
1056
+ // optional MTLResidencySet
1057
+ id rset;
1038
1058
};
1039
1059
1060
// rset init
//
// Creates the optional MTLResidencySet for a buffer context, adds all of the context's Metal buffers
// to it, commits the set and requests residency.
//
//   ctx    - buffer context; ctx->buffers[0 .. ctx->n_buffers) are added, ctx->rset receives the set
//   device - Metal device used to create the residency set
//
// Returns false only when the residency set could not be created (ctx->rset is nil in that case).
// When residency sets are compiled out, or are not available at runtime, ctx->rset is set to nil
// and the function returns true.
static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer_context * ctx, id<MTLDevice> device) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // must start out as nil: the out-parameter is only guaranteed to be written on failure
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is only needed for creation; this file uses manual release, so drop it here
        // to avoid leaking it on both the success and the failure path
        [desc release];

        // decide success from the returned object, not from the error pointer
        if (ctx->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        // allocations only take effect after commit
        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(device);
#endif

    // residency sets are not in use for this buffer
    ctx->rset = nil;

    return true;
}
1093
+
1094
// rset free
//
// Tears down the residency set stored in ctx->rset, if there is one: residency is ended,
// all allocations are removed from the set and the set object is released.
// Does nothing when residency sets are compiled out or no set was created.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        // nothing to do for buffers that never got a residency set
        if (!ctx->rset) {
            return;
        }

        [ctx->rset endResidency];
        [ctx->rset removeAllAllocations];
        [ctx->rset release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1108
+
1040
1109
// finds the Metal buffer that contains the tensor data on the GPU device
1041
1110
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
1042
1111
// Metal buffer based on the host memory pointer
@@ -4086,7 +4155,7 @@ static enum ggml_status ggml_metal_graph_compute(
4086
4155
// the main thread commits the first few commands immediately
4087
4156
// command_buffer[n_cb]
4088
4157
{
4089
- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4158
+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
4090
4159
ctx->command_buffers [n_cb] = command_buffer;
4091
4160
4092
4161
[command_buffer enqueue ];
@@ -4096,7 +4165,7 @@ static enum ggml_status ggml_metal_graph_compute(
4096
4165
// prepare the rest of the command buffers asynchronously
4097
4166
// command_buffer[0.. n_cb)
4098
4167
for (int cb_idx = 0 ; cb_idx < n_cb; ++cb_idx) {
4099
- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4168
+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
4100
4169
ctx->command_buffers [cb_idx] = command_buffer;
4101
4170
4102
4171
// always enqueue the first two command buffers
@@ -4176,6 +4245,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4176
4245
for (int i = 0 ; i < ctx->n_buffers ; i++) {
4177
4246
[ctx->buffers[i].metal release ];
4178
4247
}
4248
+
4249
+ ggml_backend_metal_buffer_rset_free (ctx);
4179
4250
ggml_backend_metal_device_rel (buffer->buft ->device ->context );
4180
4251
4181
4252
if (ctx->owned ) {
@@ -4284,7 +4355,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4284
4355
size_aligned += (size_page - (size_aligned % size_page));
4285
4356
}
4286
4357
4287
- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4358
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4359
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4288
4360
4289
4361
ctx->all_data = ggml_metal_host_malloc (size_aligned);
4290
4362
ctx->all_size = size_aligned;
@@ -4307,7 +4379,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4307
4379
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
4308
4380
GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
4309
4381
free (ctx);
4310
- ggml_backend_metal_device_rel (buft->device ->context );
4382
+ ggml_backend_metal_device_rel (ctx_dev);
4383
+ return NULL ;
4384
+ }
4385
+
4386
+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4387
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4388
+ free (ctx);
4389
+ ggml_backend_metal_device_rel (ctx_dev);
4311
4390
return NULL ;
4312
4391
}
4313
4392
@@ -4400,7 +4479,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4400
4479
size_aligned += (size_page - (size_aligned % size_page));
4401
4480
}
4402
4481
4403
- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4482
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4483
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4404
4484
4405
4485
// the buffer fits into the max buffer size allowed by the device
4406
4486
if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4533,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4453
4533
}
4454
4534
}
4455
4535
4536
+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4537
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4538
+ free (ctx);
4539
+ ggml_backend_metal_device_rel (ctx_dev);
4540
+ return NULL ;
4541
+ }
4542
+
4456
4543
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4457
4544
}
4458
4545
@@ -4766,6 +4853,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4766
4853
}
4767
4854
}
4768
4855
4856
+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4857
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4858
+ free (ctx);
4859
+ ggml_backend_metal_device_rel (ctx_dev);
4860
+ return NULL ;
4861
+ }
4862
+
4769
4863
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4770
4864
}
4771
4865
0 commit comments