@@ -386,14 +386,18 @@ struct LLM_TN {
 // ggml helpers
 //
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads, llama_abort_callback abort_callback) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
         plan.work_data = buf.data();
     }
 
+    if (abort_callback) {
+        plan.abort_callback = abort_callback;
+    }
+
     ggml_graph_compute(graph, &plan);
 }
 
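The helper change above is the core of the patch: the callback is copied into the ggml_cplan, and ggml_graph_compute consults it while working through the graph. Below is a minimal sketch, not part of the diff, assuming llama_abort_callback is compatible with ggml's plan-level callback type bool (*)(void * data), where returning true aborts the compute. Since this hunk forwards only the function pointer (plan.abort_callback_data is never set), the sketch keys off a global flag instead of the data argument:

    #include <atomic>

    // Assumed typedef; the real definition is not shown in this excerpt.
    typedef bool (*llama_abort_callback)(void * data);

    // Set from another thread to request cancellation.
    static std::atomic<bool> g_abort_requested{false};

    // Returning true tells ggml_graph_compute to stop early. The data
    // argument is ignored on purpose: this hunk never sets
    // plan.abort_callback_data, so it may well be null.
    static bool my_abort_callback(void * /*data*/) {
        return g_abort_requested.load();
    }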
@@ -2902,10 +2906,10 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads, nullptr);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads, nullptr);
 #endif
 
 #if GGML_USE_MPI
@@ -5198,7 +5202,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         struct ggml_cgraph gf = ggml_build_forward(r);
 
-        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads, nullptr);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -5240,6 +5244,8 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.abort_callback              =*/ nullptr,
+        /*.abort_callback_user_data    =*/ nullptr,
         /*.low_vram                    =*/ false,
         /*.mul_mat_q                   =*/ false,
         /*.f16_kv                      =*/ true,
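The two new default-params fields mirror the helper's new parameter. A hedged usage sketch follows (field names taken from the hunk above; note that every call site in this excerpt still passes nullptr, so the plumbing from context params to the helper is not visible here):

    struct llama_context_params params = llama_context_default_params();
    params.abort_callback           = my_abort_callback; // sketch from earlier
    params.abort_callback_user_data = nullptr;           // unused by my_abort_callback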
@@ -5776,7 +5782,7 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1, nullptr);
 
         ggml_free(cpy_ctx);
 
@@ -5886,7 +5892,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1, nullptr);
 
         ggml_free(cpy_ctx);
     }
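Putting it together, an assumed end-to-end flow using the llama.h API of this vintage; how an aborted compute is reported to the caller is not defined by the hunks shown here:

    struct llama_context * ctx = llama_new_context_with_model(model, params);

    // Elsewhere, e.g. from a UI thread: g_abort_requested.store(true);
    const int ret = llama_eval(ctx, tokens, n_tokens, /*n_past=*/0, /*n_threads=*/4);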