1
1
#include "ggml-vulkan.h"
2
2
#include <vulkan/vulkan_core.h>
3
- #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined( GGML_VULKAN_CHECK_RESULTS)
3
+ #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
4
4
#include <chrono>
5
5
#include "ggml-cpu.h"
6
6
#endif
@@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
184
184
#ifdef GGML_VULKAN_MEMORY_DEBUG
185
185
class vk_memory_logger;
186
186
#endif
187
- #ifdef GGML_VULKAN_PERF
188
187
class vk_perf_logger;
189
- #endif
190
188
static void ggml_vk_destroy_buffer(vk_buffer& buf);
191
189
192
190
static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -442,9 +440,11 @@ struct vk_device_struct {
442
440
#ifdef GGML_VULKAN_MEMORY_DEBUG
443
441
std::unique_ptr<vk_memory_logger> memory_logger;
444
442
#endif
445
- #ifdef GGML_VULKAN_PERF
443
+
444
+ // Perf logger state; only used when the GGML_VK_PERF_LOGGER environment variable is set at runtime
446
445
std::unique_ptr<vk_perf_logger> perf_logger;
447
- #endif
446
+ vk::QueryPool query_pool;
447
+ uint32_t num_queries;
448
448
449
449
~vk_device_struct() {
450
450
VK_LOG_DEBUG("destroy device " << name);
@@ -828,8 +828,6 @@ class vk_memory_logger {
828
828
#define VK_LOG_MEMORY(msg) ((void) 0)
829
829
#endif // GGML_VULKAN_MEMORY_DEBUG
830
830
831
- #if defined(GGML_VULKAN_PERF)
832
-
833
831
class vk_perf_logger {
834
832
public:
835
833
void print_timings() {
@@ -839,7 +837,7 @@ class vk_perf_logger {
839
837
for (const auto& time : t.second) {
840
838
total += time;
841
839
}
842
- std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms " << std::endl;
840
+ std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us " << std::endl;
843
841
}
844
842
845
843
timings.clear();
@@ -868,7 +866,6 @@ class vk_perf_logger {
868
866
private:
869
867
std::map<std::string, std::vector<uint64_t>> timings;
870
868
};
871
- #endif // GGML_VULKAN_PERF
872
869
873
870
struct ggml_backend_vk_context {
874
871
std::string name;
@@ -958,6 +955,8 @@ struct vk_instance_t {
958
955
static bool vk_instance_initialized = false;
959
956
static vk_instance_t vk_instance;
960
957
958
+ static bool vk_perf_logger_enabled = false;
959
+
961
960
#ifdef GGML_VULKAN_CHECK_RESULTS
962
961
static size_t vk_skip_checks;
963
962
static size_t vk_output_tensor;
@@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
2757
2756
#ifdef GGML_VULKAN_MEMORY_DEBUG
2758
2757
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
2759
2758
#endif
2760
- #ifdef GGML_VULKAN_PERF
2761
- device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
2762
- #endif
2759
+ if (vk_perf_logger_enabled) {
2760
+ device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
2761
+ }
2763
2762
2764
2763
size_t dev_num = vk_instance.device_indices[idx];
2765
2764
@@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
3547
3546
vk_instance.instance = vk::createInstance(instance_create_info);
3548
3547
vk_instance_initialized = true;
3549
3548
3549
+ vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
3550
+
3550
3551
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
3551
3552
3552
3553
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8885,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8885
8886
8886
8887
ctx->tensor_ctxs[node_idx] = compute_ctx;
8887
8888
8888
- #if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
8889
+ #if defined(GGML_VULKAN_CHECK_RESULTS)
8889
8890
// Force context reset on each node so that each tensor ends up in its own context
8890
8891
// and can be run and compared to its CPU equivalent separately
8891
8892
last_node = true;
@@ -9505,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9505
9506
bool first_node_in_batch = true; // true if next node will be first node in a batch
9506
9507
int submit_node_idx = 0; // index to first node in a batch
9507
9508
9509
+ vk_context compute_ctx;
9510
+ if (vk_perf_logger_enabled) {
9511
+ // (Re)create the timestamp query pool when it holds fewer than n_nodes + 1 queries
9512
+ if (ctx->device->num_queries < cgraph->n_nodes + 1) {
9513
+ if (ctx->device->query_pool) {
9514
+ ctx->device->device.destroyQueryPool(ctx->device->query_pool);
9515
+ }
9516
+ VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
9517
+ query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
9518
+ query_create_info.queryCount = cgraph->n_nodes + 100;
9519
+ ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
9520
+ ctx->device->num_queries = query_create_info.queryCount;
9521
+ }
9522
+
9523
+ ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
9524
+
9525
+ GGML_ASSERT(ctx->compute_ctx.expired());
9526
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9527
+ ctx->compute_ctx = compute_ctx;
9528
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
9529
+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
9530
+ }
9531
+
9508
9532
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
9509
9533
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
9510
9534
// (and scaled down based on model size, so smaller models submit earlier).
@@ -9532,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9532
9556
9533
9557
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
9534
9558
9559
+ if (vk_perf_logger_enabled) {
9560
+ if (ctx->compute_ctx.expired()) {
9561
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9562
+ ctx->compute_ctx = compute_ctx;
9563
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
9564
+ } else {
9565
+ compute_ctx = ctx->compute_ctx.lock();
9566
+ }
9567
+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
9568
+ }
9569
+
9535
9570
if (enqueued) {
9536
9571
++submitted_nodes;
9537
9572
@@ -9553,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9553
9588
}
9554
9589
}
9555
9590
9556
- #ifdef GGML_VULKAN_PERF
9557
- ctx->device->perf_logger->print_timings();
9558
- #endif
9591
+ if (vk_perf_logger_enabled) {
9592
+ // End the command buffer and submit/wait
9593
+ GGML_ASSERT(!ctx->compute_ctx.expired());
9594
+ compute_ctx = ctx->compute_ctx.lock();
9595
+ ggml_vk_ctx_end(compute_ctx);
9596
+
9597
+ ggml_vk_submit(compute_ctx, ctx->device->fence);
9598
+ VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
9599
+ ctx->device->device.resetFences({ ctx->device->fence });
9600
+
9601
+ // Read back the n_nodes + 1 timestamps and log, for each non-empty node, the delta to the previous timestamp scaled by timestampPeriod
9602
+ std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
9603
+ ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
9604
+ for (int i = 0; i < cgraph->n_nodes; i++) {
9605
+ if (!ggml_vk_is_empty(cgraph->nodes[i])) {
9606
+ ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
9607
+ }
9608
+ }
9609
+
9610
+ ctx->device->perf_logger->print_timings();
9611
+ }
9559
9612
9560
9613
ggml_vk_graph_cleanup(ctx);
9561
9614
0 commit comments