vulkan: use timestamp queries for GGML_VULKAN_PERF (#13817) · ggml-org/llama.cpp@bef8176 · GitHub
[go: up one dir, main page]

Skip to content

Commit bef8176

Browse files
authored
vulkan: use timestamp queries for GGML_VULKAN_PERF (#13817)
Also change it to be controlled by an env var rather than cmake flag
1 parent 34b7c04 commit bef8176

File tree

3 files changed

+69
-21
lines changed

3 files changed

+69
-21
lines changed

ggml/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
177177
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
178178
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
179179
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
180-
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
181180
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
182181
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
183182
option(GGML_KOMPUTE "ggml: use Kompute" OFF)

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,6 @@ if (Vulkan_FOUND)
109109
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
110110
endif()
111111

112-
if (GGML_VULKAN_PERF)
113-
add_compile_definitions(GGML_VULKAN_PERF)
114-
endif()
115-
116112
if (GGML_VULKAN_VALIDATE)
117113
add_compile_definitions(GGML_VULKAN_VALIDATE)
118114
endif()

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "ggml-vulkan.h"
22
#include <vulkan/vulkan_core.h>
3-
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
3+
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
44
#include <chrono>
55
#include "ggml-cpu.h"
66
#endif
@@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
184184
#ifdef GGML_VULKAN_MEMORY_DEBUG
185185
class vk_memory_logger;
186186
#endif
187-
#ifdef GGML_VULKAN_PERF
188187
class vk_perf_logger;
189-
#endif
190188
static void ggml_vk_destroy_buffer(vk_buffer& buf);
191189

192190
static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -442,9 +440,11 @@ struct vk_device_struct {
442440
#ifdef GGML_VULKAN_MEMORY_DEBUG
443441
std::unique_ptr<vk_memory_logger> memory_logger;
444442
#endif
445-
#ifdef GGML_VULKAN_PERF
443+
444+
// for GGML_VK_PERF_LOGGER
446445
std::unique_ptr<vk_perf_logger> perf_logger;
447-
#endif
446+
vk::QueryPool query_pool;
447+
uint32_t num_queries;
448448

449449
~vk_device_struct() {
450450
VK_LOG_DEBUG("destroy device " << name);
@@ -828,8 +828,6 @@ class vk_memory_logger {
828828
#define VK_LOG_MEMORY(msg) ((void) 0)
829829
#endif // GGML_VULKAN_MEMORY_DEBUG
830830

831-
#if defined(GGML_VULKAN_PERF)
832-
833831
class vk_perf_logger {
834832
public:
835833
void print_timings() {
@@ -839,7 +837,7 @@ class vk_perf_logger {
839837
for (const auto& time : t.second) {
840838
total += time;
841839
}
842-
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
840+
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
843841
}
844842

845843
timings.clear();
@@ -868,7 +866,6 @@ class vk_perf_logger {
868866
private:
869867
std::map<std::string, std::vector<uint64_t>> timings;
870868
};
871-
#endif // GGML_VULKAN_PERF
872869

873870
struct ggml_backend_vk_context {
874871
std::string name;
@@ -958,6 +955,8 @@ struct vk_instance_t {
958955
static bool vk_instance_initialized = false;
959956
static vk_instance_t vk_instance;
960957

958+
static bool vk_perf_logger_enabled = false;
959+
961960
#ifdef GGML_VULKAN_CHECK_RESULTS
962961
static size_t vk_skip_checks;
963962
static size_t vk_output_tensor;
@@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
27572756
#ifdef GGML_VULKAN_MEMORY_DEBUG
27582757
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
27592758
#endif
2760-
#ifdef GGML_VULKAN_PERF
2761-
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
2762-
#endif
2759+
if (vk_perf_logger_enabled) {
2760+
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
2761+
}
27632762

27642763
size_t dev_num = vk_instance.device_indices[idx];
27652764

@@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
35473546
vk_instance.instance = vk::createInstance(instance_create_info);
35483547
vk_instance_initialized = true;
35493548

3549+
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
3550+
35503551
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
35513552

35523553
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8885,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
88858886

88868887
ctx->tensor_ctxs[node_idx] = compute_ctx;
88878888

8888-
#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
8889+
#if defined(GGML_VULKAN_CHECK_RESULTS)
88898890
// Force context reset on each node so that each tensor ends up in its own context
88908891
// and can be run and compared to its CPU equivalent separately
88918892
last_node = true;
@@ -9505,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
95059506
bool first_node_in_batch = true; // true if next node will be first node in a batch
95069507
int submit_node_idx = 0; // index to first node in a batch
95079508

9509+
vk_context compute_ctx;
9510+
if (vk_perf_logger_enabled) {
9511+
// allocate/resize the query pool
9512+
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
9513+
if (ctx->device->query_pool) {
9514+
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
9515+
}
9516+
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
9517+
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
9518+
query_create_info.queryCount = cgraph->n_nodes + 100;
9519+
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
9520+
ctx->device->num_queries = query_create_info.queryCount;
9521+
}
9522+
9523+
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
9524+
9525+
GGML_ASSERT(ctx->compute_ctx.expired());
9526+
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9527+
ctx->compute_ctx = compute_ctx;
9528+
ggml_vk_ctx_begin(ctx->device, compute_ctx);
9529+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
9530+
}
9531+
95089532
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
95099533
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
95109534
// (and scaled down based on model size, so smaller models submit earlier).
@@ -9532,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
95329556

95339557
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
95349558

9559+
if (vk_perf_logger_enabled) {
9560+
if (ctx->compute_ctx.expired()) {
9561+
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9562+
ctx->compute_ctx = compute_ctx;
9563+
ggml_vk_ctx_begin(ctx->device, compute_ctx);
9564+
} else {
9565+
compute_ctx = ctx->compute_ctx.lock();
9566+
}
9567+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
9568+
}
9569+
95359570
if (enqueued) {
95369571
++submitted_nodes;
95379572

@@ -9553,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
95539588
}
95549589
}
95559590

9556-
#ifdef GGML_VULKAN_PERF
9557-
ctx->device->perf_logger->print_timings();
9558-
#endif
9591+
if (vk_perf_logger_enabled) {
9592+
// End the command buffer and submit/wait
9593+
GGML_ASSERT(!ctx->compute_ctx.expired());
9594+
compute_ctx = ctx->compute_ctx.lock();
9595+
ggml_vk_ctx_end(compute_ctx);
9596+
9597+
ggml_vk_submit(compute_ctx, ctx->device->fence);
9598+
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
9599+
ctx->device->device.resetFences({ ctx->device->fence });
9600+
9601+
// Get the results and pass them to the logger
9602+
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
9603+
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
9604+
for (int i = 0; i < cgraph->n_nodes; i++) {
9605+
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
9606+
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
9607+
}
9608+
}
9609+
9610+
ctx->device->perf_logger->print_timings();
9611+
}
95599612

95609613
ggml_vk_graph_cleanup(ctx);
95619614

0 commit comments

Comments (0)