Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early. #9118

Merged · 7 commits · Sep 8, 2024
132 changes: 81 additions & 51 deletions ggml/src/ggml-vulkan.cpp
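The core of the patch is in `ggml_backend_vk_graph_compute` below: instead of recording every node and submitting the whole graph at once, it now submits a partial command buffer after roughly every 100 enqueued nodes (`submit_count`) and only waits on a fence once the final batch is in flight, so command-buffer recording on the CPU overlaps with GPU execution. The following is a minimal, self-contained sketch of that pattern; it is an editor's illustration with hypothetical helpers (`record_node`, `submit_batch`, `wait_for_gpu`), not the backend's real API.

```cpp
#include <cstdio>

// Hypothetical stand-ins for the real backend calls (ggml_vk_build_graph,
// ggml_vk_submit, waitForFences); they only print what would happen.
static void record_node(int node_idx) {
    std::printf("record node %d into the current command buffer\n", node_idx);
}
static void submit_batch(int first, int last) {
    std::printf("submit nodes [%d, %d] to the GPU\n", first, last);
}
static void wait_for_gpu() {
    std::printf("wait on the fence for the final batch\n");
}

int main() {
    const int n_nodes = 350;           // pretend graph size
    constexpr int submit_count = 100;  // batch size used by the patch
    int batch_start = 0;               // first node of the current batch
    int recorded = 0;                  // nodes recorded since the last submit

    for (int i = 0; i < n_nodes; ++i) {
        record_node(i);                // CPU-side command recording
        ++recorded;

        const bool last = (i == n_nodes - 1);
        if (recorded >= submit_count || last) {
            submit_batch(batch_start, i);  // GPU starts executing while recording continues
            if (last) {
                wait_for_gpu();            // only the final submit is synchronized
            }
            batch_start = i + 1;
            recorded = 0;
        }
    }
    return 0;
}
```

Batching keeps submission overhead low while still letting the GPU start on earlier nodes before the CPU has finished recording the rest of the graph.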
@@ -785,6 +785,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s

static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
ctx->q->queue.submit({}, fence);
}
return;
}
VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@@ -2476,7 +2479,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
for (auto& buffer : descriptor_buffer_infos) {
std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -5614,11 +5617,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}

static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);

// Returns true if the node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan to overlap command buffer creation and GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

if (ggml_is_empty(node) || extra == nullptr) {
return;
return false;
}

VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5635,7 +5642,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return;
return false;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
@@ -5645,7 +5652,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_UNARY_OP_TANH:
break;
default:
return;
return false;
}
break;
case GGML_OP_REPEAT:
@@ -5680,7 +5687,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
return;
return false;
}

vk_context compute_ctx;
@@ -5772,7 +5779,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
break;
default:
return;
return false;
}
break;
case GGML_OP_DIAG_MASK_INF:
@@ -5816,11 +5823,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

break;
default:
return;
return false;
}

if (dryrun) {
return;
return false;
}

ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5831,14 +5838,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
last_node = true;
#endif

if (last_node) {
if (submit || last_node) {
ggml_vk_ctx_end(compute_ctx);
compute_ctx->exit_tensor_idx = node_idx;

// TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
if (last_node) {
compute_ctx->exit_tensor_idx = node_idx_begin;
}
else {
compute_ctx->exit_tensor_idx = -1;
}

ctx->compute_ctx.reset();

bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
}
else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
}
}

}
return true;
}

static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
ggml_tensor_extra_gpu * extra = nullptr;

switch (tensor->op) {
@@ -5904,40 +5931,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *

VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
#endif

vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();

#ifdef GGML_VULKAN_PERF
std::chrono::steady_clock::time_point start;
#endif // GGML_VULKAN_PERF
// always wait for the GPU work to be done for the last submit
if (tensor_idx == subctx->exit_tensor_idx) {
use_fence = true;
}

// Only run if ctx hasn't been submitted yet
if (!subctx->seqs.empty()) {
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
use_fence = true;
#endif

// Do staging buffer copies
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}

#ifdef GGML_VULKAN_PERF
start = std::chrono::steady_clock::now();
#endif // GGML_VULKAN_PERF
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});

if (use_fence) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");

ggml_vk_submit(subctx, ctx->fence);
ctx->device->device.resetFences({ ctx->fence });
}
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_1(tensor);
#endif
}

if (tensor_idx == subctx->exit_tensor_idx) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");

#ifdef GGML_VULKAN_PERF
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
ctx->device->perf_logger->log_timing(tensor, duration.count());
#endif // GGML_VULKAN_PERF

ctx->device->device.resetFences({ ctx->fence });

// Do staging buffer copies
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6426,7 +6451,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
}
ggml_vk_preallocate_buffers(ctx);
ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6441,31 +6466,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
// Reserve tensor context space for all nodes
ctx->tensor_ctxs.resize(cgraph->n_nodes);

for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
}
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch

// submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
constexpr int submit_count = 100;
int submitted_nodes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];

if (ggml_vk_is_empty(node)) {
continue;
if (first_node_in_batch) {
submit_node_idx = i;
}

bool ok = ggml_vk_compute_forward(ctx, node, i);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
} else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
bool submit = (submitted_nodes >= submit_count) || (i == last_node);

bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);

if (enqueued) {
++submitted_nodes;

#ifndef GGML_VULKAN_CHECK_RESULTS
if (first_node_in_batch) {
first_node_in_batch = false;
}
#endif
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
ggml_vk_check_results_1(node);

if (submit) {
first_node_in_batch = true;
submitted_nodes = 0;
}
#endif
GGML_ASSERT(ok);
}

#ifdef GGML_VULKAN_PERF
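For reference on the synchronization side of the patch: the reworked `ggml_vk_compute_forward` takes a `use_fence` flag, forces it on for the batch whose `tensor_idx` equals `exit_tensor_idx`, and only submits a context that has not been submitted yet. The sketch below is an editor's illustration of just that decision logic, with a hypothetical `SubCtx` type and a `submitted` flag standing in for the real `vk_context` and its `seqs.empty()` check.

```cpp
#include <cstdio>

// Hypothetical context type; the real backend stores a vk_context per batch and
// checks seqs.empty() instead of a bool flag.
struct SubCtx {
    int  exit_tensor_idx;    // closing node of the final batch, or -1 for intermediate batches
    bool submitted = false;  // stands in for the "not yet submitted" check
};

static void compute_forward(SubCtx &subctx, int tensor_idx, bool use_fence) {
    // The last submit always waits for the GPU, regardless of the caller's flag.
    if (tensor_idx == subctx.exit_tensor_idx) {
        use_fence = true;
    }
    if (!subctx.submitted) {
        subctx.submitted = true;
        std::printf("submit batch %s a fence\n", use_fence ? "with" : "without");
        if (use_fence) {
            std::printf("waitForFences + resetFences\n");
        }
    }
}

int main() {
    SubCtx mid{ /*exit_tensor_idx=*/-1 };     // intermediate batch
    SubCtx last{ /*exit_tensor_idx=*/299 };   // final batch ends at node 299
    compute_forward(mid, 100, /*use_fence=*/false);   // submitted, no CPU wait
    compute_forward(last, 299, /*use_fence=*/false);  // fence forced on, CPU waits here
    return 0;
}
```

Note that when `GGML_VULKAN_CHECK_RESULTS` is defined, the patch forces `use_fence = true` on every submit so each node's results can be validated.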