From 73beb8ddabdb37f522be27aa8429f4fe7478fbc8 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 21 Aug 2024 14:31:59 +0200 Subject: [PATCH 1/7] Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early. --- ggml/src/ggml-vulkan.cpp | 105 +++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 32fda32a879ba..3dd242df18a55 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -785,6 +785,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { + if (fence) { + ctx->q->queue.submit({}, fence); + } return; } VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); @@ -5614,11 +5617,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){ +bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); + +// Returns true if node has enqueued work into the queue, false otherwise +// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool submit){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { - return; + return false; } VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); @@ -5635,7 +5642,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - return; + return false; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: @@ -5645,7 +5652,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_UNARY_OP_TANH: break; default: - return; + return false; } break; case GGML_OP_REPEAT: @@ -5680,7 +5687,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; GGML_ABORT("fatal error"); - return; + return false; } vk_context compute_ctx; @@ -5772,7 +5779,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); break; default: - return; + return false; } break; case GGML_OP_DIAG_MASK_INF: @@ -5816,11 +5823,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; default: - return; + return false; } if (dryrun) { - return; + return false; } ctx->tensor_ctxs[node_idx] = compute_ctx; @@ -5831,14 +5838,26 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (last_node) { + if (submit) { ggml_vk_ctx_end(compute_ctx); compute_ctx->exit_tensor_idx = node_idx; ctx->compute_ctx.reset(); + + bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false); + if (!ok) { + if (node->op == GGML_OP_UNARY) { + std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << 
ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; + } + else { + std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + } + } + } + return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5910,9 +5929,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); -#ifdef GGML_VULKAN_PERF - std::chrono::steady_clock::time_point start; -#endif // GGML_VULKAN_PERF + VkFence fence = use_fence ? ctx->fence : VkFence{}; // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { @@ -5921,20 +5938,13 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(cpy.dst, cpy.src, cpy.n); } -#ifdef GGML_VULKAN_PERF - start = std::chrono::steady_clock::now(); -#endif // GGML_VULKAN_PERF - - ggml_vk_submit(subctx, ctx->fence); + ggml_vk_submit(subctx, fence); } - if (tensor_idx == subctx->exit_tensor_idx) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + if (tensor_idx != 0 && tensor_idx == subctx->exit_tensor_idx) { + ggml_vk_submit(subctx, fence); -#ifdef GGML_VULKAN_PERF - auto duration = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); - ctx->device->perf_logger->log_timing(tensor, duration.count()); -#endif // GGML_VULKAN_PERF + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); ctx->device->device.resetFences({ ctx->fence }); @@ -6426,7 +6436,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false); } ggml_vk_preallocate_buffers(ctx); ggml_pipeline_allocate_descriptor_sets(ctx->device); @@ -6441,33 +6451,40 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen // Reserve tensor context space for all nodes ctx->tensor_ctxs.resize(cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false); - } + bool first_node_in_batch = true; // true if next node will be first node in a batch + int submit_node_idx = 0; // index to first node in a batch + // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution + constexpr int submit_count = 50; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (ggml_vk_is_empty(node)) { - continue; + if (first_node_in_batch) { + submit_node_idx = i; } + + bool submit = ((i % submit_count) == 0) || (i == last_node); + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, submit); - bool ok = ggml_vk_compute_forward(ctx, node, i); - if (!ok) { - if (node->op == GGML_OP_UNARY) { - std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << 
std::endl; - } else { - std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; - } + if (first_node_in_batch && enqueued) { + first_node_in_batch = false; } -#ifdef GGML_VULKAN_CHECK_RESULTS - else { - ggml_vk_check_results_1(node); + if (submit) { + first_node_in_batch = true; } -#endif - GGML_ASSERT(ok); } + // wait for work on the GPU to complete work + bool ok = ggml_vk_compute_forward(ctx, cgraph->nodes[cgraph->n_nodes-1], cgraph->n_nodes - 1, true); + + if (!ok) { + std::cerr << __func__ << ": error: failed to enqueue cmdbuffer" << std::endl; + } +#ifdef GGML_VULKAN_CHECK_RESULTS + else { + ggml_vk_check_results_1(node); + } +#endif + GGML_ASSERT(ok); + #ifdef GGML_VULKAN_PERF ctx->device->perf_logger->print_timings(); #endif From fb7befd045146e3f859a711cffc841e19167dc41 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 21 Aug 2024 19:08:01 +0200 Subject: [PATCH 2/7] fix compile issues --- ggml/src/ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 3dd242df18a55..916f05c0b09eb 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5617,7 +5617,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. @@ -5929,7 +5929,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); - VkFence fence = use_fence ? ctx->fence : VkFence{}; + vk::Fence fence = use_fence ? ctx->fence : vk::Fence{}; // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { From 5e2ce2471004132245d39a0c6e1e759a482c5f94 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Thu, 22 Aug 2024 15:48:24 +0200 Subject: [PATCH 3/7] Fix issues where the last submit wasn't executed or handled properly. 
--- ggml/src/ggml-vulkan.cpp | 49 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 916f05c0b09eb..ccf73cae1d37a 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -2479,7 +2479,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; for (auto& buffer : descriptor_buffer_infos) { - std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; + std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); @@ -5621,7 +5621,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool submit){ +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { @@ -5838,9 +5838,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (submit) { + if (submit || last_node) { ggml_vk_ctx_end(compute_ctx); - compute_ctx->exit_tensor_idx = node_idx; + + // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward + if (last_node) { + compute_ctx->exit_tensor_idx = node_idx_begin; + } + else { + compute_ctx->exit_tensor_idx = -1; + } + ctx->compute_ctx.reset(); bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false); @@ -5929,6 +5937,11 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); + // always wait for the GPU work to be done for the last submit + if (tensor_idx == subctx->exit_tensor_idx) { + use_fence = true; + } + vk::Fence fence = use_fence ? 
ctx->fence : vk::Fence{}; // Only run if ctx hasn't been submitted yet @@ -5941,13 +5954,13 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * ggml_vk_submit(subctx, fence); } - if (tensor_idx != 0 && tensor_idx == subctx->exit_tensor_idx) { - ggml_vk_submit(subctx, fence); - + if (use_fence) { VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + } + if (tensor_idx == subctx->exit_tensor_idx) { // Do staging buffer copies for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -6436,7 +6449,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false); } ggml_vk_preallocate_buffers(ctx); ggml_pipeline_allocate_descriptor_sets(ctx->device); @@ -6460,9 +6473,10 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen if (first_node_in_batch) { submit_node_idx = i; } - - bool submit = ((i % submit_count) == 0) || (i == last_node); - bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, submit); + + // TODO probably it's better to move the submit conter to ggml_vk_build_graph since a lot of nodes might not contain actual vulkan work + bool submit = i && ((i % submit_count) == 0); + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); if (first_node_in_batch && enqueued) { first_node_in_batch = false; @@ -6472,19 +6486,6 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen } } - // wait for work on the GPU to complete work - bool ok = ggml_vk_compute_forward(ctx, cgraph->nodes[cgraph->n_nodes-1], cgraph->n_nodes - 1, true); - - if (!ok) { - std::cerr << __func__ << ": error: failed to enqueue cmdbuffer" << std::endl; - } -#ifdef GGML_VULKAN_CHECK_RESULTS - else { - ggml_vk_check_results_1(node); - } -#endif - GGML_ASSERT(ok); - #ifdef GGML_VULKAN_PERF ctx->device->perf_logger->print_timings(); #endif From 2812b35ce42992aa9e6693f01a8c157dfdad4400 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Fri, 23 Aug 2024 09:01:46 +0200 Subject: [PATCH 4/7] remove trailing whitespace --- ggml/src/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index ccf73cae1d37a..84b591cc90737 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5843,7 +5843,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward if (last_node) { - compute_ctx->exit_tensor_idx = node_idx_begin; + compute_ctx->exit_tensor_idx = node_idx_begin; } else { compute_ctx->exit_tensor_idx = -1; From 2b2bc1ff8b167de8047be9d8d72eebf1d4e7721d Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Tue, 27 Aug 2024 13:47:33 +0200 Subject: [PATCH 5/7] Repair GGML_VULKAN_CHECK_RESULTS --- ggml/src/ggml-vulkan.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 
84b591cc90737..5f5fb8a6bfcab 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5931,10 +5931,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); -#ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(tensor); -#endif - vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); // always wait for the GPU work to be done for the last submit @@ -5942,22 +5938,28 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * use_fence = true; } - vk::Fence fence = use_fence ? ctx->fence : vk::Fence{}; - // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_0(tensor); + use_fence = true; +#endif + // Do staging buffer copies for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(subctx, fence); - } + ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{}); - if (use_fence) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + if (use_fence) { + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - ctx->device->device.resetFences({ ctx->fence }); + ctx->device->device.resetFences({ ctx->fence }); + } +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_1(tensor); +#endif } if (tensor_idx == subctx->exit_tensor_idx) { From 28506e51d6cc62d0f132f0f37dc7bfaf1ba51930 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Tue, 27 Aug 2024 13:56:13 +0200 Subject: [PATCH 6/7] Increase submit counter only if actual work has been submitted and increase submit count to 100. 
--- ggml/src/ggml-vulkan.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 5f5fb8a6bfcab..411a13c8a5eea 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -6470,21 +6470,27 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen int submit_node_idx = 0; // index to first node in a batch // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution - constexpr int submit_count = 50; + constexpr int submit_count = 100; + int submitted_nodes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } - // TODO probably it's better to move the submit conter to ggml_vk_build_graph since a lot of nodes might not contain actual vulkan work - bool submit = i && ((i % submit_count) == 0); + bool submit = (submitted_nodes >= submit_count) || (i == last_node); bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); - if (first_node_in_batch && enqueued) { - first_node_in_batch = false; + if (enqueued) { + ++submitted_nodes; + + if (first_node_in_batch) { + first_node_in_batch = false; + } } + if (submit) { first_node_in_batch = true; + submitted_nodes = 0; } } From 9dc02233909a978109da2a474db265fe264c5613 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Sun, 8 Sep 2024 11:19:34 +0200 Subject: [PATCH 7/7] Fix some nodes are not checked with GGML_VULKAN_CHECK_RESULTS enabled. --- ggml/src/ggml-vulkan.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 411a13c8a5eea..e76d58a630870 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -6478,14 +6478,18 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen } bool submit = (submitted_nodes >= submit_count) || (i == last_node); + + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); if (enqueued) { ++submitted_nodes; +#ifndef GGML_VULKAN_CHECK_RESULTS if (first_node_in_batch) { first_node_in_batch = false; } +#endif } if (submit) {
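
The pattern this series converges on can be summarized independently of the ggml plumbing: record the graph's nodes into command buffers in batches of roughly submit_count nodes, submit each batch to the queue as soon as it is recorded (without a fence) so that recording of the next batch overlaps with GPU execution of the previous one, and attach a fence only to the submit containing the last node so the CPU waits exactly once per graph. The sketch below illustrates that pattern in isolation; it is not the ggml code itself — run_graph_batched(), record_node_commands() and node_count are hypothetical stand-ins for what ggml_vk_build_graph() and ggml_vk_compute_forward() do, and resource recycling is deliberately simplified.

#include <vulkan/vulkan.hpp>
#include <cstdint>

// Sketch: batch graph nodes into command buffers and submit each batch early,
// fencing only the last one. record_node_commands() is a hypothetical helper.
void run_graph_batched(vk::Device device, vk::Queue queue, vk::CommandPool pool,
                       int node_count, int submit_count = 100) {
    vk::Fence fence = device.createFence({});
    vk::CommandBuffer cmd;
    bool recording = false;
    int nodes_in_batch = 0;

    for (int i = 0; i < node_count; ++i) {
        if (!recording) {
            // Allocate and begin a fresh command buffer for this batch.
            cmd = device.allocateCommandBuffers(
                      { pool, vk::CommandBufferLevel::ePrimary, 1 })[0];
            cmd.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
            recording = true;
        }

        // record_node_commands(cmd, i);  // hypothetical: record work for node i
        ++nodes_in_batch;

        const bool last_node = (i == node_count - 1);
        if (nodes_in_batch >= submit_count || last_node) {
            cmd.end();
            vk::SubmitInfo si{};
            si.commandBufferCount = 1;
            si.pCommandBuffers    = &cmd;
            // Earlier batches are submitted without a fence so recording of the
            // next batch overlaps with GPU execution; only the final batch
            // signals the fence.
            queue.submit(si, last_node ? fence : vk::Fence{});
            nodes_in_batch = 0;
            recording = false;
            // NOTE: per-batch command buffers are intentionally not freed here;
            // a real implementation recycles them once the fence has signalled.
        }
    }

    // Wait once, for the last submit only.
    (void) device.waitForFences(fence, VK_TRUE, UINT64_MAX);
    device.resetFences(fence);
    device.destroyFence(fence);
}

Only the fence of the final submit is waited on in this sketch; when an intermediate batch must be observed on the CPU as well (as with GGML_VULKAN_CHECK_RESULTS), the same fence can be attached to that submit and waited on immediately, which is what patch 5 arranges inside ggml_vk_compute_forward().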