From 4d56810a839d9ee556f68107790ddc8e77591675 Mon Sep 17 00:00:00 2001 From: Changyeon Kim Date: Sun, 11 Aug 2024 19:24:42 +0900 Subject: [PATCH 1/6] llava: Add ACC OP for GPU acceleration to the Vulkan backend in the LLAVA CLIP model. - The CLIP model now prioritizes the Vulkan backend over the CPU when Vulkan is available. - A GGML_OP_ACC shader has been added. - The encoding performance of the CLIP model improved from 4.2s on the CPU to 0.9s on the GPU. Signed-off-by: Changyeon Kim --- examples/llava/clip.cpp | 8 +++++ ggml/src/ggml-vulkan.cpp | 34 +++++++++++++++++++ ggml/src/vulkan-shaders/acc.comp | 22 ++++++++++++ .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 4 +++ 4 files changed, 68 insertions(+) create mode 100644 ggml/src/vulkan-shaders/acc.comp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 54aa822c90d29..25fe90ac2b460 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -20,6 +20,10 @@ #include "ggml-cann.h" #endif +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -1125,6 +1129,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using CANN backend\n", __func__); #endif +#ifdef GGML_USE_VULKAN + new_clip->backend = ggml_backend_vk_init(0); + LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); +#endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 86732837254f0..e5f605d8135e1 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -177,6 +177,7 @@ struct vk_device_struct { vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_acc_f32; vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16; vk_pipeline pipeline_mul_f32; vk_pipeline pipeline_div_f32; @@ -1647,6 +1648,8 @@
static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -3931,6 +3934,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; + case GGML_OP_ACC: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_acc_f32; + } + return nullptr; case GGML_OP_ADD: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_add_f32; @@ -4474,6 +4482,23 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, }); } +static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + const uint32_t d_offset = ((extra->offset + dst->view_offs) 
% ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + d_offset, + 0.0f, 0.0f, + }); +} + static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); @@ -5758,6 +5783,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_REPEAT: case GGML_OP_GET_ROWS: case GGML_OP_ADD: + case GGML_OP_ACC: case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_CONCAT: @@ -5803,6 +5829,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_REPEAT: ggml_vk_repeat(ctx, compute_ctx, src0, node); + break; + case GGML_OP_ACC: + ggml_vk_acc(ctx, compute_ctx, src0, src1, node); + break; case GGML_OP_GET_ROWS: ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); @@ -5939,6 +5969,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * switch (tensor->op) { case GGML_OP_ADD: + case GGML_OP_ACC: case 
GGML_OP_GET_ROWS: case GGML_OP_MUL: case GGML_OP_DIV: @@ -6655,6 +6686,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_ADD: + case GGML_OP_ACC: case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_CONCAT: @@ -7109,6 +7141,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); + } else if (tensor->op == GGML_OP_ACC) { + tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_GROUP_NORM) { diff --git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/vulkan-shaders/acc.comp new file mode 100644 index 0000000000000..5ca9140278fd2 --- /dev/null +++ b/ggml/src/vulkan-shaders/acc.comp @@ -0,0 +1,22 @@ +#version 450 + +#include "types.comp" +#include "generic_binary_head.comp" + +void main() { + const uint idx = get_idx(); + if (idx >= p.ne) { + return; + } + + const uint src1_i = src1_idx(idx); + const uint oz = src1_i / p.nb02; + const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; + const uint ox = src1_i % p.nb01; + + if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); + } else { + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); + } +} \ No newline at end of file diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index a792e203b273a..2ee997af8d5c9 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ 
b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -368,6 +368,10 @@ void process_shaders(std::vector>& tasks) { string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); })); From b9fc6784d03cd4da724852d0be3b99b27321dc4e Mon Sep 17 00:00:00 2001 From: Changyeon Kim Date: Sun, 11 Aug 2024 21:28:19 +0900 Subject: [PATCH 2/6] fix-up coding style. Signed-off-by: Changyeon Kim --- ggml/src/vulkan-shaders/acc.comp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/vulkan-shaders/acc.comp index 5ca9140278fd2..8566129eebdf9 100644 --- a/ggml/src/vulkan-shaders/acc.comp +++ b/ggml/src/vulkan-shaders/acc.comp @@ -19,4 +19,5 @@ void main() { } else { data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); } -} \ No newline at end of file +} + From e6e018dafe885182dc099dc5b96cb42f0ca5726f Mon Sep 17 00:00:00 2001 From: Changyeon Kim Date: Mon, 12 Aug 2024 22:19:10 +0900 Subject: [PATCH 3/6] Fix-up the missing initial parameter to resolve the compilation warning. 
Signed-off-by: Changyeon Kim --- ggml/src/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index e5f605d8135e1..42c478196b315 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -4495,7 +4495,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, d_offset, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } From 12ab18bba06ae1d2b7785c98c5a187aa74016c5f Mon Sep 17 00:00:00 2001 From: Changyeon Kim Date: Thu, 15 Aug 2024 22:10:41 +0900 Subject: [PATCH 4/6] [fix] Add missing parameters. 
Signed-off-by: Changyeon Kim --- ggml/src/ggml-vulkan.cpp | 9 +++++++-- ggml/src/vulkan-shaders/acc.comp | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 42c478196b315..d45ea48a2d6be 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -4489,13 +4489,18 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // byte offset converted to float32 elements + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, { (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, d_offset, - 0.0f, 0.0f, 0, + 0.0f, 0.0f, offset, }); } diff
--git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/vulkan-shaders/acc.comp index 8566129eebdf9..4c8739efee227 100644 --- a/ggml/src/vulkan-shaders/acc.comp +++ b/ggml/src/vulkan-shaders/acc.comp @@ -4,12 +4,13 @@ #include "generic_binary_head.comp" void main() { - const uint idx = get_idx(); + const uint idx = gl_GlobalInvocationID.x; if (idx >= p.ne) { return; } - const uint src1_i = src1_idx(idx); + const uint offset = p.param3; + const uint src1_i = idx - offset; const uint oz = src1_i / p.nb02; const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; const uint ox = src1_i % p.nb01; From 065a9d8438eabf77b7c5ead73c89efa6b71d686b Mon Sep 17 00:00:00 2001 From: Changyeon Kim Date: Tue, 20 Aug 2024 21:26:09 +0900 Subject: [PATCH 5/6] [fix] Use nb1 and nb2 for dst. Signed-off-by: Changyeon Kim --- ggml/src/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 1c0eed1cee701..4a2244d935912 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -4487,7 +4487,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) 
dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size, d_offset, 0.0f, 0.0f, offset, }, dryrun); From a2d1f44335b268988b7667cccf947886c902f219 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 20 Aug 2024 20:00:00 +0200 Subject: [PATCH 6/6] Fix check results ggml_acc call --- ggml/src/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 4a2244d935912..32fda32a879ba 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -7033,7 +7033,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_ACC) { - tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone); + tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_GROUP_NORM) {