From 0bdffcb33fe583f956bcb378ec3f2e15422c44b3 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 18 Dec 2024 12:14:04 -0600 Subject: [PATCH] [ET-VK] Replace Uniform buffers with push constants for binary op (#7346) Pull Request resolved: https://github.com/pytorch/executorch/pull/7230 This diff replaces uniform buffers with push constants for binary op in the Vulkan backend of Executorch. The changes include updating the GLSL code to use push constants instead of uniform buffers and updating the C++ code to pass the sizes as push constants to the shader. ghstack-source-id: 258575398 @exported-using-ghexport Differential Revision: [D66853542](https://our.internmc.facebook.com/intern/diff/D66853542/) Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> --- .../vulkan/runtime/graph/ops/DispatchNode.h | 9 ++++++--- .../runtime/graph/ops/glsl/binary_op.glsl | 13 ++++++++----- .../vulkan/runtime/graph/ops/impl/BinaryOp.cpp | 17 ++++++++++------- .../vulkan/test/vulkan_compute_api_test.cpp | 11 ++++------- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h index 958637218e..7d04f7714e 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h @@ -46,12 +46,15 @@ class PushConstantDataInfo { payload_.attr = attr; } - explicit PushConstantDataInfo(const void* data, uint32_t dataLen) + explicit PushConstantDataInfo( + const void* data, + uint32_t dataLen, + uint32_t pushConstantLen = 0) : tensorUniformData(nullptr) { VK_CHECK_COND( dataLen <= 16, "Single push constant data size must be <= 16 bytes"); - payload_.dataSize = dataLen; - memcpy(payload_.data, data, payload_.dataSize); + payload_.dataSize = pushConstantLen ? 
pushConstantLen : dataLen; + memcpy(payload_.data, data, dataLen); } /* diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index be0e1bfa20..62aa2f810d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,11 +19,6 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "other_sizes")} -${layout_declare_ubo(B, "ivec2", "broadcast_params")} -${layout_declare_ubo(B, "float", "alpha")} #include "broadcasting_utils.h" #include "indexing_utils.h" @@ -40,6 +35,14 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; + ivec4 other_sizes; + ivec2 broadcast_params; + float alpha; +}; + void main() { const ivec3 lpos = ivec3(gl_GlobalInvocationID); const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 33f73cd6da..7e88982aae 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -67,7 +67,10 @@ void add_binary_op_node( alpha_val = graph.extract_scalar(alpha); } - const utils::ivec2 broadcast_params = create_broadcast_params(*t_in1, *t_in2); + const struct BinaryOpsParams { + const utils::ivec2 broadcast_params; + const float alpha_val; + } binary_ops_params{create_broadcast_params(*t_in1, *t_in2), alpha_val}; 
std::string kernel_name("binary_"); kernel_name.reserve(kShaderNameReserve); @@ -83,16 +86,16 @@ void add_binary_op_node( {{out, vkapi::MemoryAccessType::WRITE}, {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->sizes_ubo(), - t_in1->sizes_ubo(), - t_in2->sizes_ubo(), - graph.create_params_buffer(broadcast_params), - graph.create_params_buffer(alpha_val)}, + {}, // Specialization Constants {t_out->hashed_layout(), t_in1->hashed_layout(), t_in2->hashed_layout()}, // Resizing Logic resize_binary_op_node, - {})); + {}, + {{graph.sizes_pc_of(out), + graph.sizes_pc_of(arg1), + graph.sizes_pc_of(arg2), + PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}})); } #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 77a0458d90..604ad26588 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1601,9 +1601,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for arithmetic shader output c - expected_vma_allocation_count += 3; + // no new allocations if binary op uses push constants EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef d = graph.add_input_tensor( @@ -1624,17 +1622,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for arithmetic shader output e - expected_vma_allocation_count += 3; + // no new allocations if binary op uses push constants EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef out = {}; 
out.value = e; out.staging = graph.set_output_tensor(out.value); + // +1: staging buffer for the input tensor // +1: staging buffer for the output tensor - expected_vma_allocation_count += 1; + expected_vma_allocation_count += 2; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare();