From a396b47663161072b9755517d74e9cea16dc1c60 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Dec 2024 13:31:46 -0600 Subject: [PATCH] [ET-VK] Replace Uniform buffers with push constants for copy op Pull Request resolved: https://github.com/pytorch/executorch/pull/7267 This diff replaces uniform buffers with push constants for copy op in the Vulkan backend of Executorch. The changes include updating the GLSL code to use push constants instead of uniform buffers and updating the C++ code to pass the sizes as push constants to the shader. ghstack-source-id: 259127151 @exported-using-ghexport Differential Revision: [D66890851](https://our.internmc.facebook.com/intern/diff/D66890851/) Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> --- .../graph/ops/glsl/copy_channel_offset.glsl | 23 +++--- .../runtime/graph/ops/glsl/copy_offset.glsl | 6 +- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 70 +++++++++---------- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index 862ccdad30..39aa9b11a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -18,17 +18,16 @@ ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -layout(set = 0, binding = 5) uniform PRECISION restrict CopyArgs { +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; // Operates on (x, y, z) logical extents. - ivec3 range; + // channel_range is stored in range.w + ivec4 range; // Analogus to range variable in copy. It defines the # of channel being // copied. - int channel_range; - ivec3 dst_offset; - int dst_channel_offset; + // dst channel offset is stored in dst_offset.w + ivec4 dst_offset; int src_channel_offset; }; @@ -47,11 +46,11 @@ void main() { // Note: Unlike other shaders, the range is often not equal to the destination // texture extent. const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range))) { + if (any(greaterThanEqual(lpos, range.xyz))) { return; } - const ivec3 out_lpos = lpos + dst_offset; + const ivec3 out_lpos = lpos + dst_offset.xyz; const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); @@ -61,12 +60,12 @@ void main() { ivec4 in_tidx = out_tidx; for (int i=0; i<4; i++) { - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_channel_offset + i; + in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; // Handle the partial update for begining of channel in an existing tensor. // If the source channel index is below zero or exceeds the range, we skip // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= channel_range)) { + if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { continue; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 3dbc59e041..a42a592762 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -17,7 +17,11 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +layout(push_constant) uniform restrict Block { + ivec3 range; + ivec3 src_offset; + ivec3 dst_offset; +}; #include "indexing_utils.h" diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b98b2c504d..69378524af 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,16 +33,6 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); add_storage_type_suffix(kernel_name, *t_out); - const struct Block final { - alignas(16) ivec3 range; - alignas(16) ivec3 src_offset; - alignas(16) ivec3 dst_offset; - } offset_params{ - range, - src_offset, - dst_offset, - }; - auto shader = VK_KERNEL_FROM_STR(kernel_name); graph.execute_nodes().emplace_back(new DispatchNode( @@ -56,11 +46,18 @@ void add_copy_offset_node( {in, vkapi::kRead}, }, // Parameter buffers - { - graph.create_params_buffer(offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + { + PushConstantDataInfo(&range, sizeof(range), sizeof(utils::ivec4)), + PushConstantDataInfo( + &src_offset, sizeof(src_offset), sizeof(utils::ivec4)), + PushConstantDataInfo( + &dst_offset, sizeof(dst_offset), sizeof(utils::ivec4)), + })); } void add_copy_channel_offset_node( @@ -128,28 +125,23 @@ void add_copy_channel_offset_node( // The shader combines the global invocation id and the dst_offset to get // the actual coordinate. - ivec3 dst_offset{ + const ivec3 dst_offset{ 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - uvec3 global_size{ + const uvec3 global_size{ utils::safe_downcast(dim_at(in_sizes)), utils::safe_downcast(dim_at(in_sizes)), utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - uvec3 local_size = graph.create_local_wg_size(global_size); - - const struct Block final { - ivec3 range; - int32_t channel_range; - ivec3 dst_offset; - int32_t dst_channel_offset; - int32_t src_channel_offset; - } channel_offset_params{ - utils::make_ivec3(global_size), - channel_range, - dst_offset, - dst_channel_offset, - src_channel_offset, - }; + const uvec3 local_size = graph.create_local_wg_size(global_size); + + const utils::ivec4 range_params = { + static_cast(global_size[0]), + static_cast(global_size[1]), + static_cast(global_size[2]), + channel_range}; + + const utils::ivec4 offset_params = { + dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -165,13 +157,17 @@ void add_copy_channel_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - { - t_out->sizes_ubo(), - t_in->sizes_ubo(), - graph.create_params_buffer(channel_offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + {graph.sizes_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo(&range_params, sizeof(range_params)), + PushConstantDataInfo(&offset_params, sizeof(offset_params)), + PushConstantDataInfo( + &src_channel_offset, sizeof(src_channel_offset))})); } }