From f6a87acdaab7018525cc55c2bedf2e7ca7a25408 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Mon, 9 Dec 2024 22:17:53 -0800
Subject: [PATCH] [ET-VK] Adding UniformData struct in vTensor class to store
 uniform data, which will be stored using shared ptr and can be shared with
 push constants.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7222

This diff adds a new struct called `UniformData` to the `vTensor` class to
store uniform data, which is held via a shared pointer so that it can be
shared with push constants. The `UniformData` struct contains the tensor's
sizes, strides, and logical limits, as well as its number of elements.

The diff also adds an `Attribute` enum to the tensor class to enumerate the
attributes supplied to a dispatch, and a `write_attribute` function to
`UniformData` that writes a given attribute's data into a caller-provided
memory region, bounds-checked against the destination size.

ghstack-source-id: 257227242
@exported-using-ghexport

Differential Revision: [D66733611](https://our.internmc.facebook.com/intern/diff/D66733611/)

Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
---
 .../vulkan/runtime/api/containers/Tensor.cpp | 96 +++++++++++++------
 .../vulkan/runtime/api/containers/Tensor.h   | 62 ++++++++++--
 2 files changed, 124 insertions(+), 34 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index d79e2a95fb..21b0ee4b17 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -7,6 +7,7 @@
  */
 
 #include
+#include
 
 namespace vkcompute {
 namespace api {
@@ -446,11 +447,10 @@ vTensor::vTensor(
       dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes, dim_order_)),
-      numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
-      unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
+      unsqueezed_strides_{
+          unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
-      logical_limits_{{0, 0, 0}},
       uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
       uniforms_size_(0),
@@ -467,6 +467,11 @@ vTensor::vTensor(
           padded_sizes_,
           dtype_,
           allocate_memory) {
+  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      sizes_,
+      unsqueezed_strides_,
+      {{0, 0, 0}},
+      static_cast<size_t>(utils::multiply_integers(sizes_))});
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 
@@ -494,11 +499,9 @@ vTensor::vTensor(
       dim_order_(),
       axis_map_(default_axis_map()),
       strides_(),
-      numel_(utils::multiply_integers(sizes_)),
       padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)),
       unsqueezed_strides_(),
       padded_numel_(utils::multiply_integers(padded_sizes_)),
-      logical_limits_(),
       uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
       uniforms_size_(0),
@@ -508,6 +511,11 @@ vTensor::vTensor(
       logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Construct Tensor storage
       storage_(context, image) {
+  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      sizes_,
+      {0, 0, 0, 0},
+      {{0, 0, 0}},
+      static_cast<size_t>(utils::multiply_integers(sizes_))});
   set_logical_limits(storage_.image_extents_);
 }
 
@@ -519,13 +527,11 @@ vTensor::vTensor(vTensor& other)
       dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
       axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
       strides_(other.strides_.begin(), other.strides_.end()),
-      numel_(other.numel_),
       padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
       unsqueezed_strides_{
           other.unsqueezed_strides_.begin(),
           other.unsqueezed_strides_.end()},
       padded_numel_(other.padded_numel_),
-      logical_limits_{other.logical_limits_},
       uniforms_(),
       // Empty initialize Utility Uniform Buffers
       uniforms_size_(0),
@@ -534,7 +540,9 @@ vTensor::vTensor(vTensor& other)
       numel_uniform_offset_(kUniformOffsetUnset),
       logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
-      storage_(other.storage_) {}
+      storage_(other.storage_) {
+  uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
+}
 
 vTensor::vTensor(
     vTensor& other,
@@ -548,11 +556,10 @@ vTensor::vTensor(
       dim_order_(dim_order.begin(), dim_order.end()),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes_, dim_order_)),
-      numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
-      unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
+      unsqueezed_strides_{
+          unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
-      logical_limits_(other.logical_limits_),
       uniforms_(),
       // Empty initialize Utility Uniform Buffers
       uniforms_size_(0),
@@ -562,14 +569,45 @@ vTensor::vTensor(
       logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
+  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      sizes_,
+      unsqueezed_strides_,
+      {other.logical_limits()},
+      static_cast<size_t>(utils::multiply_integers(sizes_))});
+
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "new dim order provided is invalid");
   VK_CHECK_COND(
-      offset_numel + numel_ <= other.numel(),
+      offset_numel + numel() <= other.numel(),
       "Tensor alias cannot access more elements than available in the original"
       "tensor");
 }
 
+uint32_t vTensor::UniformData::write_attribute(
+    void* dst,
+    const uint32_t dst_offset,
+    const uint32_t max_dst_size,
+    const Attribute attr) {
+#define WRITE_ATTRIBUTE_CASE(enum_name, member_name)                        \
+  case vTensor::Attribute::enum_name: {                                     \
+    VK_CHECK_COND(                                                          \
+        (dst_offset + sizeof(member_name)) <= max_dst_size,                 \
+        "Attempting to write tensor attribute outside data boundary.");     \
+    memcpy((uint8_t*)dst + dst_offset, &member_name, sizeof(member_name));  \
+    return sizeof(member_name);                                             \
+  }
+  switch (attr) {
+    WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
+    WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
+    WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
+    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
+    default:
+      VK_THROW("Invalid Attribute");
+  }
+#undef WRITE_ATTRIBUTE_CASE
+  return 0;
+}
+
 vkapi::VulkanImage& vTensor::image(
     vkapi::PipelineBarrier& pipeline_barrier,
     const vkapi::PipelineStageFlags stage) & {
@@ -601,9 +639,9 @@ vkapi::VulkanBuffer& vTensor::buffer(
 }
 
 void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
-  logical_limits_.limits[0] = image_extents[axis_map_.at(0)];
-  logical_limits_.limits[1] = image_extents[axis_map_.at(1)];
-  logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
+  uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)];
+  uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)];
+  uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)];
 }
 
 utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
@@ -661,7 +699,7 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
         "Uniform data allocation has exceeded Tensor uniform buffer size");
     logical_limits_uniform_offset_ = uniforms_size_;
     uniforms_size_ += kSizePerUniform;
-    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
+    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
   }
   return vkapi::BufferBindInfo(
       uniforms_.buffer(), logical_limits_uniform_offset_);
@@ -677,7 +715,7 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() {
         "Uniform data allocation has exceeded Tensor uniform buffer size");
     numel_uniform_offset_ = uniforms_size_;
     uniforms_size_ += kSizePerUniform;
-    uniforms_.update(numel_, numel_uniform_offset_);
+    uniforms_.update(numel(), numel_uniform_offset_);
   }
   return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
 }
@@ -687,10 +725,10 @@ size_t vTensor::staging_buffer_numel() const {
   const bool int8_supported =
       storage_.context_->adapter_ptr()->has_full_int8_buffers_support();
   if (is_int8 && !int8_supported) {
-    return utils::align_up_4(numel_);
+    return utils::align_up_4(numel());
   }
   if (storage_type() == utils::kBuffer) {
-    return numel_;
+    return numel();
   }
   return padded_numel_;
 }
@@ -720,30 +758,32 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
 
 void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
-  numel_ = utils::multiply_integers(sizes_);
+  uniform_data_->numel = utils::multiply_integers(sizes_);
   padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_);
-  unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
+  unsqueezed_strides_ = unsqueeze_strides(strides_, numel());
   padded_numel_ = utils::multiply_integers(padded_sizes_);
 
+  // Update uniform data if it has been modified
+  uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
+  uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_);
+
   // Calculate the image extents that would have been used to allocate a texture
   // with the current sizes, and use that to set the logical limits.
   set_logical_limits(
       calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
 
   if (sizes_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
+    uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
   }
   if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(
-        utils::make_whcn_ivec4(unsqueezed_strides_),
-        unsqueezed_strides_offset_);
+    uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_);
   }
   if (numel_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(numel_, numel_uniform_offset_);
+    uniforms_.update(numel(), numel_uniform_offset_);
   }
   if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
+    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
   }
 }
 
@@ -796,6 +836,8 @@ void vTensor::virtual_clone(const vTensor& other) {
   dim_order_ = other.dim_order_;
   axis_map_ = other.axis_map_;
   packed_dim_ = other.packed_dim_;
+
+  *uniform_data_ = *other.get_uniform_data();
 }
 
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index 2dfe846958..3e51be6f94 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -229,6 +229,46 @@ class vTensor final {
   vTensor(vTensor&& other) = default;
   vTensor& operator=(vTensor&& other) = default;
 
+  enum class Attribute : uint8_t {
+    SIZES,
+    STRIDES,
+    LOGICAL_LIMITS,
+    NUMEL,
+  };
+
+  class UniformData {
+    utils::ivec4 sizes_v;
+    utils::ivec4 strides_v;
+    // See the comments documenting logical_limits() for more context.
+    TextureLimits logical_limits;
+    // Contains the number of elements in the tensor according to the canonical
+    // sizes.
+    size_t numel;
+
+    friend class vTensor;
+
+    UniformData(
+        const std::vector<int64_t>& sizes,
+        const std::vector<int64_t>& strides,
+        const TextureLimits& logical_limits,
+        const size_t numel)
+        : sizes_v(utils::make_whcn_ivec4(sizes)),
+          strides_v(utils::make_whcn_ivec4(strides)),
+          logical_limits(logical_limits),
+          numel(numel) {}
+
+   public:
+    /*
+     * Write tensor's metadata into dst, at the given dst_offset. max_dst_size
+     * is the size of dst and is used to avoid out of bounds writes.
+     */
+    uint32_t write_attribute(
+        void* dst,
+        const uint32_t dst_offset,
+        const uint32_t max_dst_size,
+        const Attribute attr);
+  };
+
  private:
   /*
    * "Core" tensor metadata. They are the minimum amount of information required
@@ -274,9 +314,6 @@ class vTensor final {
   // strides of the tensor in NCHW dimension order
   std::vector<int64_t> strides_;
 
-  // Contains the number of elements in the tensor according to the canonical
-  // sizes.
-  size_t numel_;
 
   /*
    * The below metadata members are derived from the above, and are typically
@@ -293,8 +330,6 @@ class vTensor final {
   // Contains the number of elements in the tensor according to the padded
   // sizes.
   size_t padded_numel_;
-  // See the comments documenting logical_limits() for more context.
-  TextureLimits logical_limits_;
 
   /*
    * Utility GPU buffer that can be passed to shaders in order to convey tensor
@@ -326,6 +361,8 @@ class vTensor final {
 
   vTensorStorage storage_;
 
+  std::shared_ptr<UniformData> uniform_data_;
+
  public:
   /*
     Texture Access
@@ -391,7 +428,7 @@ class vTensor final {
    * instead of the original sizes.
   */
  inline const utils::ivec3& logical_limits() const {
-    return logical_limits_.limits;
+    return uniform_data_->logical_limits.limits;
  }
 
  /*
@@ -501,7 +538,7 @@ class vTensor final {
   const vkapi::BufferBindInfo numel_ubo();
 
   inline size_t numel() const {
-    return numel_;
+    return uniform_data_->numel;
   }
 
   inline size_t nbytes() const {
@@ -589,7 +626,18 @@ class vTensor final {
   inline bool is_view_of(const vTensor& other) const {
     return storage_.is_copy_of(other.storage_);
   }
+
+  const std::shared_ptr<UniformData>& get_uniform_data() const {
+    return uniform_data_;
+  }
 };
 
+static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES;
+static constexpr vTensor::Attribute kTensorStrides =
+    vTensor::Attribute::STRIDES;
+static constexpr vTensor::Attribute kTensorLogicalLimits =
+    vTensor::Attribute::LOGICAL_LIMITS;
+static constexpr vTensor::Attribute kTensorNumel = vTensor::Attribute::NUMEL;
+
 } // namespace api
 } // namespace vkcompute
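
As a usage illustration (not part of this diff), here is a minimal sketch of how the new `UniformData` API might be exercised to pack tensor metadata into a CPU-side buffer intended for push constants. Only `get_uniform_data()`, `write_attribute()`, and the `kTensor*` attribute constants come from this change; the helper name `pack_tensor_push_constants`, the `kMaxPushConstantSize` limit, and the include path are assumptions for the sake of the example.

```cpp
#include <cstdint>
#include <memory>

// Assumed include path for the header touched by this diff.
#include <executorch/backends/vulkan/runtime/api/containers/Tensor.h>

namespace vkcompute {

// Hypothetical CPU-side staging capacity for push constant data.
constexpr uint32_t kMaxPushConstantSize = 128;

// Hypothetical helper: serialize selected tensor attributes into `data`,
// which would later be supplied to an op shader as push constants.
uint32_t pack_tensor_push_constants(
    api::vTensor& tensor,
    uint8_t (&data)[kMaxPushConstantSize]) {
  const std::shared_ptr<api::vTensor::UniformData>& uniform_data =
      tensor.get_uniform_data();

  uint32_t offset = 0;
  // Each write_attribute call copies one attribute (here the WHCN sizes and
  // the texture logical limits) into `data` at `offset`, checks the write
  // against kMaxPushConstantSize, and returns the number of bytes written.
  offset += uniform_data->write_attribute(
      data, offset, kMaxPushConstantSize, api::kTensorSizes);
  offset += uniform_data->write_attribute(
      data, offset, kMaxPushConstantSize, api::kTensorLogicalLimits);
  return offset;
}

} // namespace vkcompute
```

Packing through `UniformData` this way keeps the attribute layout and the bounds check in one place, rather than having each call site copy individual tensor members by hand.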