Add Shared Memory Bandwidth Profiler (pytorch#4277)
Summary:
Pull Request resolved: pytorch#4277

This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from shared memory, using the following shader, where A is an array in shared memory and B is a write-only buffer.

  shared vec4 A[nvec];

  void main() {
    vec4 sum = vec4(0);
    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
    uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

    int i = 0;
    for (; i < niter; ++i)
    {
        sum *= A[offset];
        offset = (offset + local_group_size) & addr_mask;
        ...
        ...
        sum *= A[offset];
        offset = (offset + local_group_size) & addr_mask;
    }

    vec4 zero = vec4(i>>31);

    B[gl_LocalInvocationID[0]] = sum + zero;
  }

The address mask lets us control how many unique addresses we access. For example, if we want to read four unique vectors, the offset keeps cycling through four unique addresses across the iterations, giving us the bandwidth for that specific data size (the count is always a power of two, since the mask is `nvec - 1` and the AND only behaves as a modulo for powers of two). If the amount of unique data read is larger than the workgroup size, then each workgroup has its own block of data to read, defined by the initial offset calculation, where the offset is obtained from the workgroup ID and the local invocation ID.
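
To make the masking concrete, here is a small host-side sketch of the offset arithmetic that a single invocation performs. This is not part of the diff; the names and values are illustrative, and `nvec` is assumed to be a power of two, which always holds in the profiler because the access size doubles at each step.

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t nvec = 16;             // unique vec4 addresses to touch
    const uint32_t addr_mask = nvec - 1;  // x & (2^n - 1) == x % 2^n
    const uint32_t local_group_size = 4;  // stride between consecutive reads
    const uint32_t local_invocation_id = 1;

    // Each invocation starts at its own lane and strides by the group size,
    // so the invocations together cover all nvec addresses while each one
    // keeps wrapping inside the same window.
    uint32_t offset = local_invocation_id & addr_mask;
    for (int i = 0; i < 8; ++i) {
      std::printf("read A[%u]\n", offset);  // prints 1, 5, 9, 13, 1, 5, 9, 13
      offset = (offset + local_group_size) & addr_mask;
    }
    return 0;
  }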

Finally, we make sure to use both the `sum` and `i` variables when writing the output, so that the compiler's optimizer does not flatten the loop away: `i >> 31` is always zero for the iteration counts used, so it does not change the result, but it keeps the loop counter (and therefore the loop) live.
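
The same guard can be sketched in host code: store the accumulated value somewhere the compiler must treat as observable, so the timed loop cannot be removed. This is only an illustration of the idiom, not code from the diff.

  #include <cstdint>

  // An observable side effect that the optimizer is not allowed to discard.
  volatile float g_sink;

  float timed_read_loop(const float* data, uint32_t nvec, uint32_t niter) {
    const uint32_t addr_mask = nvec - 1;  // nvec assumed to be a power of two
    float sum = 0.f;
    uint32_t offset = 0;
    for (uint32_t i = 0; i < niter; ++i) {
      sum += data[offset];
      offset = (offset + 1) & addr_mask;
    }
    g_sink = sum;  // keeps the reads live, mirroring B[...] = sum + zero
    return sum;
  }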

For a Samsung S22, the bandwidth behaves as shown below. We can see that accessing shared memory has a constant latency until the accessed range reaches the maximum shared memory size.

NOTE: The graph is extended for visualization purposes; the experiment stops before the drop, because otherwise it would crash.

{F1759597657}

Comparing it to OpenCL, we can observe that, although the behavior is the same, Vulkan achieves higher bandwidth.

{F1759600867}

Reviewed By: copyrightly

Differential Revision: D59811152

fbshipit-source-id: 537be13dbec1a02cb55e689db2a0fd548613c729
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 18, 2024
1 parent a4decca commit e5687a4
Showing 3 changed files with 43 additions and 27 deletions.
22 changes: 16 additions & 6 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
@@ -14,22 +14,32 @@ layout(std430) buffer;

$if MEMTYPE == "ubo":
${layout_declare_ubo(0, "vec4", "A")}
$else:
$elif MEMTYPE == "buffer":
${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)}
$else:
${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)}

${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
layout(constant_id = 4) const int addr_mask = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;

$if MEMTYPE == "shared":
shared vec4 A[nvec];

void main() {

$if MEMTYPE == "shared":
A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
memoryBarrierShared();

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This is to distribute the accesses to unique addresses across the workgroups, once the
1 change: 1 addition & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml
@@ -13,5 +13,6 @@ buf_bandwidth:
MEMTYPE:
- VALUE: ubo
- VALUE: buffer
- VALUE: shared
shader_variants:
- NAME: buf_bandwidth
47 changes: 26 additions & 21 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -7,7 +7,6 @@
*/

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <iostream>

#include "stats.h"
@@ -18,6 +17,7 @@ using namespace vkapi;
class App {
private:
size_t buf_cache_size_;
uint32_t max_shared_mem_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

@@ -33,11 +33,12 @@ class App {
sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
std::cout << std::endl;
std::cout << "SM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
}

void reg_count() {
@@ -211,13 +212,8 @@ class App {
}

private:
void _bandwidth(std::string memtype) {
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable

// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
// Cache lines flushed
const uint32_t NFLUSH = 4;
// Number of loop unrolls. Changing this value requires an equal change in
@@ -230,11 +226,14 @@
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
// Number of vectors that fit in the selected memory space
const uint32_t NVEC = RANGE / VEC_SIZE;
const uint32_t NVEC = range / VEC_SIZE;
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all vectors
const uint32_t NTHREAD = NVEC / NREAD_PER_THREAD;
// Number of threads needed to read all vectors
// The thread count doesn't divide by thread workload in shared memory
// because of the limited memory size.
const uint32_t NTHREAD =
memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
// Occupy all threads
const uint32_t local_x = nthread_logic_;
// Ensure that global is a multiple of local, and distribute across all SMs
@@ -245,12 +244,7 @@
// Number of vectors that fit in this iteration
const uint32_t nvec_access = access_size / VEC_SIZE;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the iteration
const uint32_t addr_mask = nvec_access - 1;

StorageBuffer in_buf(context(), vkapi::kFloat, RANGE / sizeof(float));
StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};
@@ -269,7 +263,7 @@
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(addr_mask), SV(local_x)},
{SV(NITER), SV(nvec_access), SV(local_x)},
VK_NULL_HANDLE,
0,
in_buf.buffer(),
@@ -286,7 +280,7 @@

double max_bandwidth = 0;
double min_bandwidth = DBL_MAX;
for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
for (uint32_t access_size = VEC_SIZE; access_size < range;
access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth = std::max(gbps, max_bandwidth);
@@ -302,12 +296,22 @@
public:
void buf_bandwidth() {
std::cout << "\n------ Memory Bandwidth ------" << std::endl;
_bandwidth("Buffer");
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
std::cout << "\n------ UBO Bandwidth ------" << std::endl;
_bandwidth("UBO");
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("UBO", RANGE);
}
void shared_mem_bandwidth() {
std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}
};

@@ -319,6 +323,7 @@ int main(int argc, const char** argv) {
app.buf_cacheline_size();
app.buf_bandwidth();
app.ubo_bandwidth();
app.shared_mem_bandwidth();

return 0;
}
