Add Shared Memory Bandwidth Profiler (pytorch#4277)
Summary:
Pull Request resolved: pytorch#4277

This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from shared memory, using the following shader, where A is an array in shared memory and B is a write-only buffer.

  shared vec4 A[nvec];

  void main() {
    vec4 sum = vec4(0);
    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
    uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

    int i = 0;
    for (; i < niter; ++i)
    {
        sum *= A[offset];
        offset = (offset + local_group_size) & addr_mask;
        ...
        ...
        sum *= A[offset];
        offset = (offset + local_group_size) & addr_mask;
    }

    vec4 zero = vec4(i>>31);

    B[gl_LocalInvocationID[0]] = sum + zero;
  }

The address mask lets us control how many unique addresses we access. For example, if we want to read four unique vectors, the offset keeps cycling through four unique addresses across the iterations, giving us the bandwidth for that specific data size (the count is always a power of two, since the mask is `nvec - 1` and the AND only behaves as a modulo for powers of two). If the amount of unique data read is larger than the workgroup size, then each workgroup has its own block of data to read, defined by the initial offset calculation, where the offset is obtained from the workgroup ID and the local invocation ID.
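
To make the masking concrete, here is a small host-side sketch of the offset arithmetic that a single invocation performs. This is not part of the diff; the names and values are illustrative, and `nvec` is assumed to be a power of two, which always holds in the profiler because the access size doubles at each step.

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t nvec = 16;             // unique vec4 addresses to touch
    const uint32_t addr_mask = nvec - 1;  // x & (2^n - 1) == x % 2^n
    const uint32_t local_group_size = 4;  // stride between consecutive reads
    const uint32_t local_invocation_id = 1;

    // Each invocation starts at its own lane and strides by the group size,
    // so the invocations together cover all nvec addresses while each one
    // keeps wrapping inside the same window.
    uint32_t offset = local_invocation_id & addr_mask;
    for (int i = 0; i < 8; ++i) {
      std::printf("read A[%u]\n", offset);  // prints 1, 5, 9, 13, 1, 5, 9, 13
      offset = (offset + local_group_size) & addr_mask;
    }
    return 0;
  }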

Finally, we make sure to use both the `sum` and `i` variables when writing the output, so that the compiler's optimizer does not flatten the loop away: `i >> 31` is always zero for the iteration counts used, so it does not change the result, but it keeps the loop counter (and therefore the loop) live.
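
The same guard can be sketched in host code: store the accumulated value somewhere the compiler must treat as observable, so the timed loop cannot be removed. This is only an illustration of the idiom, not code from the diff.

  #include <cstdint>

  // An observable side effect that the optimizer is not allowed to discard.
  volatile float g_sink;

  float timed_read_loop(const float* data, uint32_t nvec, uint32_t niter) {
    const uint32_t addr_mask = nvec - 1;  // nvec assumed to be a power of two
    float sum = 0.f;
    uint32_t offset = 0;
    for (uint32_t i = 0; i < niter; ++i) {
      sum += data[offset];
      offset = (offset + 1) & addr_mask;
    }
    g_sink = sum;  // keeps the reads live, mirroring B[...] = sum + zero
    return sum;
  }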

For a Samsung S22, the bandwidth behaves as shown below. We can see that accessing shared memory has a constant latency until the accessed range reaches the maximum shared memory size.

NOTE: The graph is extended for visualization purposes; the experiment stops before the drop, because otherwise it would crash.

{F1759597657}

Comparing it to OpenCL, we can observe that, although the behavior is the same, Vulkan achieves higher bandwidth.

{F1759600867}

Reviewed By: copyrightly

Differential Revision: D59811152

fbshipit-source-id: 537be13dbec1a02cb55e689db2a0fd548613c729
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 18, 2024
1 parent a4decca commit e5687a4
Showing 3 changed files with 43 additions and 27 deletions.
22 changes: 16 additions & 6 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
@@ -14,22 +14,32 @@ layout(std430) buffer;

$if MEMTYPE == "ubo":
${layout_declare_ubo(0, "vec4", "A")}
$else:
$elif MEMTYPE == "buffer":
${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)}
$else:
${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)}

${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
layout(constant_id = 4) const int addr_mask = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;

$if MEMTYPE == "shared":
shared vec4 A[nvec];

void main() {

$if MEMTYPE == "shared":
A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
memoryBarrierShared();

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This is to distribute the accesses to unique addresses across the workgroups, once the
1 change: 1 addition & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml
@@ -13,5 +13,6 @@ buf_bandwidth:
MEMTYPE:
- VALUE: ubo
- VALUE: buffer
- VALUE: shared
shader_variants:
- NAME: buf_bandwidth
47 changes: 26 additions & 21 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -7,7 +7,6 @@
*/

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <iostream>

#include "stats.h"
@@ -18,6 +17,7 @@ using namespace vkapi;
class App {
private:
size_t buf_cache_size_;
uint32_t max_shared_mem_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

@@ -33,11 +33,12 @@ class App {
sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
std::cout << std::endl;
std::cout << "SM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
}

void reg_count() {
@@ -211,13 +212,8 @@ class App {
}

private:
void _bandwidth(std::string memtype) {
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable

// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
// Cache lines flushed
const uint32_t NFLUSH = 4;
// Number of loop unrolls. Changing this value requires an equal change in
@@ -230,11 +226,14 @@
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
// Number of vectors that fit in the selected memory space
const uint32_t NVEC = RANGE / VEC_SIZE;
const uint32_t NVEC = range / VEC_SIZE;
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all vectors
const uint32_t NTHREAD = NVEC / NREAD_PER_THREAD;
// Number of threads needed to read all vectors
// The thread count doesn't divide by thread workload in shared memory
// because of the limited memory size.
const uint32_t NTHREAD =
memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
// Occupy all threads
const uint32_t local_x = nthread_logic_;
// Ensure that global is a multiple of local, and distribute across all SMs
@@ -245,12 +244,7 @@
// Number of vectors that fit in this iteration
const uint32_t nvec_access = access_size / VEC_SIZE;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the iteration
const uint32_t addr_mask = nvec_access - 1;

StorageBuffer in_buf(context(), vkapi::kFloat, RANGE / sizeof(float));
StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};
@@ -269,7 +263,7 @@
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(addr_mask), SV(local_x)},
{SV(NITER), SV(nvec_access), SV(local_x)},
VK_NULL_HANDLE,
0,
in_buf.buffer(),
@@ -286,7 +280,7 @@

double max_bandwidth = 0;
double min_bandwidth = DBL_MAX;
for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
for (uint32_t access_size = VEC_SIZE; access_size < range;
access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth = std::max(gbps, max_bandwidth);
@@ -302,12 +296,22 @@
public:
void buf_bandwidth() {
std::cout << "\n------ Memory Bandwidth ------" << std::endl;
_bandwidth("Buffer");
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
std::cout << "\n------ UBO Bandwidth ------" << std::endl;
_bandwidth("UBO");
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("UBO", RANGE);
}
void shared_mem_bandwidth() {
std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}
};

@@ -319,6 +323,7 @@ int main(int argc, const char** argv) {
app.buf_cacheline_size();
app.buf_bandwidth();
app.ubo_bandwidth();
app.shared_mem_bandwidth();

return 0;
}
