Skip to content

Commit

Permalink
Moved a bunch of variables out of shared memory to reduce memory bandwidth, …
Browse files Browse the repository at this point in the history
…changed threads per block to 128 rows. Now 77% speedup vs original on a 2080 Ti
  • Loading branch information
azonenberg committed Jul 8, 2024
1 parent 227c26b commit d72278a
Showing 1 changed file with 13 additions and 17 deletions.
30 changes: 13 additions & 17 deletions src/ngscopeclient/shaders/waveform-compute.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,12 @@
#define MAX_HEIGHT 2048

//Number of threads per column of pixels
#define ROWS_PER_BLOCK 64
#define ROWS_PER_BLOCK 128

//Local working buffer held in shared memory (MAX_HEIGHT uints = 8 kB)
shared uint g_workingBuffer[MAX_HEIGHT];

//Min/max for the current sample
shared int g_blockmin[ROWS_PER_BLOCK];
shared int g_blockmax[ROWS_PER_BLOCK];
shared float g_alpha[ROWS_PER_BLOCK];
shared bool g_done;
shared bool g_updating[ROWS_PER_BLOCK];

layout(local_size_x=1, local_size_y=ROWS_PER_BLOCK, local_size_z=1) in;

//Global configuration for the run
Expand Down Expand Up @@ -255,6 +249,10 @@ void main()
//Main loop
while(true)
{
int blockmin = 0;
int blockmax = 0;
bool updating = false;

if(i < (memDepth - ADDTL_NEEDED_SAMPLES) )
{
//Fetch coordinates
Expand Down Expand Up @@ -324,26 +322,26 @@ void main()
if( ( (starty < 0) && (endy < 0) ) ||
( (starty >= windowHeight) && (endy >= windowHeight) ) )
{
g_updating[gl_LocalInvocationID.y] = false;
updating = false;
}

//Something is visible. Clip to window size in case anything is partially offscreen
else
{
g_updating[gl_LocalInvocationID.y] = true;
updating = true;

starty = min(starty, windowHeight - 1);
endy = min(endy, windowHeight - 1);
starty = max(starty, 0);
endy = max(endy, 0);

//Sort Y coordinates from min to max
g_blockmin[gl_LocalInvocationID.y] = int(min(starty, endy));
g_blockmax[gl_LocalInvocationID.y] = int(max(starty, endy));
blockmin = int(min(starty, endy));
blockmax = int(max(starty, endy));
}
}
else
g_updating[gl_LocalInvocationID.y] = false;
updating = false;

//Check if we're at the end of the pixel
if(right.x > gl_GlobalInvocationID.x + 1)
Expand All @@ -353,7 +351,7 @@ void main()
else
{
l_done = true;
g_updating[gl_LocalInvocationID.y] = false;
updating = false;
}

i += ROWS_PER_BLOCK;
Expand All @@ -362,11 +360,9 @@ void main()
g_done = true;

//integrate intensity graded output
if(g_updating[gl_LocalInvocationID.y])
if(updating)
{
int nmin = g_blockmin[gl_LocalInvocationID.y];
int nmax = g_blockmax[gl_LocalInvocationID.y];
for(int y=nmin; y<=nmax; y++)
for(int y=blockmin; y<=blockmax; y++)
{
#ifdef HISTOGRAM_PATH
atomicMax(g_workingBuffer[y], 1);
Expand Down

0 comments on commit d72278a

Please sign in to comment.