Skip to content

Commit

Permalink
Merge branch 'tile_load_sync' into 'main'
Browse files Browse the repository at this point in the history
Detect data reinitialization in tile_shared_t and sync if true

See merge request omniverse/warp!981
  • Loading branch information
mmacklin committed Jan 17, 2025
2 parents 95e56cb + f50d05f commit 6ab3ca3
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
- Fix the OpenGL renderer now correctly displaying duplicate cylinder and mesh shapes ([GH-388](https://github.com/NVIDIA/warp/issues/388)).
- Fix the overriding of `wp.sim.Model` default parameters ([GH-429](https://github.com/NVIDIA/warp/pull/429)).
- Fix `wp.array()` not respecting the target `dtype` when the given data is another array with a CUDA interface ([GH-363](https://github.com/NVIDIA/warp/issues/363)).
- Add an implicit tile synchronization whenever a shared memory tile's data is reinitialized (e.g. in dynamic loops). This could result in lower performance.

## [1.5.1] - 2025-01-02

Expand Down
50 changes: 44 additions & 6 deletions warp/native/tile.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,13 +569,22 @@ struct tile_shared_t
Storage data;
Storage grad;

// we need to track whether or not this tile's data has been initialized.
// once true, any re-initialization of data that follows needs a WP_TILE_SYNC()
// call to precede it, to allow threads that are still reading from this tile
// to complete their work. e.g, in a dynamic loop:
// for i in range(x):
// tile = wp.tile_load(arr, i, TILE_SIZE, storage="shared")
// # read from tile...
bool initialized;

// default initialization (non-initialized)
inline CUDA_CALLABLE tile_shared_t() : data(NULL), grad(NULL)
inline CUDA_CALLABLE tile_shared_t() : data(NULL), grad(NULL), initialized(false)
{
}

// initialize from an existing tile's memory
inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=NULL) : data(data), grad(grad)
inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=NULL, bool initialized=true) : data(data), grad(grad), initialized(initialized)
{
}

Expand Down Expand Up @@ -613,6 +622,7 @@ struct tile_shared_t
// alias tile directly
data = rhs.data;
grad = rhs.grad;
initialized = rhs.initialized;

return *this;
}
Expand All @@ -633,9 +643,16 @@ struct tile_shared_t
// broadcast-assign a single constant value to every element of the tile
inline CUDA_CALLABLE auto& operator=(const T& x)
{
    // If this tile already holds live data, other threads may still be
    // reading it (e.g. when the tile is re-assigned inside a dynamic loop),
    // so wait for them before overwriting.
    if (initialized)
        WP_TILE_SYNC();

    // each thread writes a strided subset of the M*N elements
    constexpr int num_elements = M*N;
    for (int idx = threadIdx.x; idx < num_elements; idx += WP_TILE_BLOCK_DIM)
        data(idx) = x;

    initialized = true;
    WP_TILE_SYNC();
    return *this;
}
Expand Down Expand Up @@ -674,7 +691,13 @@ struct tile_shared_t

// copy register tile to shared
inline CUDA_CALLABLE void assign(const tile_register_t<T, M, N>& tile)
{
{
// sync if we are re-initializing data so that any threads that are still
// reading from this tile can complete their work, e.g.: if re-assigning
// to a tile during a dynamic loop
if (initialized)
WP_TILE_SYNC();

WP_PRAGMA_UNROLL
for (int i=0; i < tile.NumRegs; ++i)
{
Expand All @@ -688,6 +711,7 @@ struct tile_shared_t
data(linear) = tile.data[i];
}

initialized = true;
WP_TILE_SYNC();
}

Expand Down Expand Up @@ -855,6 +879,12 @@ struct tile_shared_t

inline CUDA_CALLABLE void copy_from_global(const array_t<T>& src, int x)
{
// sync if we are re-initializing data so that any threads that are still
// reading from this tile can complete their work, e.g.: if re-assigning
// to a tile during a dynamic loop
if (initialized)
WP_TILE_SYNC();

// todo: use async pipelines or TMA here
const int tile_i = x*N;

Expand All @@ -864,11 +894,18 @@ struct tile_shared_t
data(i) = wp::index(src, tile_i + i);
}

initialized = true;
WP_TILE_SYNC();
}

inline CUDA_CALLABLE void copy_from_global(const array_t<T>& src, int x, int y)
{
// sync if we are re-initializing data so that any threads that are still
// reading from this tile can complete their work, e.g.: if re-assigning
// to a tile during a dynamic loop
if (initialized)
WP_TILE_SYNC();

// todo: use async pipelines or TMA here
const int tile_i = x*M;
const int tile_j = y*N;
Expand Down Expand Up @@ -930,6 +967,7 @@ struct tile_shared_t
}
}

initialized = true;
#if !WP_USE_ASYNC_PIPELINE
WP_TILE_SYNC();
#endif
Expand Down Expand Up @@ -997,8 +1035,8 @@ inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, M, N, StrideM, Stride
// helpers to allocate shared tiles
template <typename T, int M, int N, bool RequiresGrad>
inline CUDA_CALLABLE auto tile_alloc_empty()

{ constexpr int Len = M*N;
{
constexpr int Len = M*N;
T* data = (T*)tile_alloc_shared(Len*sizeof(T));
T* grad = NULL;

Expand All @@ -1022,7 +1060,7 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
WP_TILE_SYNC();
}

return tile_shared_t<T, M, N>(data, grad);
return tile_shared_t<T, M, N>(data, grad, false);
}

template <typename T, int M, int N, bool RequiresGrad>
Expand Down

0 comments on commit 6ab3ca3

Please sign in to comment.