Cudastf #4

Merged (19 commits, Oct 11, 2024)

Changes from all commits
6 changes: 3 additions & 3 deletions cudax/cmake/cudaxHeaderTesting.cmake
@@ -57,9 +57,9 @@ function(cudax_add_header_test label definitions)
# FIXME: layout_left::mapping referenced before layout_left:
cuda/experimental/__stf/supplemental_std_experimental/__p0009_bits/layout_left.hpp

# FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114)
cuda/experimental/__stf/stream/stream_task.cuh
cuda/experimental/__stf/stream/stream_ctx.cuh
# # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114)
# cuda/experimental/__stf/stream/stream_task.cuh
# cuda/experimental/__stf/stream/stream_ctx.cuh
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_definitions(${headertest_target} PRIVATE
4 changes: 2 additions & 2 deletions cudax/examples/stf/1f1b.cu
@@ -81,7 +81,7 @@ int main(int argc, char** argv) {
for (size_t iter = 0; iter < niter; iter++) {
size_t task_cnt = 0;
for (size_t b = 0; b < num_batches; b++) {
for (size_t d = 0; d < num_devs; d++) {
for (int d = 0; d < num_devs; d++) {
ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) {
int ms = 10;
long long int clock_cnt = (long long int) (ms * clock_rate / factor);
@@ -92,7 +92,7 @@
// }
//
// for (size_t b = 0; b < num_batches; b++) {
for (size_t d = num_devs; d-- > 0;) {
for (int d = num_devs; d-- > 0;) {
ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) {
int ms = 20;
long long int clock_cnt = (long long int) (ms * clock_rate / factor);
2 changes: 1 addition & 1 deletion cudax/examples/stf/binary_fhe.cu
@@ -24,7 +24,7 @@ class plaintext {
public:
plaintext(const context& ctx) : ctx(ctx) {}

plaintext(context& ctx, std::vector<char> v) : ctx(ctx), values(v) {
plaintext(context& ctx, std::vector<char> v) : values(v), ctx(ctx) {
l = ctx.logical_data(&values[0], values.size());
}

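The initializer-list swap above avoids a -Wreorder warning: members are always initialized in declaration order, so the initializer list should match it. A standalone sketch of the same pattern (the `reorder_demo` type below is hypothetical, not the STF `plaintext` class):

```cpp
#include <utility>
#include <vector>

struct reorder_demo
{
  // Members are initialized in this declaration order, regardless of the
  // order written in the constructor's initializer list.
  std::vector<char> values;
  int ctx_id;

  // Listing ctx_id before values in the initializer list would trigger
  // -Wreorder (an error under -Werror), hence the swap in the diff above.
  reorder_demo(int ctx, std::vector<char> v) : values(std::move(v)), ctx_id(ctx) {}
};

int main()
{
  reorder_demo d(0, {1, 0, 1});
  return d.ctx_id;
}
```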
4 changes: 2 additions & 2 deletions cudax/examples/stf/graph_algorithms/pagerank.cu
@@ -124,8 +124,8 @@ int main() {
printf("Page rank answer is %s.\n", abs(sum_pageranks - 1.0) < 0.001 ? "correct" : "not correct");

printf("PageRank Results:\n");
for (int i = 0; i < page_rank.size(); ++i) {
printf("Vertex %d: %f\n", i, page_rank[i]);
for (size_t i = 0; i < page_rank.size(); ++i) {
printf("Vertex %ld: %f\n", i, page_rank[i]);
}

return 0;
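This change, like the similar ones in 1f1b.cu, mandelbrot.cu, and redundant_data.cu below, keeps the loop index type consistent with the container's `size_t` bound so that -Wsign-compare stays quiet under warnings-as-errors. A minimal sketch (note that `%zu` is the strictly portable conversion for `size_t`; the `%ld` used in the diff assumes `long` and `size_t` have the same width on the target):

```cpp
#include <cstdio>
#include <vector>

int main()
{
  std::vector<double> page_rank(4, 0.25);

  // page_rank.size() is size_t; an int index would mix signed and unsigned
  // operands in the comparison and trigger -Wsign-compare.
  for (size_t i = 0; i < page_rank.size(); ++i)
  {
    std::printf("Vertex %zu: %f\n", i, page_rank[i]);
  }
  return 0;
}
```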
4 changes: 2 additions & 2 deletions cudax/examples/stf/mandelbrot.cu
@@ -95,8 +95,8 @@ int main(int argc, char** argv) {
imageFile << width << " " << height << "\n";
imageFile << "255\n";

for (int y = 0; y < height; y++)
for (int x = 0; x < width; x++) {
for (size_t y = 0; y < height; y++)
for (size_t x = 0; x < width; x++) {
int iterations = buffer(x, y);
// Convert iterations to RGB values
unsigned char r = (iterations % 8) * 32;
2 changes: 1 addition & 1 deletion cudax/examples/stf/word_count.cu
@@ -40,7 +40,7 @@ int main() {

context ctx;

auto ltext = ctx.logical_data((char*) &raw_input[0], { sizeof(raw_input) });
auto ltext = ctx.logical_data(const_cast<char*>(&raw_input[0]), { sizeof(raw_input) });

int cnt = 0;
auto lcnt = ctx.logical_data(&cnt, { 1 });
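Replacing the C-style cast with `const_cast` makes it explicit that const-ness is the only qualifier being removed; a C-style cast would also silently allow unrelated pointer conversions. A hedged sketch with a hypothetical `register_buffer` standing in for `ctx.logical_data`:

```cpp
#include <cstddef>

// Hypothetical stand-in for an API that takes a mutable buffer, as
// ctx.logical_data(char*, ...) does in the diff above.
void register_buffer(char* data, std::size_t len)
{
  (void) data;
  (void) len;
}

int main()
{
  static const char raw_input[] = "The quick brown fox";

  // const_cast documents that only const is being stripped; a C-style cast
  // such as (char*) &raw_input[0] would compile even if the pointee type
  // were wrong. The callee must still never write through this pointer.
  register_buffer(const_cast<char*>(&raw_input[0]), sizeof(raw_input));
  return 0;
}
```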
@@ -34,8 +34,8 @@ protected:
}

public:
const cudaGraphNode_t node;
size_t epoch;
mutable cudaGraphNode_t node;
mutable size_t epoch;
};

using graph_event = handle<graph_event_impl, handle_flags::non_null>;
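Dropping `const` in favour of `mutable` here presumably allows the node/epoch bookkeeping to be updated through the const access that the `handle<>` wrapper exposes (const data members would also make the type non-assignable). A minimal sketch of that pattern, with hypothetical names:

```cpp
#include <cstddef>

// Hypothetical sketch: an event whose bookkeeping fields may be updated
// even when the event is only reachable through a const reference.
class event_impl
{
public:
  // `mutable` permits assignment in const contexts, which `const` members
  // would forbid.
  mutable int node = -1;
  mutable std::size_t epoch = 0;
};

void advance(const event_impl& e)
{
  // Legal only because the member is mutable.
  e.epoch++;
}

int main()
{
  const event_impl e;
  advance(e);
  return static_cast<int>(e.epoch); // 1
}
```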
@@ -149,7 +149,7 @@ public:
}

/// A unique identifier for the event, used to ensure proper event ordering.
const unique_id_t unique_prereq_id;
mutable unique_id_t unique_prereq_id;

::std::atomic<int> outbound_deps = 0;

25 changes: 13 additions & 12 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -281,17 +281,6 @@ public:
dot.template add_vertex<typename Ctx::task_type, logical_data_untyped>(t);
}

auto insert_one_kernel = [](cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) {
cudaKernelNodeParams kconfig;
kconfig.blockDim = k.blockDim;
kconfig.extra = nullptr;
kconfig.func = (void*) k.func;
kconfig.gridDim = k.gridDim;
kconfig.kernelParams = k.args_ptr.data();
kconfig.sharedMemBytes = k.sharedMem;
cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
};

// When chained is enabled, we expect a vector of kernel descriptions which should be executed one after the other
if constexpr (chained)
{
@@ -362,6 +351,18 @@ public:
}

private:
/* Add a kernel to a CUDA graph given its description */
auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const {
cudaKernelNodeParams kconfig;
kconfig.blockDim = k.blockDim;
kconfig.extra = nullptr;
kconfig.func = const_cast<void *>(k.func);
kconfig.gridDim = k.gridDim;
kconfig.kernelParams = k.args_ptr.data();
kconfig.sharedMemBytes = k.sharedMem;
cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
}

::std::string symbol;
Ctx& ctx;
task_dep_vector<Deps...> deps;
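`insert_one_kernel` moves from a local lambda into a private member function, but its body remains the standard recipe for adding a kernel node to a CUDA graph. A self-contained sketch using the plain runtime API (the `scale` kernel and the sizes are made up for illustration, not taken from STF):

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor)
{
  data[threadIdx.x] *= factor;
}

int main()
{
  float* d_data;
  cudaMalloc(&d_data, 32 * sizeof(float));

  cudaGraph_t graph;
  cudaGraphCreate(&graph, 0);

  // Fill cudaKernelNodeParams the same way insert_one_kernel does above:
  // function pointer, launch geometry, shared memory, and argument pointers.
  float factor = 2.0f;
  void* args[] = {&d_data, &factor};

  cudaKernelNodeParams params{};
  params.func           = (void*) scale;
  params.gridDim        = dim3(1);
  params.blockDim       = dim3(32);
  params.sharedMemBytes = 0;
  params.kernelParams   = args;
  params.extra          = nullptr;

  cudaGraphNode_t node;
  cudaError_t err = cudaGraphAddKernelNode(&node, graph, nullptr, 0, &params);
  std::printf("cudaGraphAddKernelNode: %s\n", cudaGetErrorString(err));

  cudaGraphDestroy(graph);
  cudaFree(d_data);
  return 0;
}
```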
@@ -1068,7 +1069,7 @@ private:
template <typename T, typename... P>
auto make_data_interface(P&&... p)
{
return ::std::make_shared<typename Engine::data_interface<T>>(::std::forward<P>(p)...);
return ::std::make_shared<typename Engine::template data_interface<T>>(::std::forward<P>(p)...);
}
};

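The added `template` keyword above is required because `Engine::data_interface` is a dependent name: without it, the `<` after the member template is parsed as a less-than operator. A minimal reproduction (the `engine` type below is hypothetical):

```cpp
#include <memory>

template <typename T>
struct engine
{
  template <typename U>
  struct data_interface
  {
    U value{};
  };
};

template <typename Engine, typename T>
auto make_data_interface()
{
  // Engine is a dependent type here, so the nested member template must be
  // named with both `typename` (it names a type) and `template` (it is a
  // template); otherwise the compiler cannot parse the angle brackets.
  return std::make_shared<typename Engine::template data_interface<T>>();
}

int main()
{
  auto iface = make_data_interface<engine<float>, int>();
  iface->value = 42;
  return 0;
}
```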
@@ -575,7 +575,7 @@ public:
auto& current_instance = get_data_instance(instance_id);
auto current_state = current_instance.get_msir();

static size_t total_write_back_cnt = 0;
// static size_t total_write_back_cnt = 0;

/* Update MSI status depending on the current states and the required access mode */
switch (current_state)
@@ -597,7 +597,7 @@
prereqs.merge(ref_instance.get_read_prereq(), current_instance.get_read_prereq());

write_back(memory_node, instance_id, prereqs);
total_write_back_cnt++;
// total_write_back_cnt++;
// fprintf(stderr, "WRITE BACK... %s (%ld)!!\n", get_symbol().c_str(), total_write_back_cnt);

ref_instance.add_read_prereq(prereqs);
@@ -287,7 +287,6 @@ public:
}
else
{
auto grid_dims = t.grid_dims();
size_t grid_size = t.grid_dims().size();
for (size_t i = 0; i < grid_size; i++)
{
@@ -353,7 +352,7 @@ public:
}();

const auto [block_size, min_blocks] = conf;
const size_t n = sub_shape.size();
size_t n = sub_shape.size();

// If there is no item in that shape, no need to launch a kernel !
if (n == 0)
@@ -428,7 +427,7 @@ public:

auto arg1 = mv(explode_deps);
auto arg2 = deps.instance(t);
void* kernelArgs[] = {(void*) &n, &arg1, &arg2};
void* kernelArgs[] = {&n, &arg1, &arg2};
kernel_params.kernelParams = kernelArgs;
kernel_params.extra = nullptr;

4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/internal/slice.cuh
@@ -1065,7 +1065,7 @@ size_t data_hash([[maybe_unused]] mdspan<E, X, L, A> s, ::std::index_sequence<i...
* Write the content of the mdspan into a file
*/
template <typename E, typename X, typename L, typename A, size_t... i>
void data_dump(mdspan<E, X, L, A> s,
void data_dump([[maybe_unused]] mdspan<E, X, L, A> s,
::std::ostream& file = ::std::cerr,
::std::index_sequence<i...> = ::std::index_sequence<>())
{
@@ -1181,7 +1181,7 @@ struct std::hash<::cuda::experimental::stf::mdspan<P...>>

if constexpr (_dimensions > 1)
{
for (auto i = 1; i < _dimensions; i++)
for (size_t i = 1; i < _dimensions; i++)
{
cuda::experimental::stf::hash_combine(h, s.stride(i));
}
@@ -36,6 +36,8 @@ class localized_array
{
struct metadata
{
metadata(int dev_, size_t size_, size_t offset_) : alloc_handle{}, dev(dev_), size(size_), offset(offset_) {}

CUmemGenericAllocationHandle alloc_handle;
int dev;
size_t size;
@@ -122,10 +124,7 @@ public:
j++;
}

meta.push_back({.alloc_handle = {},
.dev = grid_pos_to_dev(p),
.size = j * alloc_granularity_bytes,
.offset = i * block_size_bytes});
meta.emplace_back(grid_pos_to_dev(p), j * alloc_granularity_bytes, i * block_size_bytes);

i += j;
}
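Because `metadata` now has a user-provided constructor it is no longer an aggregate, so the designated-initializer `push_back({...})` form has to become an `emplace_back` that forwards the constructor arguments. A simplified sketch of the same pattern (the `alloc_handle` member is omitted and the values are made up):

```cpp
#include <cstddef>
#include <vector>

struct metadata
{
  // A user-provided constructor makes this type a non-aggregate, so
  // designated initializers ({.dev = ..., .size = ...}) no longer apply.
  metadata(int dev_, std::size_t size_, std::size_t offset_)
      : dev(dev_), size(size_), offset(offset_)
  {}

  int dev;
  std::size_t size;
  std::size_t offset;
};

int main()
{
  std::vector<metadata> meta;

  // emplace_back forwards the arguments straight to the constructor,
  // avoiding both a temporary and the aggregate-initialization requirement.
  meta.emplace_back(/* dev */ 0, /* size */ 1 << 20, /* offset */ 0);
  return static_cast<int>(meta.size());
}
```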
@@ -64,7 +64,7 @@ public:
}

template <typename mdspan_shape_t>
CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
{
constexpr size_t dimensions = mdspan_shape_t::rank();

4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh
@@ -202,7 +202,7 @@ public:
cyclic_partition() = default;

template <size_t dimensions>
CUDASTF_HOST_DEVICE static const auto apply(const box<dimensions>& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const box<dimensions>& in, pos4 place_position, dim4 grid_dims)
{
::std::array<size_t, dimensions> begins;
::std::array<size_t, dimensions> ends;
@@ -218,7 +218,7 @@
}

template <typename mdspan_shape_t>
CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
{
constexpr size_t dimensions = mdspan_shape_t::rank();

@@ -217,13 +217,13 @@ public:
cuda_safe_call(cuCtxSetCurrent(saved_ctx));
}

virtual ::std::string to_string() const override
::std::string to_string() const override
{
return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid)
+ ")";
}

virtual stream_pool& get_stream_pool(async_resources_handle&, bool) const
stream_pool& get_stream_pool(async_resources_handle&, bool) const override
{
return *pool;
}
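Dropping the redundant `virtual` and adding `override` lets the compiler verify that `get_stream_pool` really overrides a base-class virtual, instead of silently declaring an unrelated overload if the signatures drift apart. A minimal illustration (the class names below are placeholders, not the STF hierarchy):

```cpp
#include <string>

struct exec_place_base
{
  virtual ~exec_place_base() = default;
  virtual std::string to_string() const { return "base"; }
};

struct green_ctx_place : exec_place_base
{
  // `override` makes the compiler check that this really overrides a virtual
  // function in the base class; repeating `virtual` would be redundant.
  // If the base signature later changed (e.g. dropped const), this line
  // would fail to compile instead of becoming a separate, never-called overload.
  std::string to_string() const override { return "green ctx"; }
};

int main()
{
  green_ctx_place p;
  return p.to_string().empty() ? 1 : 0;
}
```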
@@ -18,6 +18,7 @@
#include <cuda/experimental/__stf/places/exec/host/callback_queues.cuh>
#include <cuda/experimental/__stf/utility/traits.cuh>
#include <stack>
#include <cstdio>

#define STATEFUL_CALLBACKS

37 changes: 28 additions & 9 deletions cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -804,12 +804,28 @@ public:

bool operator==(const exec_place::impl& rhs) const override
{
if (!exec_place::impl::operator==(rhs))
// First, check if rhs is of type exec_place_grid::impl
auto other = dynamic_cast<const impl*>(&rhs);
if (!other)
{
return false;
return false; // rhs is not a grid, so they are not equal
}
auto other = dynamic_cast<const impl*>(&rhs);
return other && dims == other->dims && places == other->places;

// Compare two grids
return *this == *other;
}

// Compare two grids
bool operator==(const impl& rhs) const
{
// First, compare base class properties
if (!exec_place::impl::operator==(rhs))
{
return false;
}

// Compare grid-specific properties
return dims == rhs.dims && places == rhs.places;
}

const ::std::vector<exec_place>& get_places() const
@@ -1009,6 +1025,9 @@ public:
return ::std::static_pointer_cast<impl>(exec_place::get_impl());
}

// Default constructor
exec_place_grid() : exec_place(nullptr) {}

// private:
exec_place_grid(::std::shared_ptr<impl> p)
: exec_place(mv(p))
@@ -1537,7 +1556,7 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
}
else
{
if (l1_size > block_size_limit)
if (int(l1_size) > block_size_limit)
{
fprintf(stderr,
"Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 1)\n",
@@ -1553,7 +1572,7 @@
}

// Enforce the resource limits in the number of threads per block
assert(l1_size <= block_size_limit);
assert(int(l1_size) <= block_size_limit);

assert(l0_size % ndevs == 0);

@@ -1589,7 +1608,7 @@
}
else
{
if (l2_size > block_size_limit)
if (int(l2_size) > block_size_limit)
{
fprintf(stderr,
"Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 2)\n",
Expand All @@ -1610,8 +1629,8 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
}

// Enforce the resource limits in the number of threads per block
assert(l2_size <= block_size_limit);
assert(l0_size <= ndevs);
assert(int(l2_size) <= block_size_limit);
assert(int(l0_size) <= ndevs);

/* Merge blocks and devices */
this->add_level({::std::make_pair(hw_scope::device, l0_size)});
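The equality rework above splits the check in two: the virtual `operator==(const exec_place::impl&)` only establishes via `dynamic_cast` that both sides are grids, and a non-virtual grid-to-grid overload compares the base state plus `dims`/`places`. A hedged sketch of that pattern with generic names (not the STF classes):

```cpp
struct base_impl
{
  virtual ~base_impl() = default;

  int id = 0;

  virtual bool operator==(const base_impl& rhs) const
  {
    return id == rhs.id;
  }
};

struct grid_impl : base_impl
{
  int dims = 1;

  bool operator==(const base_impl& rhs) const override
  {
    // First establish that rhs is also a grid; otherwise the two differ.
    auto other = dynamic_cast<const grid_impl*>(&rhs);
    return other && *this == *other; // delegate to the grid-to-grid overload
  }

  // Grid-to-grid comparison: base state first, then grid-specific state.
  bool operator==(const grid_impl& rhs) const
  {
    return base_impl::operator==(rhs) && dims == rhs.dims;
  }
};

int main()
{
  grid_impl a, b;
  base_impl c;
  return (a == b) && !(a == c) ? 0 : 1;
}
```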
6 changes: 5 additions & 1 deletion cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -111,7 +111,11 @@ public:
// We have currently no way to pass an array of per-place streams
assert(automatic_stream);

const auto& places = e_place.as_grid().get_places();
// Note: we store grid in a variable to avoid dangling references
// because the compiler does not know we are making a reference to
// a vector that remains valid
const auto& grid = e_place.as_grid();
const auto& places = grid.get_places();
for (const exec_place& p : places)
{
stream_grid.push_back(get_stream_from_pool(p));
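This is the workaround for the FIXME recorded in cudaxHeaderTesting.cmake: assuming `as_grid()` returns the grid by value (as the "possibly dangling reference to a temporary" diagnostic suggests), `e_place.as_grid().get_places()` binds `places` to a vector inside a temporary that is destroyed at the end of the statement; naming the grid first keeps it alive for the whole scope. A toy reproduction (the `grid`/`as_grid` names below are stand-ins):

```cpp
#include <cstdio>
#include <vector>

struct grid
{
  std::vector<int> places{1, 2, 3};
  const std::vector<int>& get_places() const { return places; }
};

grid as_grid() { return grid{}; } // returns by value, like the FIXME case

int main()
{
  // BAD: the temporary grid dies at the end of this statement, so the
  // reference would point into a destroyed object.
  // const auto& dangling = as_grid().get_places();

  // GOOD: bind the temporary itself to a reference first; lifetime extension
  // keeps `g` alive for the whole scope, so `places` stays valid.
  const auto& g = as_grid();
  const auto& places = g.get_places();

  std::printf("%zu places\n", places.size());
  return 0;
}
```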
1 change: 1 addition & 0 deletions cudax/include/cuda/experimental/__stf/utility/traits.cuh
@@ -138,6 +138,7 @@ class print_type_name_and_fail
template <class T>
class meyers_singleton
{
protected:
template <class U>
struct wrapper
{
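The added `protected:` presumably lets classes deriving from `meyers_singleton` reach the nested `wrapper`. For reference, the Meyers-singleton idiom itself is just a function-local static whose initialization is thread-safe since C++11; a minimal sketch (not the CCCL implementation):

```cpp
class config
{
public:
  // Meyers singleton: the local static is constructed on first use, and its
  // initialization is guaranteed thread-safe since C++11; there are no
  // global-initialization-order issues.
  static config& instance()
  {
    static config single;
    return single;
  }

  int verbosity = 0;

private:
  config() = default;
  config(const config&) = delete;
  config& operator=(const config&) = delete;
};

int main()
{
  config::instance().verbosity = 2;
  return config::instance().verbosity; // 2: the same object both times
}
```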
2 changes: 1 addition & 1 deletion cudax/test/stf/cpp/redundant_data.cu
@@ -75,7 +75,7 @@ int main()
{
double X[n], Y[n];

for (int ind = 0; ind < n; ind++)
for (size_t ind = 0; ind < n; ind++)
{
X[ind] = 1.0 * ind;
Y[ind] = 2.0 * ind - 3.0;