From 6c69a794c0786e8833fe41d23e01635fc1f13ee1 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 15:21:26 -0400 Subject: [PATCH 01/25] Move copy version increment to CPU hook The new copy versioning in PaRSEC requires this only for CPU tasks to make sure the results are handled correctly. Signed-off-by: Joseph Schuchart --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchPARSEC.cmake | 2 +- ttg/ttg/parsec/ttg.h | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index fd96e5816..2e79bdb88 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) -set(TTG_TRACKED_PARSEC_TAG 0b3140f58ad9dc78a3d64da9fd73ecc7f443ece7) +set(TTG_TRACKED_PARSEC_TAG fc18d7881059a520e73cb7ccb6b56e6da705635e) set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 7b164019f..2a663f914 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/devreal/parsec-1.git + GIT_REPOSITORY https://github.com/abouteiller/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index f4653f962..048b15a0d 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -740,6 +740,7 @@ namespace ttg_parsec { inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) { if constexpr(!std::is_const_v>) { copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device); + copy->inc_current_version(); } } @@ -4266,8 +4267,6 @@ struct ttg::detail::value_copy_handler { /* this copy won't be modified anymore so mark it as read-only */ copy->reset_readers(); } - /* the value was potentially changed, so increment version */ - copy->inc_current_version(); } /* We're coming from a writer so mark the data as modified. * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC). */ From de6d8c1084a2bcffc3a1059ad857ff21cec9226b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 18:49:44 -0400 Subject: [PATCH 02/25] Initialize parsec_task data[].data_in to NULL Not sure if this really needed. Would prefer to remove this again. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 5b23d53af..fa415375f 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -150,6 +150,9 @@ namespace ttg_parsec { parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; parsec_task.priority = 0; + + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, @@ -168,6 +171,9 @@ namespace ttg_parsec { parsec_task.taskpool = taskpool; parsec_task.priority = priority; parsec_task.chore_mask = 1<<0; + + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } public: From 6492573d36ce85d7bb12c253cd81d4432d50dba7 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 18:50:17 -0400 Subject: [PATCH 03/25] Remove use of parsec_get_best_device This is now handled by parsec directly. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 048b15a0d..5d0992053 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1454,7 +1454,6 @@ namespace ttg_parsec { static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { static_assert(derived_has_device_op()); - int dev_index; double ratio = 1.0; task_t *task = (task_t*)parsec_task; @@ -1469,7 +1468,7 @@ namespace ttg_parsec { PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); gpu_task->ec = parsec_task; gpu_task->task_type = 0; // user task - gpu_task->load = 1; // TODO: can we do better? + //gpu_task->load = 1; // TODO: can we do better? gpu_task->last_data_check_epoch = -1; // used internally gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; @@ -1513,20 +1512,11 @@ namespace ttg_parsec { /* TODO: is this the right place to set the mask? */ task->parsec_task.chore_mask = PARSEC_DEV_ALL; - /* get a device and come back if we need another one */ - int64_t task_load = 1; - dev_index = parsec_get_best_device(parsec_task, &task_load); /* swap back the original task class */ task->parsec_task.task_class = tmp; - gpu_task->load = task_load; - assert(dev_index >= 0); - if (!parsec_mca_device_is_gpu(dev_index)) { - return PARSEC_HOOK_RETURN_NEXT; /* Fall back */ - } - - parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)parsec_mca_device_get(dev_index); + parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device; assert(NULL != device); task->dev_ptr->device = device; @@ -3745,7 +3735,7 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; - } else if (derived_has_hip_op()) { + } else if constexpr (derived_has_hip_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; @@ -3754,7 +3744,8 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; - } else if (derived_has_level_zero_op()) { +#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) + } else if constexpr (derived_has_level_zero_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; @@ -3763,6 +3754,7 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; +#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT } else { self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CPU; From 3be65cf2f973abf333d0f5dce75b88a51a6dffbd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 18:57:13 -0400 Subject: [PATCH 04/25] Use the evaluate hook to query input data before calling the compute hook PaRSEC needs to know what input data we have and needs to be available in the taskclass, so put a copy of the task-class into the task object. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 10 +++ ttg/ttg/parsec/ttg.h | 144 ++++++++++++++++++++++++++++-------------- 2 files changed, 105 insertions(+), 49 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index fa415375f..d60c87f3c 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -15,6 +15,7 @@ namespace ttg_parsec { parsec_flow_t* flows = nullptr; parsec_gpu_exec_stream_t* stream = nullptr; parsec_device_gpu_module_t* device = nullptr; + parsec_task_class_t task_class; // copy of the taskclass }; template @@ -240,6 +241,15 @@ namespace ttg_parsec { } } + template + parsec_hook_return_t invoke_evaluate() { + if constexpr (Space == ttg::ExecutionSpace::Host) { + return PARSEC_HOOK_RETURN_DONE; + } else { + return TT::template device_static_evaluate(&this->parsec_task); + } + } + parsec_key_t pkey() { return reinterpret_cast(&key); } }; diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 5d0992053..6f46e1c75 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -766,7 +766,7 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC CUDA hook invoked on a TT that does not support CUDA operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } @@ -776,7 +776,7 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } @@ -786,10 +786,42 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } + + template + inline parsec_hook_return_t evaluate_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_cuda_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template + inline parsec_hook_return_t evaluate_hip(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_hip_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template + inline parsec_hook_return_t evaluate_level_zero(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_level_zero_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template class rma_delayed_activate { std::vector _keylist; @@ -1451,35 +1483,69 @@ namespace ttg_parsec { } template - static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { - static_assert(derived_has_device_op()); - - double ratio = 1.0; + static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) { task_t *task = (task_t*)parsec_task; - parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); + if (task->dev_ptr->gpu_task == nullptr) { + + //std::cout << "device_static_op: task " << parsec_task << std::endl; + + /* set up a device task */ + parsec_gpu_task_t *gpu_task; + /* PaRSEC wants to free the gpu_task, because F***K ownerships */ + gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); + PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); + gpu_task->ec = parsec_task; + gpu_task->task_type = 0; // user task + //gpu_task->load = 1; // TODO: can we do better? + gpu_task->last_data_check_epoch = -1; // used internally + gpu_task->pushout = 0; + gpu_task->submit = &TT::device_static_submit; - //std::cout << "device_static_op: task " << parsec_task << std::endl; + /* set the gpu_task so it's available in register_device_memory */ + task->dev_ptr->gpu_task = gpu_task; - /* set up a device task */ - parsec_gpu_task_t *gpu_task; - /* PaRSEC wants to free the gpu_task, because F***K ownerships */ - gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->ec = parsec_task; - gpu_task->task_type = 0; // user task - //gpu_task->load = 1; // TODO: can we do better? - gpu_task->last_data_check_epoch = -1; // used internally - gpu_task->pushout = 0; - gpu_task->submit = &TT::device_static_submit; + /* TODO: is this the right place to set the mask? */ + task->parsec_task.chore_mask = PARSEC_DEV_ALL; - /* set the gpu_task so it's available in register_device_memory */ - task->dev_ptr->gpu_task = gpu_task; + /* copy over the task class, because that's what we need */ + task->dev_ptr->task_class = *task->parsec_task.task_class; - // first invocation of the coroutine to get the coroutine handle - static_op(parsec_task); + // first invocation of the coroutine to get the coroutine handle + static_op(parsec_task); - /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + + parsec_task_class_t& tc = *task->dev_ptr->task_class; + + // input flows are set up during register_device_memory as part of the first invocation above + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + tc.in[i] = gpu_task->flow[i]; + tc.out[i] = gpu_task->flow[i]; + } + tc.nb_flows = MAX_PARAM_COUNT; + + /* set the new task class that contains the flows */ + task->parsec_task.task_class = task->dev_ptr->task_class; + + /* select this one */ + return PARSEC_HOOK_RETURN_DONE; + } + + std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl; + + /* not sure if this might happen*/ + return PARSEC_HOOK_RETURN_ERROR; + + } + + template + static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { + static_assert(derived_has_device_op()); + + /* when we come in here we have a device assigned and are ready to go */ + + task_t *task = (task_t*)parsec_task; if (nullptr == task->suspended_task_address) { /* short-cut in case the task returned immediately */ @@ -1495,31 +1561,11 @@ namespace ttg_parsec { /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */ assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER); - /* set up a temporary task-class to correctly specify the flows */ - parsec_task_class_t tc = *task->parsec_task.task_class; - - tc.name = task->parsec_task.task_class->name; - // input flows are set up during register_device_memory as part of the first invocation above - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - tc.in[i] = gpu_task->flow[i]; - tc.out[i] = gpu_task->flow[i]; - } - tc.nb_flows = MAX_PARAM_COUNT; - - /* swap in the new task class */ - const parsec_task_class_t* tmp = task->parsec_task.task_class; - *const_cast(&task->parsec_task.task_class) = &tc; - - /* TODO: is this the right place to set the mask? */ - task->parsec_task.chore_mask = PARSEC_DEV_ALL; - - /* swap back the original task class */ - task->parsec_task.task_class = tmp; - parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device; assert(NULL != device); task->dev_ptr->device = device; + parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); switch(device->super.type) { @@ -3730,7 +3776,7 @@ ttg::abort(); // should not happen if constexpr (derived_has_cuda_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_cuda; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_cuda; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; @@ -3738,7 +3784,7 @@ ttg::abort(); // should not happen } else if constexpr (derived_has_hip_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_hip; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_hip; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; @@ -3748,7 +3794,7 @@ ttg::abort(); // should not happen } else if constexpr (derived_has_level_zero_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_level_zero; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_level_zero; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; From e42e3bdb4a9f062162ebc003807807c9587689ed Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 19:59:46 -0400 Subject: [PATCH 05/25] Properly construct the parsec task Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index d60c87f3c..7b63c1d4b 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -147,13 +147,14 @@ namespace ttg_parsec { : data_count(data_count) , copies(copies) , defer_writer(defer_writer) { + PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t); PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; parsec_task.priority = 0; - // TODO: can we avoid this? - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, @@ -173,8 +174,8 @@ namespace ttg_parsec { parsec_task.priority = priority; parsec_task.chore_mask = 1<<0; - // TODO: can we avoid this? - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } public: From 0216894366c8df64998f6dd59bc7960dbaab9cf4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 20:00:19 -0400 Subject: [PATCH 06/25] Use correct evaluate function signatures Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 6f46e1c75..991e29664 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -792,7 +792,7 @@ namespace ttg_parsec { template - inline parsec_hook_return_t evaluate_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_cuda(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_cuda_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -802,7 +802,7 @@ namespace ttg_parsec { } template - inline parsec_hook_return_t evaluate_hip(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_hip(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_hip_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -812,7 +812,7 @@ namespace ttg_parsec { } template - inline parsec_hook_return_t evaluate_level_zero(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_level_zero(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_level_zero_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -1516,7 +1516,7 @@ namespace ttg_parsec { /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ - parsec_task_class_t& tc = *task->dev_ptr->task_class; + parsec_task_class_t& tc = task->dev_ptr->task_class; // input flows are set up during register_device_memory as part of the first invocation above for (int i = 0; i < MAX_PARAM_COUNT; ++i) { @@ -1526,7 +1526,7 @@ namespace ttg_parsec { tc.nb_flows = MAX_PARAM_COUNT; /* set the new task class that contains the flows */ - task->parsec_task.task_class = task->dev_ptr->task_class; + task->parsec_task.task_class = &task->dev_ptr->task_class; /* select this one */ return PARSEC_HOOK_RETURN_DONE; @@ -1565,6 +1565,7 @@ namespace ttg_parsec { assert(NULL != device); task->dev_ptr->device = device; + parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task; parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); switch(device->super.type) { From 2e5741966d3464f42eb3d593cf30d562babcbe6c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 11 Apr 2024 23:28:52 -0400 Subject: [PATCH 07/25] Properly construct parsec task object Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 7b63c1d4b..1d17a9ad7 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -166,6 +166,7 @@ namespace ttg_parsec { , copies(copies) , release_task_cb(release_fn) , defer_writer(defer_writer) { + PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t); PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; From edf48fc1c12a9f944273f9e0dd57c2a62c084251 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:13:04 -0400 Subject: [PATCH 08/25] Fix dataflags binary operators Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 1d17a9ad7..5109dbcaa 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -58,9 +58,10 @@ namespace ttg_parsec { } inline - ttg_parsec_data_flags operator|=(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) { + ttg_parsec_data_flags operator|=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) { using flags_type = std::underlying_type::type; - return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + lhs = ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + return lhs; } inline @@ -69,6 +70,13 @@ namespace ttg_parsec { return static_cast(lhs) & static_cast(rhs); } + inline + ttg_parsec_data_flags operator&=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) { + using flags_type = std::underlying_type::type; + lhs = ttg_parsec_data_flags(static_cast(lhs) & static_cast(rhs)); + return lhs; + } + inline bool operator!(ttg_parsec_data_flags lhs) { using flags_type = std::underlying_type::type; From 0e6235276c9eb358e96f342bd3536e9f02a914a6 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:13:20 -0400 Subject: [PATCH 09/25] Fix logic for detecting device-aware MPI Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 991e29664..3069922b5 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3390,11 +3390,11 @@ ttg::abort(); // should not happen if (!need_pushout) { bool device_supported = false; if constexpr (derived_has_cuda_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::CUDA); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::CUDA); } else if constexpr (derived_has_hip_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::HIP); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::HIP); } else if constexpr (derived_has_level_zero_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::L0); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::L0); } /* if MPI supports the device we don't care whether we have remote peers * because we can send from the device directly */ From 38e7efb07363b0d57169de22f1b02cff89fc3d2a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:14:04 -0400 Subject: [PATCH 10/25] Add override keyword to fence and make_executable The hip compiler complains about the override keyword missing. Signed-off-by: Joseph Schuchart --- ttg/ttg/tt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 7024776aa..ec7eff6d8 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -96,9 +96,9 @@ namespace ttg { ttg::World get_world() const override final { return tts[0]->get_world(); } - void fence() { tts[0]->fence(); } + void fence() override { tts[0]->fence(); } - void make_executable() { + void make_executable() override { for (auto &op : tts) op->make_executable(); } From 5dc442d773213dff6c7d5dab5d13f785f9ecdf74 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:15:47 -0400 Subject: [PATCH 11/25] POTRF: use correct keymap for SYRK The second element of the key contains the tile ID, the first one contains the iteration. Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index f6ba6e147..1ecb2c23f 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -674,7 +674,7 @@ namespace potrf { auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); }; auto keymap2a = [&](const Key2& key) { return A.rank_of(key[0], key[1]); }; - auto keymap2b = [&](const Key2& key) { return A.rank_of(key[0], key[0]); }; + auto keymap2b = [&](const Key2& key) { return A.rank_of(key[1], key[1]); }; auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); }; From ec2b796e3e86d9c35c7cfa8ea5b4a6d6dd4aaab1 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:18:07 -0400 Subject: [PATCH 12/25] Remove unused variable Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 3069922b5..740c8fd07 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3417,7 +3417,6 @@ ttg::abort(); // should not happen auto remote_check = [&](){ auto world = ttg_default_execution_context(); int rank = world.rank(); - uint64_t pos = 0; bool remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; }); return remote; From b7c5088f9e251eb9ede67a3e072658798834b8b3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 May 2024 17:57:37 -0400 Subject: [PATCH 13/25] Add device hint to TT and buffer For POTRF, we want to provide a hint that tasks on the same column should be executed on the same device, to reduce data movement and provide a hint on load balancing up front. Signed-off-by: Joseph Schuchart --- examples/potrf/pmw.h | 10 ++++++- examples/potrf/potrf.h | 24 ++++++++++++++++ ttg/ttg/device/device.h | 7 +++++ ttg/ttg/madness/device.h | 9 ++++++ ttg/ttg/madness/ttg.h | 1 + ttg/ttg/parsec/buffer.h | 8 ++++++ ttg/ttg/parsec/device.h | 7 +++++ ttg/ttg/parsec/fwd.h | 2 ++ ttg/ttg/parsec/ttg.h | 61 ++++++++++++++++++++++++++++++++++++++++ ttg/ttg/util/meta.h | 14 ++++----- 10 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 ttg/ttg/madness/device.h diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h index 0f8d75d7d..25f01933c 100644 --- a/examples/potrf/pmw.h +++ b/examples/potrf/pmw.h @@ -102,6 +102,14 @@ class PaRSECMatrixWrapper { (pm->uplo == PARSEC_MATRIX_UPPER && col >= row); } + int P() const { + return pm->grid.rows; + } + + int Q() const { + return pm->grid.cols; + } + PaRSECMatrixT* parsec() { return pm; } @@ -132,7 +140,7 @@ class PaRSECMatrixWrapper { }; template -using MatrixT = PaRSECMatrixWrapper; +using MatrixT = PaRSECMatrixWrapper; static auto make_load_tt(MatrixT &A, ttg::Edge> &toop, bool defer_write) { diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 1ecb2c23f..4adbff106 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -678,6 +678,18 @@ namespace potrf { auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); }; + /** + * Device map hints: we try to keep tiles on one row on the same device to minimize + * data movement between devices. This provides hints for load-balancing up front + * and avoids movement of the TRSM result to GEMM tasks. + */ + auto devmap1 = [&](const key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + + auto devmap2a = [&](const key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap2b = [&](const key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); } + + auto devmap3 = [&](const key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf"); ttg::Edge> potrf_trsm("potrf_trsm"), trsm_syrk("trsm_syrk"), gemm_trsm("gemm_trsm"), @@ -692,18 +704,30 @@ namespace potrf { auto tt_potrf = make_potrf(A, disp_potrf, syrk_potrf, potrf_trsm, output); tt_potrf->set_keymap(keymap1); tt_potrf->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_potrf->set_devmap(devmap1); +#endif // 0 auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output); tt_trsm->set_keymap(keymap2a); tt_trsm->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_trsm->set_devmap(devmap2a); +#endif // 0 auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk); tt_syrk->set_keymap(keymap2b); tt_syrk->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_syrk->set_devmap(devmap2b); +#endif // 0 auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm); tt_gemm->set_keymap(keymap3); tt_gemm->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_gemm->set_devmap(devmap3); +#endif // 0 /* Priorities taken from DPLASMA */ auto nt = A.cols(); diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h index 6690982f6..e815aaf87 100644 --- a/ttg/ttg/device/device.h +++ b/ttg/ttg/device/device.h @@ -2,6 +2,7 @@ #include "ttg/config.h" #include "ttg/execution.h" +#include "ttg/impl_selector.h" @@ -180,3 +181,9 @@ namespace ttg::device { } } // namespace ttg #endif // defined(TTG_HAVE_HIP) + +namespace ttg::device { + inline int num_devices() { + return TTG_IMPL_NS::num_devices(); + } +} diff --git a/ttg/ttg/madness/device.h b/ttg/ttg/madness/device.h new file mode 100644 index 000000000..e13321194 --- /dev/null +++ b/ttg/ttg/madness/device.h @@ -0,0 +1,9 @@ +#ifndef TTG_MADNESS_DEVICE_H +#define TTG_MADNESS_DEVICE_H + +namespace ttg_madness { + /* no device support in MADNESS */ + inline int num_devices() { return 0; } +} + +#endif // TTG_MADNESS_DEVICE_H \ No newline at end of file diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 5fab6f6dd..34c576f56 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -13,6 +13,7 @@ #include "ttg/base/keymap.h" #include "ttg/base/tt.h" #include "ttg/func.h" +#include "ttg/madness/device.h" #include "ttg/runtimes.h" #include "ttg/tt.h" #include "ttg/util/bug.h" diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 98b14eb12..6e609a158 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -329,6 +329,14 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t // << " parsec_data " << m_data.get() << std::endl; } + void prefer_device(ttg::device::Device dev) { + /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */ + if (dev.is_device() && this->parsec_data()->owner_device == 0) { + parsec_advise_data_on_device(this->parsec_data(), detail::ttg_device_to_parsec_device(dev), + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + } + } + /* serialization support */ #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h index 77722b1c1..9f8ada05c 100644 --- a/ttg/ttg/parsec/device.h +++ b/ttg/ttg/parsec/device.h @@ -2,6 +2,7 @@ #define TTG_PARSEC_DEVICE_H #include "ttg/device/device.h" +#include namespace ttg_parsec { @@ -35,6 +36,12 @@ namespace ttg_parsec { } } // namespace detail + + inline + int num_devices() { + return parsec_nb_devices - detail::first_device_id; + } + } // namespace ttg_parsec #endif // TTG_PARSEC_DEVICE_H \ No newline at end of file diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index d5bc8931e..0cd798e87 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -82,6 +82,8 @@ namespace ttg_parsec { template inline void mark_device_out(std::tuple &b); + inline int num_devices(); + #if 0 template inline std::pair>...>> get_ptr(Args&&... args); diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 740c8fd07..1c795677c 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1296,6 +1296,7 @@ namespace ttg_parsec { ttg::World world; ttg::meta::detail::keymap_t keymap; ttg::meta::detail::keymap_t priomap; + ttg::meta::detail::keymap_t devicemap; // For now use same type for unary/streaming input terminals, and stream reducers assigned at runtime ttg::meta::detail::input_reducers_t input_reducers; //!< Reducers for the input terminals (empty = expect single value) @@ -1502,6 +1503,12 @@ namespace ttg_parsec { gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; + // one way to force the task device + // currently this will probably break all of PaRSEC if this hint + // does not match where the data is located, not really useful for us + // instead we set a hint on the data if there is no hint set yet + //parsec_task->selected_device = ...; + /* set the gpu_task so it's available in register_device_memory */ task->dev_ptr->gpu_task = gpu_task; @@ -1525,6 +1532,29 @@ namespace ttg_parsec { } tc.nb_flows = MAX_PARAM_COUNT; + /* set the device hint on the data */ + TT *tt = task->tt; + if (tt->devicemap) { + int parsec_dev; + if constexpr (std::is_void_v) { + parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap()); + } else { + parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap(tt->key)); + } + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + /* only set on mutable data since we have exclusive access */ + if (tc.in[i].flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + parsec_data_t *data = parsec_task->data[i].data_in->original; + /* only set the preferred device if the host has the latest copy + * as otherwise we may end up with the wrong data if there is a newer + * version on a different device. Also, keep fingers crossed. */ + if (data->owner_device == 0) { + parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + } + } + } + } + /* set the new task class that contains the flows */ task->parsec_task.task_class = &task->dev_ptr->task_class; @@ -4195,6 +4225,37 @@ ttg::abort(); // should not happen priomap = std::forward(pm); } + /// device map setter + /// The device map provides a hint on which device a task should execute. + /// TTG may not be able to honor the request and the corresponding task + /// may execute on a different device. + /// @arg pm a function that provides a hint on which device the task should execute. + template + void set_devicemap(Devicemap&& dm) { + static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!"); + if constexpr (std::is_same_v()))>) { + // dm returns a Device + devicemap = std::forward(dm); + } else { + // convert dm return into a Device + devicemap = [=](const keyT& key) { + if constexpr (derived_has_cuda_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::CUDA); + } else if constexpr (derived_has_hip_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::HIP); + } else if constexpr (derived_has_level_zero_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::L0); + } else { + throw std::runtime_error("Unknown device type!"); + } + }; + } + } + + /// device map accessor + /// @return the device map + auto get_devicemap() { return devicemap; } + // Register the static_op function to associate it to instance_id void register_static_op_function(void) { int rank; diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index b7bb31690..5322fa577 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -848,18 +848,18 @@ namespace ttg { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // keymap_t = std::function, protected against void key //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template + template struct keymap; - template + template struct keymap>> { - using type = std::function; + using type = std::function; }; - template + template struct keymap>> { - using type = std::function; + using type = std::function; }; - template - using keymap_t = typename keymap::type; + template + using keymap_t = typename keymap::type; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // input_reducers_t = std::tuple< From 27f3fdebc7d97fe43422d34ec89756792e2a729d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 May 2024 19:19:58 -0400 Subject: [PATCH 14/25] Fixes to device hint implementation Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 16 ++++++++-------- ttg/ttg/parsec/ttg.h | 6 +++--- ttg/ttg/util/meta.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 4adbff106..60daef72f 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -683,12 +683,12 @@ namespace potrf { * data movement between devices. This provides hints for load-balancing up front * and avoids movement of the TRSM result to GEMM tasks. */ - auto devmap1 = [&](const key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap1 = [&](const Key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; - auto devmap2a = [&](const key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } - auto devmap2b = [&](const key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); } + auto devmap2a = [&](const Key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; + auto devmap2b = [&](const Key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); }; - auto devmap3 = [&](const key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap3 = [&](const Key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf"); @@ -705,28 +705,28 @@ namespace potrf { tt_potrf->set_keymap(keymap1); tt_potrf->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_potrf->set_devmap(devmap1); + tt_potrf->set_devicemap(devmap1); #endif // 0 auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output); tt_trsm->set_keymap(keymap2a); tt_trsm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_trsm->set_devmap(devmap2a); + tt_trsm->set_devicemap(devmap2a); #endif // 0 auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk); tt_syrk->set_keymap(keymap2b); tt_syrk->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_syrk->set_devmap(devmap2b); + tt_syrk->set_devicemap(devmap2b); #endif // 0 auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm); tt_gemm->set_keymap(keymap3); tt_gemm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_gemm->set_devmap(devmap3); + tt_gemm->set_devicemap(devmap3); #endif // 0 /* Priorities taken from DPLASMA */ diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1c795677c..b2a89f11b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1537,13 +1537,13 @@ namespace ttg_parsec { if (tt->devicemap) { int parsec_dev; if constexpr (std::is_void_v) { - parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap()); + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); } else { - parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap(tt->key)); + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); } for (int i = 0; i < MAX_PARAM_COUNT; ++i) { /* only set on mutable data since we have exclusive access */ - if (tc.in[i].flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { parsec_data_t *data = parsec_task->data[i].data_in->original; /* only set the preferred device if the host has the latest copy * as otherwise we may end up with the wrong data if there is a newer diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index 5322fa577..f3af03152 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -851,11 +851,11 @@ namespace ttg { template struct keymap; template - struct keymap>> { + struct keymap>> { using type = std::function; }; template - struct keymap>> { + struct keymap>> { using type = std::function; }; template From ee6fb2a09259251bce9e3bfc55f85b36fd0e8ef8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 May 2024 15:14:15 -0400 Subject: [PATCH 15/25] Point back to PaRSEC main repository Signed-off-by: Joseph Schuchart --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchPARSEC.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 2e79bdb88..43042cacd 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) -set(TTG_TRACKED_PARSEC_TAG fc18d7881059a520e73cb7ccb6b56e6da705635e) +set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 2a663f914..b3fd5faa3 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/abouteiller/parsec.git + GIT_REPOSITORY https://github.com/ICLDisco/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) From f3e8b14147610ec5c43b7954164b3f5c54b4c8d9 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 May 2024 15:43:05 -0400 Subject: [PATCH 16/25] Add missing fwd decl in madness backend Signed-off-by: Joseph Schuchart --- ttg/ttg/madness/fwd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h index 0d340db0b..6bee1a832 100644 --- a/ttg/ttg/madness/fwd.h +++ b/ttg/ttg/madness/fwd.h @@ -77,6 +77,8 @@ namespace ttg_madness { template inline void mark_device_out(std::tuple &b); + inline int num_devices(); + } // namespace ttg_madness #endif // TTG_MADNESS_FWD_H From 2384e4d3dab2d636ccdcfd15c59ce2784ff662a2 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 23 May 2024 10:29:16 -0400 Subject: [PATCH 17/25] Cleanup return values in parsec backend We should not return PARSEC_HOOK_RETURN_NEXT in hooks, only in the evaluate callback. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index b2a89f11b..71a943406 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -766,7 +766,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "CUDA hook called without having a CUDA op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -776,7 +777,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "HIP hook called without having a HIP op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -786,7 +788,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "L0 hook called without having a L0 op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -1489,8 +1492,6 @@ namespace ttg_parsec { task_t *task = (task_t*)parsec_task; if (task->dev_ptr->gpu_task == nullptr) { - //std::cout << "device_static_op: task " << parsec_task << std::endl; - /* set up a device task */ parsec_gpu_task_t *gpu_task; /* PaRSEC wants to free the gpu_task, because F***K ownerships */ @@ -1498,7 +1499,6 @@ namespace ttg_parsec { PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); gpu_task->ec = parsec_task; gpu_task->task_type = 0; // user task - //gpu_task->load = 1; // TODO: can we do better? gpu_task->last_data_check_epoch = -1; // used internally gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; From 4b50c68dd4484127be8c9ec5e57505051b1907c3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 24 May 2024 14:00:40 -0400 Subject: [PATCH 18/25] Install madness backend device.h header Signed-off-by: Joseph Schuchart --- ttg/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 900df6519..1918f40fe 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -208,6 +208,7 @@ endif(TARGET Boost::serialization) if (TARGET MADworld) set(ttg-mad-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/device.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h From 72aa980bf795b15bbdc3fb168a83d3d56e1376a6 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 11:03:57 -0400 Subject: [PATCH 19/25] Add missing fwd.h header to device.h Signed-off-by: Joseph Schuchart --- ttg/ttg/device/device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h index e815aaf87..244e9c944 100644 --- a/ttg/ttg/device/device.h +++ b/ttg/ttg/device/device.h @@ -3,6 +3,7 @@ #include "ttg/config.h" #include "ttg/execution.h" #include "ttg/impl_selector.h" +#include "ttg/fwd.h" From 755ab26f6512defa427ead70a890fa34db76e2bc Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 11:39:25 -0400 Subject: [PATCH 20/25] Check for broken GCC versions and disable Coroutine support if found Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28490557f..26433deb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,8 +94,23 @@ endif (BUILD_TESTING) ########################### # Boost include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") -# C++ coroutines -find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + +# C++ coroutines, check for broken GCC releases and skip if one is found +set(SKIP_COROUTINE_DETECTION FALSE) +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) + set(SKIP_COROUTINE_DETECTION TRUE) + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) + set(SKIP_COROUTINE_DETECTION TRUE) + endif() + if (SKIP_COROUTINE_DETECTION) + message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") + endif(SKIP_COROUTINE_DETECTION) +endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + +if (SKIP_COROUTINE_DETECTION) + find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) +endif(SKIP_COROUTINE_DETECTION) ########################## From 8947d84200282a62a36df6acfe7bbfa432491e3a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 12:09:29 -0400 Subject: [PATCH 21/25] Protect unit tests for cases where coroutines are not found Signed-off-by: Joseph Schuchart --- tests/unit/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 275f3fdd8..7f25dd6fe 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -12,12 +12,14 @@ set(ut_libs Catch2::Catch2) # coroutine tests # we definitely have TARGET std::coroutine -list(APPEND ut_src fibonacci-coro.cc) -list(APPEND ut_src device_coro.cc) -if (TTG_HAVE_CUDA) - list(APPEND ut_src cuda_kernel.cu) -endif(TTG_HAVE_CUDA) -list(APPEND ut_libs std::coroutine) +if (CXXStdCoroutine_FOUND) + list(APPEND ut_src fibonacci-coro.cc) + list(APPEND ut_src device_coro.cc) + if (TTG_HAVE_CUDA) + list(APPEND ut_src cuda_kernel.cu) + endif(TTG_HAVE_CUDA) + list(APPEND ut_libs std::coroutine) +endif(CXXStdCoroutine_FOUND) add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" ) From 40595c1314cacc6f4c179adbb8f8eb464fc1b95e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 12:32:22 -0400 Subject: [PATCH 22/25] Fix macro TTG_PROCESS_TT_OP_RETURN if coros are not available Signed-off-by: Joseph Schuchart --- ttg/ttg/tt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index ec7eff6d8..637ae42b5 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -196,7 +196,7 @@ namespace ttg { } \ } #else -#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke +#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke #endif #else #error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards" From 1421c4f35e65169d775a685e8e2f27e45f35639e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 09:28:43 -0400 Subject: [PATCH 23/25] Only use suspended_task_address if we have coroutines Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 71a943406..1aaa21bdf 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3695,6 +3695,7 @@ ttg::abort(); // should not happen task_t *task = (task_t*)parsec_task; +#ifdef TTG_HAS_COROUTINE /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */ if (task->suspended_task_address) { assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); @@ -3725,6 +3726,7 @@ ttg::abort(); // should not happen /* the coroutine should have completed and we cannot access the promise anymore */ task->suspended_task_address = nullptr; } +#endif // TTG_HAS_COROUTINE /* release our data copies */ for (int i = 0; i < task->data_count; i++) { From df32b1ed0e93c8a0762e832183c3abd3d91e5955 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 10:24:23 -0400 Subject: [PATCH 24/25] Replace TTG_HAS_COROUTINE with TTG_HAVE_COROUTINE and add to config.h Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 3 ++- ttg/CMakeLists.txt | 1 - ttg/ttg/config.in.h | 3 +++ ttg/ttg/coroutine.h | 4 ++++ ttg/ttg/device/task.h | 6 +++++- ttg/ttg/madness/ttg.h | 22 ++++++++++------------ ttg/ttg/make_tt.h | 8 ++++---- ttg/ttg/parsec/task.h | 4 ++-- ttg/ttg/parsec/ttg.h | 32 +++++++++++++++++--------------- ttg/ttg/tt.h | 5 +++-- 10 files changed, 50 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26433deb5..519b13938 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,8 +108,9 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") endif(SKIP_COROUTINE_DETECTION) endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") -if (SKIP_COROUTINE_DETECTION) +if (NOT SKIP_COROUTINE_DETECTION) find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") endif(SKIP_COROUTINE_DETECTION) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 1918f40fe..b1fa72947 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -114,7 +114,6 @@ if (TTG_ENABLE_TRACE) endif (TTG_ENABLE_TRACE) if (TARGET std::coroutine) list(APPEND ttg-deps std::coroutine) - list(APPEND ttg-defs "TTG_HAS_COROUTINE=1") list(APPEND ttg-util-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/coroutine.h ) diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h index 51e58b4a2..106f0c58a 100644 --- a/ttg/ttg/config.in.h +++ b/ttg/ttg/config.in.h @@ -11,6 +11,9 @@ /** the C++ namespace containing the coroutine API */ #define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@ +/** whether the compiler supports C++ coroutines */ +#cmakedefine TTG_HAVE_COROUTINE + /** whether TTG has CUDA language support */ #cmakedefine TTG_HAVE_CUDA diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h index 81d5b1657..890a98b4f 100644 --- a/ttg/ttg/coroutine.h +++ b/ttg/ttg/coroutine.h @@ -11,6 +11,8 @@ #include #include +#ifdef TTG_HAVE_COROUTINE + namespace ttg { // import std coroutine API into ttg namespace @@ -227,4 +229,6 @@ namespace ttg { } // namespace ttg +#endif // TTG_HAVE_COROUTINE + #endif // TTG_COROUTINE_H diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index d95e0d1eb..8e2d14cfc 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -9,6 +9,8 @@ #include "ttg/impl_selector.h" #include "ttg/ptr.h" +#ifdef TTG_HAVE_COROUTINE + namespace ttg::device { namespace detail { @@ -632,6 +634,8 @@ namespace ttg::device { bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; } #endif // 0 -} // namespace ttg::devie +} // namespace ttg::device + +#endif // TTG_HAVE_COROUTINE #endif // TTG_DEVICE_TASK_H diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 34c576f56..5d2360cfb 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -24,9 +24,7 @@ #include "ttg/util/meta/callable.h" #include "ttg/util/void.h" #include "ttg/world.h" -#ifdef TTG_HAS_COROUTINE #include "ttg/coroutine.h" -#endif #include #include @@ -303,10 +301,10 @@ namespace ttg_madness { derivedT *derived; // Pointer to derived class instance bool pull_terminals_invoked = false; std::conditional_t, ttg::Void, keyT> key; // Task key -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void *suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; -#endif +#endif // TTG_HAVE_COROUTINE /// makes a tuple of references out of tuple of template @@ -336,11 +334,11 @@ namespace ttg_madness { ttT::threaddata.call_depth++; void *suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE this->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function // ttg::print("starting task"); if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { @@ -362,7 +360,7 @@ namespace ttg_madness { } else // unreachable ttg::abort(); } else { // resume suspended coroutine -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); assert(ret.ready()); ret.resume(); @@ -373,9 +371,9 @@ namespace ttg_madness { // leave suspended_task_address as is } this->suspended_task_address = suspended_task_address; -#else +#else // TTG_HAVE_COROUTINE ttg::abort(); // should not happen -#endif +#endif // TTG_HAVE_COROUTINE } ttT::threaddata.call_depth--; @@ -384,7 +382,7 @@ namespace ttg_madness { // ttg::print("finishing task",ttT::threaddata.call_depth); // } -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE if (suspended_task_address) { // TODO implement handling of suspended coroutines properly @@ -412,7 +410,7 @@ namespace ttg_madness { ttg::abort(); } } -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE } virtual ~TTArgs() {} // Will be deleted via TaskInterface* diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index 1711d8556..81897b816 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -149,7 +149,7 @@ class CallableWrapTTArgs std::conditional_t, std::add_pointer_t, noref_funcT> func; using op_return_type = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE std::conditional_t, ttg::coroutine_handle, #ifdef TTG_HAVE_DEVICE @@ -160,9 +160,9 @@ class CallableWrapTTArgs void #endif // TTG_HAVE_DEVICE >; -#else // TTG_HAS_COROUTINE +#else // TTG_HAVE_COROUTINE void; -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE public: static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA); @@ -176,7 +176,7 @@ class CallableWrapTTArgs static_assert(std::is_same_v, returnT>, "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); if constexpr (!std::is_void_v) { // protect from compiling for void returnT -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE if constexpr (std::is_same_v) { ttg::coroutine_handle coro_handle; // if task completed destroy it diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 5109dbcaa..656117b94 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -202,7 +202,7 @@ namespace ttg_parsec { TT* tt = nullptr; key_type key; std::array streams; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void* suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; #endif @@ -268,7 +268,7 @@ namespace ttg_parsec { static constexpr size_t num_streams = TT::numins; TT* tt = nullptr; std::array streams; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void* suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; #endif diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1aaa21bdf..a241d2b3b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -12,6 +12,8 @@ * This may cause deadlocks, so use with caution. */ #define TTG_PARSEC_DEFER_WRITER false +#include "ttg/config.h" + #include "ttg/impl_selector.h" /* include ttg header to make symbols available in case this header is included directly */ @@ -1643,11 +1645,11 @@ namespace ttg_parsec { task_t *task = (task_t*)parsec_task; void* suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE task->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl; if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function @@ -1679,9 +1681,9 @@ namespace ttg_parsec { } else { // resume the suspended coroutine +#ifdef TTG_HAVE_COROUTINE assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); -#ifdef TTG_HAS_COROUTINE #ifdef TTG_HAVE_DEVICE if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) { ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address); @@ -1725,9 +1727,9 @@ namespace ttg_parsec { } else ttg::abort(); // unrecognized task id -#else // TTG_HAS_COROUTINE -ttg::abort(); // should not happen -#endif // TTG_HAS_COROUTINE +#else // TTG_HAVE_COROUTINE + ttg::abort(); // should not happen +#endif // TTG_HAVE_COROUTINE } task->suspended_task_address = suspended_task_address; @@ -1750,11 +1752,11 @@ ttg::abort(); // should not happen task_t *task = static_cast(parsec_task); void* suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE task->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function ttT *baseobj = (ttT *)task->object_ptr; derivedT *obj = (derivedT *)task->object_ptr; @@ -1769,7 +1771,7 @@ ttg::abort(); // should not happen detail::parsec_ttg_caller = NULL; } else { -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); assert(ret.ready()); ret.resume(); @@ -1780,9 +1782,9 @@ ttg::abort(); // should not happen else { // not yet completed // leave suspended_task_address as is } -#else +#else // TTG_HAVE_COROUTINE ttg::abort(); // should not happen -#endif +#endif // TTG_HAVE_COROUTINE } task->suspended_task_address = suspended_task_address; @@ -3695,7 +3697,7 @@ ttg::abort(); // should not happen task_t *task = (task_t*)parsec_task; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */ if (task->suspended_task_address) { assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); @@ -3726,7 +3728,7 @@ ttg::abort(); // should not happen /* the coroutine should have completed and we cannot access the promise anymore */ task->suspended_task_address = nullptr; } -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE /* release our data copies */ for (int i = 0; i < task->data_count; i++) { diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 637ae42b5..6b3972984 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -1,12 +1,13 @@ #ifndef TTG_TT_H #define TTG_TT_H +#include "ttg/config.h" #include "ttg/fwd.h" #include "ttg/base/tt.h" #include "ttg/edge.h" -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE #include "ttg/coroutine.h" #endif @@ -176,7 +177,7 @@ namespace ttg { } // namespace ttg #ifndef TTG_PROCESS_TT_OP_RETURN -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE #define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) \ { \ using return_type = decltype(invoke); \ From a9c33d4b3652ddca7c912d18f007767fe97f0690 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 11:31:31 -0400 Subject: [PATCH 25/25] Add TTG_ENABLE_COROUTINES CMake option and fix non-coro builds Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 37 ++++++++++++++++++++----------------- ttg/ttg/coroutine.h | 3 ++- ttg/ttg/parsec/ttg.h | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 519b13938..5406877c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ option(TTG_ENABLE_LEVEL_ZERO "Whether to TTG will look for Intel oneAPI Level Ze option(TTG_EXAMPLES "Whether to build examples" OFF) option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) +option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON) option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF) option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF) @@ -95,23 +96,25 @@ endif (BUILD_TESTING) # Boost include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") -# C++ coroutines, check for broken GCC releases and skip if one is found -set(SKIP_COROUTINE_DETECTION FALSE) -if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) - set(SKIP_COROUTINE_DETECTION TRUE) - elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) - set(SKIP_COROUTINE_DETECTION TRUE) - endif() - if (SKIP_COROUTINE_DETECTION) - message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") - endif(SKIP_COROUTINE_DETECTION) -endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - -if (NOT SKIP_COROUTINE_DETECTION) - find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) - set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") -endif(SKIP_COROUTINE_DETECTION) +if (TTG_ENABLE_COROUTINES) + set(SKIP_COROUTINE_DETECTION FALSE) + # C++ coroutines, check for broken GCC releases and skip if one is found + if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) + set(SKIP_COROUTINE_DETECTION TRUE) + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) + set(SKIP_COROUTINE_DETECTION TRUE) + endif() + if (SKIP_COROUTINE_DETECTION) + message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") + endif(SKIP_COROUTINE_DETECTION) + endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + + if (NOT SKIP_COROUTINE_DETECTION) + find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") + endif(NOT SKIP_COROUTINE_DETECTION) +endif(TTG_ENABLE_COROUTINES) ########################## diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h index 890a98b4f..dc1ed8e5b 100644 --- a/ttg/ttg/coroutine.h +++ b/ttg/ttg/coroutine.h @@ -6,12 +6,13 @@ #define TTG_COROUTINE_H #include "ttg/config.h" + +#ifdef TTG_HAVE_COROUTINE #include TTG_CXX_COROUTINE_HEADER #include #include -#ifdef TTG_HAVE_COROUTINE namespace ttg { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index a241d2b3b..73672d285 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1724,6 +1724,7 @@ namespace ttg_parsec { } task->tt->set_outputs_tls_ptr(old_output_tls_ptr); detail::parsec_ttg_caller = nullptr; + task->suspended_task_address = suspended_task_address; } else ttg::abort(); // unrecognized task id @@ -1731,7 +1732,6 @@ namespace ttg_parsec { ttg::abort(); // should not happen #endif // TTG_HAVE_COROUTINE } - task->suspended_task_address = suspended_task_address; if (suspended_task_address == nullptr) { ttT *baseobj = task->tt;