diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h
index 83845fb32..95e1e0fc8 100644
--- a/ttg/ttg/device/device.h
+++ b/ttg/ttg/device/device.h
@@ -6,6 +6,16 @@
 
 namespace ttg::device {
+
+#if defined(TTG_HAVE_CUDA)
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::CUDA;
+#elif defined(TTG_HAVE_HIP)
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::HIP;
+#elif defined(TTG_HAVE_LEVEL_ZERO)
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::L0;
+#else
+  constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::Invalid;
+#endif
 
   class Device {
     int m_id = 0;
     ttg::ExecutionSpace m_space = ttg::ExecutionSpace::Host;
@@ -36,11 +44,11 @@ namespace ttg::device {
     }
 
     bool is_device() const {
-      return ((!is_invalid()) && (m_space != ttg::ExecutionSpace::Host));
+      return !is_host();
     }
 
     bool is_host() const {
-      return (m_space == ttg::ExecutionSpace::Host);
+      return !is_invalid() && (m_space == ttg::ExecutionSpace::Host);
     }
 
     bool is_invalid() const {
@@ -120,4 +130,36 @@ namespace ttg::device {
   }
 } // namespace ttg
 
+#elif defined(TTG_HAVE_LEVEL_ZERO)
+
+#include <sycl/sycl.hpp>
+
+namespace ttg::device {
+  namespace detail {
+    inline thread_local ttg::device::Device current_device_ts = {};
+    inline thread_local sycl::queue* current_stream_ts = nullptr; // default stream
+
+    inline void reset_current() {
+      current_device_ts = {};
+      current_stream_ts = nullptr;
+    }
+
+    inline void set_current(int device, sycl::queue& stream) {
+      current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::L0);
+      current_stream_ts = &stream;
+    }
+  } // namespace detail
+
+  inline
+  Device current_device() {
+    return detail::current_device_ts;
+  }
+
+  inline
+  const sycl::queue& current_stream() {
+    return *detail::current_stream_ts;
+  }
+} // namespace ttg::device
+
+
 #endif // defined(TTG_HAVE_HIP)
diff --git a/ttg/ttg/execution.h b/ttg/ttg/execution.h
index 4d99a3ae7..495d0e248 100644
--- a/ttg/ttg/execution.h
+++ b/ttg/ttg/execution.h
@@ -18,6 +18,7 @@ enum class ExecutionSpace {
   Host, // a CPU
   CUDA, // an NVIDIA CUDA device
   HIP, // an AMD HIP device
+  L0, // an Intel L0 device
   Invalid
 };
 
diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 552d0b52e..e3d7a84b3 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -10,6 +10,7 @@
 #include "ttg/parsec/parsec-ext.h"
 #include "ttg/util/iovec.h"
 #include "ttg/device/device.h"
+#include "ttg/parsec/device.h"
 
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
 #include <cuda_runtime.h>
@@ -75,9 +76,6 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
 
 public:
 
-  /* The device ID of the CPU. */
-  static constexpr int cpu_device = -2;
-
   buffer() : buffer(nullptr, 0)
   { }
 
@@ -161,33 +159,35 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
 
   /* set the current device, useful when a device
    * buffer was modified outside of a TTG */
-  void set_current_device(int device_id) {
+  void set_current_device(const ttg::device::Device& device) {
     assert(is_valid());
+    int parsec_id = detail::device_to_parsec_device(device);
     /* make sure it's a valid device */
-    assert(parsec_nb_devices > device_id);
+    assert(parsec_nb_devices > parsec_id);
     /* make sure it's a valid copy */
-    assert(m_data->device_copies[device_id+2] != nullptr);
-    m_data->owner_device = device_id+2;
+    assert(m_data->device_copies[parsec_id] != nullptr);
+    m_data->owner_device = parsec_id;
   }
 
   /* Get the owner device ID, i.e., the last updated
-   * device buffer. A value of -2 designates the host
-   * as the current device. */
-  int get_owner_device() const {
+   * device buffer. */
+  ttg::device::Device get_owner_device() const {
     assert(is_valid());
-    return m_data->owner_device - 2; // 0: host, 1: recursive, 2: first device
+    return detail::parsec_device_to_device(m_data->owner_device);
   }
 
   /* Get the pointer on the currently active device. */
   element_type* current_device_ptr() {
     assert(is_valid());
-    return static_cast<element_type*>(m_data->device_copies[ttg::device::current_device()+2]->device_private);
+    int device_id = detail::device_to_parsec_device(ttg::device::current_device());
+    return static_cast<element_type*>(m_data->device_copies[device_id]->device_private);
   }
 
   /* Get the pointer on the currently active device. */
   const element_type* current_device_ptr() const {
     assert(is_valid());
-    return static_cast<const element_type*>(m_data->device_copies[ttg::device::current_device()+2]->device_private);
+    int device_id = detail::device_to_parsec_device(ttg::device::current_device());
+    return static_cast<const element_type*>(m_data->device_copies[device_id]->device_private);
   }
 
   /* Get the pointer on the owning device.
@@ -205,19 +205,19 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
   }
 
   /* get the device pointer at the given device
-   * \sa cpu_device */
-  element_type* device_ptr_on(int device_id) {
+   */
+  element_type* device_ptr_on(const ttg::device::Device& device) {
     assert(is_valid());
-    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id + 2));
+    int device_id = detail::device_to_parsec_device(device);
+    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
   }
 
   /* get the device pointer at the given device
-   * \sa cpu_device */
-  const element_type* device_ptr_on(int device_id) const {
+   */
+  const element_type* device_ptr_on(const ttg::device::Device& device) const {
     assert(is_valid());
-    return static_cast<const element_type*>(parsec_data_get_ptr(m_data.get(), device_id + 2)); // GPUs start at 2
+    int device_id = detail::device_to_parsec_device(device);
+    return static_cast<const element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
   }
 
   element_type* host_ptr() {
@@ -228,12 +228,13 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
     return static_cast<const element_type*>(parsec_data_get_ptr(m_data.get(), 0));
   }
 
-  bool is_valid_on(int device_id) const {
+  bool is_valid_on(const ttg::device::Device& device) const {
     assert(is_valid());
-    return (parsec_data_get_ptr(m_data.get(), device_id+2) != nullptr);
+    int device_id = detail::device_to_parsec_device(device);
+    return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr);
   }
 
-  void allocate_on(int device_id) {
+  void allocate_on(const ttg::device::Device& device_id) {
     /* TODO: need exposed PaRSEC memory allocator */
     throw std::runtime_error("not implemented yet");
   }
@@ -252,7 +253,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
 
   /* Unpin the memory on all devices we currently track. */
   void unpin() {
     if (!is_valid()) return;
-    for (int i = 0; i < parsec_nb_devices-2; ++i) {
+    for (int i = 0; i < parsec_nb_devices-detail::first_device_id; ++i) {
       unpin_on(i);
     }
   }
diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h
new file mode 100644
index 000000000..8f46453ac
--- /dev/null
+++ b/ttg/ttg/parsec/device.h
@@ -0,0 +1,40 @@
+#ifndef TTG_PARSEC_DEVICE_H
+#define TTG_PARSEC_DEVICE_H
+
+#include "ttg/device/device.h"
+
+namespace ttg_parsec {
+
+  namespace detail {
+
+    // the first ID of an accelerator in the parsec ID-space
+    inline int first_device_id = 0;
+
+    /**
+     * map from TTG ID-space to parsec ID-space
+     */
+    inline
+    int device_to_parsec_device(const ttg::device::Device& device) {
+      if (device.is_host()) {
+        return 0;
+      } else {
+        return device.id() + first_device_id;
+      }
+    }
+
+    /**
+     * map from parsec ID-space to TTG ID-space
+     */
+    inline
+    ttg::device::Device parsec_device_to_device(int parsec_id) {
+      if (parsec_id < first_device_id) {
+        return ttg::device::Device(parsec_id, ttg::ExecutionSpace::Host);
+      }
+      return ttg::device::Device(parsec_id - first_device_id,
+                                 ttg::device::available_execution_space);
+    }
+  } // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_DEVICE_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 71ef55e0f..cf0cd33b2 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -974,6 +974,14 @@ namespace ttg_parsec {
     std::shared_ptr<ttg::base::WorldImplBase> world_sptr{static_cast<ttg::base::WorldImplBase*>(world_ptr)};
     ttg::World world{std::move(world_sptr)};
     ttg::detail::set_default_world(std::move(world));
+
+    // query the first device ID
+    for (int i = 0; i < parsec_nb_devices; ++i) {
+      if (parsec_mca_device_is_gpu(i)) {
+        detail::first_device_id = i;
+        break;
+      }
+    }
   }
 
   inline void ttg_finalize() {
     // We need to notify the current taskpool of termination if we are in user termination detection mode
@@ -1329,7 +1337,7 @@ namespace ttg_parsec {
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA)
       {
         parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream;
-        int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU
+        int device = gpu_device->super.device_index - detail::first_device_id;
         ttg::device::detail::set_current(device, cuda_stream->cuda_stream);
       }
 #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA)
@@ -1337,11 +1345,20 @@ namespace ttg_parsec {
 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP)
       {
         parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream;
-        int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU
+        int device = gpu_device->super.device_index - detail::first_device_id;
         ttg::device::detail::set_current(device, hip_stream->hip_stream);
       }
 #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA)
+#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO)
+      {
+        parsec_level_zero_exec_stream_t *stream;
+        stream = (parsec_level_zero_exec_stream_t *)gpu_stream;
+        int device = gpu_device->super.device_index - detail::first_device_id;
+        ttg::device::detail::set_current(device, stream->swq->queue);
+      }
+#endif // defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO)
+
       /* Here we call back into the coroutine again after the transfers have completed */
       static_op(&task->parsec_task);
@@ -1394,24 +1411,14 @@ namespace ttg_parsec {
         }
       }
     }
-#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
-    static int
-    static_cuda_stage_in(parsec_gpu_task_t *gtask,
-                         uint32_t flow_mask,
-                         parsec_gpu_exec_stream_t *gpu_stream) {
-      static_device_stage_in(gtask, flow_mask, gpu_stream);
-      return parsec_default_cuda_stage_in(gtask, flow_mask, gpu_stream);
-    }
-#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT
-#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
+
     static int
-    static_hip_stage_in(parsec_gpu_task_t *gtask,
-                        uint32_t flow_mask,
-                        parsec_gpu_exec_stream_t *gpu_stream) {
+    static_device_stage_in_hook(parsec_gpu_task_t *gtask,
+                                uint32_t flow_mask,
+                                parsec_gpu_exec_stream_t *gpu_stream) {
       static_device_stage_in(gtask, flow_mask, gpu_stream);
-      return parsec_default_hip_stage_in(gtask, flow_mask, gpu_stream);
+      return parsec_default_gpu_stage_in(gtask, flow_mask, gpu_stream);
     }
-#endif
 
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
@@ -1493,21 +1500,30 @@ namespace ttg_parsec {
         if constexpr (Space == ttg::ExecutionSpace::CUDA) {
          /* TODO: we need custom staging functions because PaRSEC looks at the
           * task-class to determine the number of flows. */
-          gpu_task->stage_in = static_cuda_stage_in;
-          gpu_task->stage_out = parsec_default_cuda_stage_out;
-          return parsec_cuda_kernel_scheduler(es, gpu_task, dev_index);
+          gpu_task->stage_in = static_device_stage_in_hook;
+          gpu_task->stage_out = parsec_default_gpu_stage_out;
+          return parsec_device_kernel_scheduler(device, es, gpu_task);
         }
         break;
 #endif
 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
       case PARSEC_DEV_HIP:
         if constexpr (Space == ttg::ExecutionSpace::HIP) {
-          gpu_task->stage_in = static_hip_stage_in;
-          gpu_task->stage_out = parsec_default_hip_stage_out;
-          return parsec_hip_kernel_scheduler(es, gpu_task, dev_index);
+          gpu_task->stage_in = static_device_stage_in_hook;
+          gpu_task->stage_out = parsec_default_gpu_stage_out;
+          return parsec_device_kernel_scheduler(device, es, gpu_task);
         }
         break;
 #endif // PARSEC_HAVE_DEV_HIP_SUPPORT
+#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
+      case PARSEC_DEV_LEVEL_ZERO:
+        if constexpr (Space == ttg::ExecutionSpace::L0) {
+          gpu_task->stage_in = static_device_stage_in_hook;
+          gpu_task->stage_out = parsec_default_gpu_stage_out;
+          return parsec_device_kernel_scheduler(device, es, gpu_task);
+        }
+        break;
+#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
       default:
         break;
       }
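Note on the new ID mapping: PaRSEC reserves its low device indices for non-accelerator devices (the removed magic number `2` encoded "0: host, 1: recursive, 2: first GPU"), while TTG numbers accelerators from 0. The helpers in `ttg/ttg/parsec/device.h` centralize that translation; `first_device_id` is discovered at `ttg_initialize()` time as the index of the first PaRSEC device for which `parsec_mca_device_is_gpu()` returns true. Below is a minimal stand-alone sketch of the two mapping functions and their round-trip behavior; the local `Device` struct and the hard-coded `first_device_id = 2` are illustrative stand-ins for this sketch, not the TTG types.

```cpp
// Stand-alone model of the mapping in ttg/ttg/parsec/device.h (illustrative only).
#include <cassert>

enum class ExecutionSpace { Host, CUDA, HIP, L0, Invalid };

struct Device {
  int id;                // TTG ID-space: accelerators are numbered from 0
  ExecutionSpace space;
  bool is_host() const { return space == ExecutionSpace::Host; }
};

// Assumptions for this sketch: a CUDA build, and the common PaRSEC layout
// where devices 0 (host) and 1 (recursive) precede the first accelerator.
constexpr ExecutionSpace available_execution_space = ExecutionSpace::CUDA;
int first_device_id = 2;  // queried from PaRSEC at runtime in the real code

int device_to_parsec_device(const Device& device) {
  return device.is_host() ? 0 : device.id + first_device_id;
}

Device parsec_device_to_device(int parsec_id) {
  if (parsec_id < first_device_id) return {parsec_id, ExecutionSpace::Host};
  return {parsec_id - first_device_id, available_execution_space};
}

int main() {
  // accelerator 0 in TTG ID-space <-> PaRSEC device first_device_id
  Device gpu0{0, available_execution_space};
  assert(device_to_parsec_device(gpu0) == first_device_id);
  assert(parsec_device_to_device(first_device_id).id == 0);
  // the host always maps to PaRSEC device 0
  assert(device_to_parsec_device({0, ExecutionSpace::Host}) == 0);
  assert(parsec_device_to_device(0).is_host());
}
```

This is also why `buffer::unpin()` now iterates over `parsec_nb_devices - detail::first_device_id` entries, and why the device hooks compute `gpu_device->super.device_index - detail::first_device_id` before calling `set_current`: both walk only the accelerator tail of PaRSEC's device list.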