diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index e592e99a2..476119886 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -306,8 +306,7 @@ namespace ttg::device { } } - template + template inline void broadcast(const std::tuple &keylists, valueT &&value) { using key_t = typename broadcast_keylist_trait< std::tuple_element_t...>> @@ -379,8 +378,7 @@ namespace ttg::device { } } - template + template inline void broadcastk(const std::tuple &keylists) { using key_t = typename broadcast_keylist_trait< std::tuple_element_t...>> diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h index 4273e7c66..9e9830b0b 100644 --- a/ttg/ttg/func.h +++ b/ttg/ttg/func.h @@ -416,8 +416,7 @@ namespace ttg { std::get(t).broadcast(keylist, copy_handler(std::forward(value))); } - template + template inline void broadcast(std::size_t i, const rangeT &keylist, valueT &&value) { detail::value_copy_handler copy_handler; using key_t = decltype(*std::begin(keylist)); @@ -425,8 +424,7 @@ namespace ttg { terminal_ptr->broadcast(keylist, copy_handler(std::forward(value))); } - template + template inline void broadcast(const rangeT &keylist, valueT &&value) { broadcast(i, keylist, std::forward(value)); } @@ -505,7 +503,7 @@ namespace ttg { terminal_ptr->set_size(size); } - template + template inline std::enable_if_t, void> set_size(const keyT &key, const std::size_t size) { set_size(i, key, size); } diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index ff8452034..080660205 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -123,7 +123,6 @@ namespace ttg_parsec { /* get_parsec_data is overloaded for buffer and devicescratch */ parsec_data_t* data = detail::get_parsec_data(view); - parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream; /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 8e7f117d6..7b49dcb4b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -146,8 +146,8 @@ namespace ttg_parsec { MSG_SET_ARGSTREAM_SIZE = 1, MSG_FINALIZE_ARGSTREAM_SIZE = 2, MSG_GET_FROM_PULL = 3 } fn_id_t; - uint32_t taskpool_id = -1; - uint64_t op_id = -1; + uint32_t taskpool_id = std::numeric_limits::max(); + uint64_t op_id = std::numeric_limits::max(); std::size_t key_offset = 0; fn_id_t fn_id = MSG_INVALID; std::int8_t num_iovecs = 0; @@ -334,7 +334,7 @@ namespace ttg_parsec { void create_tpool() { assert(nullptr == tpool); tpool = PARSEC_OBJ_NEW(parsec_taskpool_t); - tpool->taskpool_id = -1; + tpool->taskpool_id = std::numeric_limits::max(); tpool->update_nb_runtime_task = parsec_add_fetch_runtime_task; tpool->taskpool_type = PARSEC_TASKPOOL_TYPE_TTG; tpool->taskpool_name = strdup("TTG Taskpool"); @@ -701,7 +701,7 @@ namespace ttg_parsec { template inline ttg_data_copy_t *create_new_datacopy(Value &&value) { using value_type = std::decay_t; - ttg_data_copy_t *copy; + ttg_data_copy_t *copy = nullptr; if constexpr (std::is_base_of_v, value_type> && std::is_constructible_v) { copy = new value_type(std::forward(value)); @@ -1472,36 +1472,6 @@ namespace ttg_parsec { return rc; } - static void - static_device_stage_in(parsec_gpu_task_t *gtask, - uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) { - /* register any memory that hasn't been registered yet */ - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - if (flow_mask & (1<ec; - parsec_data_copy_t *copy = task->parsec_task.data[i].data_in; - if (0 == (copy->flags & TTG_PARSEC_DATA_FLAG_REGISTERED)) { -#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - // register host memory for faster device access - cudaError_t status; - //status = cudaHostRegister(copy->device_private, gtask->flow_nb_elts[i], cudaHostRegisterPortable); - //assert(cudaSuccess == status); -#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT - //copy->flags |= TTG_PARSEC_DATA_FLAG_REGISTERED; - } - } - } - } - - static int - static_device_stage_in_hook(parsec_gpu_task_t *gtask, - uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) { - static_device_stage_in(gtask, flow_mask, gpu_stream); - return parsec_default_gpu_stage_in(gtask, flow_mask, gpu_stream); - } - template static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) { @@ -1515,7 +1485,7 @@ namespace ttg_parsec { PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); gpu_task->ec = parsec_task; gpu_task->task_type = 0; // user task - gpu_task->last_data_check_epoch = -1; // used internally + gpu_task->last_data_check_epoch = 0; // used internally gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; @@ -1624,7 +1594,7 @@ namespace ttg_parsec { if constexpr (Space == ttg::ExecutionSpace::CUDA) { /* TODO: we need custom staging functions because PaRSEC looks at the * task-class to determine the number of flows. */ - gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; return parsec_device_kernel_scheduler(&device->super, es, gpu_task); } @@ -1633,7 +1603,7 @@ namespace ttg_parsec { #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) case PARSEC_DEV_HIP: if constexpr (Space == ttg::ExecutionSpace::HIP) { - gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; return parsec_device_kernel_scheduler(&device->super, es, gpu_task); } @@ -1642,7 +1612,7 @@ namespace ttg_parsec { #if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) case PARSEC_DEV_LEVEL_ZERO: if constexpr (Space == ttg::ExecutionSpace::L0) { - gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; return parsec_device_kernel_scheduler(&device->super, es, gpu_task); } @@ -2399,7 +2369,9 @@ namespace ttg_parsec { auto &reducer = std::get(input_reducers); bool release = false; bool remove_from_hash = true; +#if defined(PARSEC_PROF_GRAPHER) bool discover_task = true; +#endif bool get_pull_data = false; bool has_lock = false; /* If we have only one input and no reducer on that input we can skip the hash table */ @@ -2795,7 +2767,6 @@ namespace ttg_parsec { num_iovecs = std::distance(std::begin(iovs), std::end(iovs)); /* pack the metadata */ auto metadata = descr.get_metadata(*const_cast(value_ptr)); - size_t metadata_size = sizeof(metadata); pos = pack(metadata, msg->bytes, pos); //std::cout << "set_arg_impl splitmd num_iovecs " << num_iovecs << std::endl; handle_iovec_fn(iovs); @@ -2970,7 +2941,6 @@ namespace ttg_parsec { ttg::SplitMetadataDescriptor descr; /* pack the metadata */ auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); pos = pack(metadata, msg->bytes, pos); auto iovs = descr.get_data(*const_cast(&value)); num_iovs = std::distance(std::begin(iovs), std::end(iovs)); diff --git a/ttg/ttg/util/dot.h b/ttg/ttg/util/dot.h index 5e0dea7f6..f562dcb49 100644 --- a/ttg/ttg/util/dot.h +++ b/ttg/ttg/util/dot.h @@ -57,13 +57,12 @@ namespace ttg { void ttfunc(TTBase *tt) { std::string ttnm = nodename(tt); - bool is_ttg = true; const TTBase *ttc = reinterpret_cast(tt); build_ttg_hierarchy(ttc); if(!tt->is_ttg()) { std::stringstream ttss; - + ttss << " " << ttnm << " [shape=record,style=filled,fillcolor=gray90,label=\"{"; size_t count = 0;