From d32b853f932d8bfb2c2f93b70fd3e184c5d66840 Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Tue, 30 Jan 2024 14:48:59 +0100 Subject: [PATCH] ocl: pointer-arithmetic for device-pointers * Fallback to main-thread's stream (c_dbcsr_acc_opencl_stream_default). * Fixed c_dbcsr_acc_opencl_stream_default and reduce one level of indirection. * Reworked entire memory allocation (determining offsets). * Consolidated compile-time decisions about LIBXSMM_VERSION_NUMBER. * Removed runtime decisions accounting for pooled allocations. * Support older LIBXSMM (pooled memory allocations). * Set ACC_OPENCL_ATOMIC_KIND to sequentially consistent; set ACC_OPENCL_NLOCKS=1. * Complemented ACC_OPENCL_NLOCKS with environment variable. * Introduced ACC_OPENCL_OMPLOCKS, ACC_OPENCL_MEM_DEBUG, ACC_OPENCL_EVENT_FLUSH. * Implemented behavior of c_dbcsr_acc_opencl_stream_default already in c_dbcsr_acc_opencl_stream. * Attempt to avoid recursive/dead lock, and revised function signature (c_dbcsr_acc_opencl_get_ptr). * Introduced lock-arguments (internal, e.g., c_dbcsr_acc_opencl_set_active_device). * Consolidated domain-locks into c_dbcsr_acc_opencl_config. * Made build-log available (c_dbcsr_acc_opencl_kernel). * Reworked stream-registry and stream-info facility. * Use "int" instead of "cl_int" when taking the return-code. * Consistently use EXIT_SUCCESS instead of CL_SUCCESS. * Removed support for ACC_OPENCL_OVERMALLOC. * Removed support for per-thread device. * Removed ACC_OPENCL_EVENT_BARRIER. * Introduced ACC_OPENCL_MEM_TLS (disabled). * Simplified c_dbcsr_acc_opencl_memset. * Support ACC_OPENCL_STREAM_NULL in event facility. * Fixed using size_t as kernel argument. * Introduced UNROLL_AUTO. --- src/acc/opencl/acc_opencl.c | 630 +++++++++++--------------- src/acc/opencl/acc_opencl.h | 204 +++++---- src/acc/opencl/acc_opencl_event.c | 156 ++----- src/acc/opencl/acc_opencl_mem.c | 612 ++++++++++++++----------- src/acc/opencl/acc_opencl_stream.c | 247 +++++----- src/acc/opencl/common/opencl_common.h | 10 +- src/acc/opencl/smm/opencl_libsmm.c | 149 +++--- 7 files changed, 961 insertions(+), 1047 deletions(-) diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 13c19d54f4c..b2fffe84692 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -40,6 +40,9 @@ # if !defined(ACC_OPENCL_SEDBIN) && 1 # define ACC_OPENCL_SEDBIN "/usr/bin/sed" # endif +# if !defined(ACC_OPENCL_NLOCKS) +# define ACC_OPENCL_NLOCKS 8 +# endif # if !defined(ACC_OPENCL_NCCS) && 1 # define ACC_OPENCL_NCCS 4 # endif @@ -52,6 +55,7 @@ extern "C" { # endif +char c_dbcsr_acc_opencl_locks[ACC_OPENCL_CACHELINE_NBYTES * ACC_OPENCL_NLOCKS]; /* global configuration discovered during initialization */ c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config; @@ -65,47 +69,6 @@ void c_dbcsr_acc_opencl_notify(const char errinfo[], const void* private_info, s } -cl_context c_dbcsr_acc_opencl_context(int* thread_id) { - cl_context result; - const int tid = ACC_OPENCL_OMP_TID(); - assert(0 <= tid && tid < c_dbcsr_acc_opencl_config.nthreads); - assert(NULL != c_dbcsr_acc_opencl_config.device); - result = c_dbcsr_acc_opencl_config.device[tid].context; - if (NULL == result) { /* fallback */ - int i = 0; /* prefer master's context */ - for (; i < c_dbcsr_acc_opencl_config.nthreads; ++i) { - if (tid != i) { /* adopt another context */ - result = c_dbcsr_acc_opencl_config.device[i].context; - if (NULL != result && CL_SUCCESS == clRetainContext(result)) break; - else result = NULL; - } - } - } - if (NULL != thread_id) *thread_id = tid; - return result; -} - - -cl_context c_dbcsr_acc_opencl_device_context(cl_device_id device, const int* thread_id) { - const int i0 = (NULL != thread_id ? *thread_id : /*main*/ 0); - cl_context result = NULL; - int i = 0; - for (; i < c_dbcsr_acc_opencl_config.nthreads; ++i) { - const int j = i + i0, tid = (j < c_dbcsr_acc_opencl_config.nthreads ? j : (j - c_dbcsr_acc_opencl_config.nthreads)); - result = c_dbcsr_acc_opencl_config.device[tid].context; - if (NULL != result) { - cl_device_id device_id = NULL; - if (CL_SUCCESS == clGetContextInfo(result, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device_id, NULL) && device == device_id) - { - break; - } - else result = NULL; - } - } - return result; -} - - /** * Comparator used with qsort; stabilized by tail condition (a < b ? -1 : 1). * Brings GPUs with local memory in front, followed by (potentially) integrated GPUs, @@ -212,19 +175,42 @@ int c_dbcsr_acc_init(void) { const char *const env_neo = getenv("NEOReadDebugKeys"), *const env_ienv = getenv("ACC_OPENCL_IENV"); const int neo = (NULL == env_neo ? 1 : atoi(env_neo)), ienv = neo * (NULL == env_ienv ? 1 : atoi(env_ienv)); # endif + const char* const env_nlocks = getenv("ACC_OPENCL_NLOCKS"); char* const env_devids = getenv("ACC_OPENCL_DEVIDS"); int device_id = (NULL == env_device ? 0 : atoi(env_device)); + const int nlocks = (NULL == env_nlocks ? 1 /*default*/ : atoi(env_nlocks)); cl_uint nplatforms = 0, ndevices = 0, i; cl_device_type type = CL_DEVICE_TYPE_ALL; # if defined(_OPENMP) const int max_threads = omp_get_max_threads(), num_threads = omp_get_num_threads(); c_dbcsr_acc_opencl_config.nthreads = (num_threads < max_threads ? max_threads : num_threads); - c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_STREAMS_MAXCOUNT + max_threads) - : (ACC_OPENCL_STREAMS_MAXCOUNT)); + c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_HANDLES_MAXCOUNT * max_threads) + : (ACC_OPENCL_HANDLES_MAXCOUNT)); # else c_dbcsr_acc_opencl_config.nthreads = 1; - c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_STREAMS_MAXCOUNT; + c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_HANDLES_MAXCOUNT; # endif + assert(sizeof(ACC_OPENCL_LOCKTYPE) <= ACC_OPENCL_CACHELINE_NBYTES); + for (i = 0; i < ACC_OPENCL_NLOCKS; ++i) { + ACC_OPENCL_INIT((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * i)); + } + c_dbcsr_acc_opencl_config.lock_main = (ACC_OPENCL_LOCKTYPE*)c_dbcsr_acc_opencl_locks; + c_dbcsr_acc_opencl_config.lock_stream = + (1 < LIBXSMM_MIN(nlocks, ACC_OPENCL_NLOCKS) + ? ((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * 1)) + : c_dbcsr_acc_opencl_config.lock_main); + c_dbcsr_acc_opencl_config.lock_memory = + (2 < LIBXSMM_MIN(nlocks, ACC_OPENCL_NLOCKS) + ? ((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * 2)) + : c_dbcsr_acc_opencl_config.lock_main); + c_dbcsr_acc_opencl_config.lock_memset = + (3 < LIBXSMM_MIN(nlocks, ACC_OPENCL_NLOCKS) + ? ((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * 3)) + : c_dbcsr_acc_opencl_config.lock_memory); + c_dbcsr_acc_opencl_config.lock_memcpy = + (4 < LIBXSMM_MIN(nlocks, ACC_OPENCL_NLOCKS) + ? ((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * 4)) + : c_dbcsr_acc_opencl_config.lock_memset); c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose)); c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority)); c_dbcsr_acc_opencl_config.devcopy = (NULL == env_devcopy ? /*default*/ 0 : atoi(env_devcopy)); @@ -310,7 +296,7 @@ int c_dbcsr_acc_init(void) { } } # endif - if (CL_SUCCESS == clGetPlatformIDs(0, NULL, &nplatforms) && 0 < nplatforms) { + if (EXIT_SUCCESS == clGetPlatformIDs(0, NULL, &nplatforms) && 0 < nplatforms) { ACC_OPENCL_CHECK( clGetPlatformIDs(nplatforms <= ACC_OPENCL_DEVICES_MAXCOUNT ? nplatforms : ACC_OPENCL_DEVICES_MAXCOUNT, platforms, 0), "retrieve platform ids", result); @@ -332,7 +318,7 @@ int c_dbcsr_acc_init(void) { } c_dbcsr_acc_opencl_config.ndevices = 0; for (i = 0; i < nplatforms; ++i) { - if (CL_SUCCESS == clGetDeviceIDs(platforms[i], type, 0, NULL, &ndevices) && 0 < ndevices) { + if (EXIT_SUCCESS == clGetDeviceIDs(platforms[i], type, 0, NULL, &ndevices) && 0 < ndevices) { ACC_OPENCL_CHECK(clGetDeviceIDs(platforms[i], type, ndevices, devices, NULL), "retrieve device ids", result); if (EXIT_SUCCESS == result) { cl_uint j = 0; @@ -348,7 +334,7 @@ int c_dbcsr_acc_init(void) { CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, CL_DEVICE_AFFINITY_DOMAIN_NUMA, /*terminator*/ 0}; cl_uint nunits = 0; if (0 != devsplit && - CL_SUCCESS == clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &nunits, NULL) && + EXIT_SUCCESS == clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &nunits, NULL) && 1 < nunits) { if (1 < devsplit) { @@ -358,7 +344,7 @@ int c_dbcsr_acc_init(void) { } if ((NULL != env_devsplit && '0' == *env_devsplit) || (c_dbcsr_acc_opencl_config.ndevices + 1) == ACC_OPENCL_DEVICES_MAXCOUNT || - (CL_SUCCESS != clCreateSubDevices(devices[j], properties, 0, NULL, &n))) + (EXIT_SUCCESS != clCreateSubDevices(devices[j], properties, 0, NULL, &n))) # endif { c_dbcsr_acc_opencl_config.devices[c_dbcsr_acc_opencl_config.ndevices] = devices[j]; @@ -397,12 +383,12 @@ int c_dbcsr_acc_init(void) { /* filter device by vendor (if requested) */ if (NULL != env_vendor && '\0' != *env_vendor) { for (i = 0; (int)i < c_dbcsr_acc_opencl_config.ndevices;) { - if (CL_SUCCESS == + if (EXIT_SUCCESS == clGetDeviceInfo(c_dbcsr_acc_opencl_config.devices[i], CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL)) { if (NULL == LIBXSMM_STRISTR(buffer, env_vendor)) { # if defined(CL_VERSION_1_2) - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseDevice(c_dbcsr_acc_opencl_config.devices[i])); + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(c_dbcsr_acc_opencl_config.devices[i])); # endif --c_dbcsr_acc_opencl_config.ndevices; if ((int)i < c_dbcsr_acc_opencl_config.ndevices) { /* keep original order (stable) */ @@ -440,7 +426,7 @@ int c_dbcsr_acc_init(void) { while (++j < ndevids); if (0 == match) { # if defined(CL_VERSION_1_2) - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseDevice(c_dbcsr_acc_opencl_config.devices[i])); + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(c_dbcsr_acc_opencl_config.devices[i])); # endif c_dbcsr_acc_opencl_config.devices[i] = NULL; } @@ -467,7 +453,7 @@ int c_dbcsr_acc_init(void) { for (i = 0; i < ndevices; ++i) { cl_device_type itype; result = clGetDeviceInfo(c_dbcsr_acc_opencl_config.devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &itype, NULL); - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { if (0 != (CL_DEVICE_TYPE_DEFAULT & itype)) { if (0 < i) { c_dbcsr_acc_opencl_config.devices[0] = c_dbcsr_acc_opencl_config.devices[i]; @@ -478,7 +464,7 @@ int c_dbcsr_acc_init(void) { } else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype /*&& CL_DEVICE_TYPE_GPU == itype*/ && device_id <= (int)i) { result = clGetDeviceInfo(c_dbcsr_acc_opencl_config.devices[i], CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL); - if (CL_SUCCESS == result /* prune for homogeneous set of devices */ + if (EXIT_SUCCESS == result /* prune for homogeneous set of devices */ && ('\0' == *tmp || 0 == strncmp(buffer, tmp, ACC_OPENCL_BUFFERSIZE))) { c_dbcsr_acc_opencl_config.ndevices = i + 1; @@ -503,73 +489,81 @@ int c_dbcsr_acc_init(void) { } if (device_id < c_dbcsr_acc_opencl_config.ndevices) { if (EXIT_SUCCESS == result) { - assert(NULL == c_dbcsr_acc_opencl_config.device && 0 < c_dbcsr_acc_opencl_config.ndevices); + assert(0 < c_dbcsr_acc_opencl_config.ndevices); assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT); - c_dbcsr_acc_opencl_config.device = (c_dbcsr_acc_opencl_device_t*)calloc(/* thread-specific */ - c_dbcsr_acc_opencl_config.nthreads, sizeof(c_dbcsr_acc_opencl_device_t)); - if (NULL != c_dbcsr_acc_opencl_config.device) { - result = c_dbcsr_acc_opencl_set_active_device(/*main*/ 0, device_id); - assert(EXIT_SUCCESS == result || NULL == c_dbcsr_acc_opencl_config.device[/*main*/ 0].context); - if (1 < c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { - char platform_name[ACC_OPENCL_BUFFERSIZE]; - for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_config.ndevices; ++i) { - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(c_dbcsr_acc_opencl_config.devices[i], buffer, - ACC_OPENCL_BUFFERSIZE, platform_name, ACC_OPENCL_BUFFERSIZE, /*cleanup*/ 0)) - { - fprintf(stderr, "INFO ACC/OpenCL: DEVICE -> \"%s : %s\"\n", platform_name, buffer); - } + result = c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, device_id); + assert(EXIT_SUCCESS == result || NULL == c_dbcsr_acc_opencl_config.device.context); + if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { + char platform_name[ACC_OPENCL_BUFFERSIZE]; + for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_config.ndevices; ++i) { + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(c_dbcsr_acc_opencl_config.devices[i], buffer, ACC_OPENCL_BUFFERSIZE, + platform_name, ACC_OPENCL_BUFFERSIZE, /*cleanup*/ 0)) + { + fprintf(stderr, "INFO ACC/OpenCL: DEVICE -> \"%s : %s\"\n", platform_name, buffer); } } } - else { - result = EXIT_FAILURE; - } - c_dbcsr_acc_opencl_config.nclmems = c_dbcsr_acc_opencl_config.nevents = 0; - c_dbcsr_acc_opencl_config.clmems = c_dbcsr_acc_opencl_config.events = NULL; - c_dbcsr_acc_opencl_config.storage = NULL; -# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && defined(ACC_OPENCL_HANDLES_MAXCOUNT) && \ - (0 < ACC_OPENCL_HANDLES_MAXCOUNT) + c_dbcsr_acc_opencl_config.memptrs = NULL; + c_dbcsr_acc_opencl_config.streams = NULL; + c_dbcsr_acc_opencl_config.events = NULL; + c_dbcsr_acc_opencl_config.memptr_data = NULL; + c_dbcsr_acc_opencl_config.stream_data = NULL; + c_dbcsr_acc_opencl_config.event_data = NULL; + c_dbcsr_acc_opencl_config.nmemptrs = c_dbcsr_acc_opencl_config.nstreams = c_dbcsr_acc_opencl_config.nevents = 0; if (EXIT_SUCCESS == result) { const size_t nhandles = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; -# if defined(ACC_OPENCL_MEM_OFFSET) - c_dbcsr_acc_opencl_config.nclmems = nhandles; - c_dbcsr_acc_opencl_config.clmems = (void**)malloc(sizeof(void*) * nhandles); - c_dbcsr_acc_opencl_config.storage = malloc(sizeof(void*) * (nhandles + nhandles)); - if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != c_dbcsr_acc_opencl_config.storage) { - libxsmm_pmalloc_init(sizeof(void*), &c_dbcsr_acc_opencl_config.nclmems, c_dbcsr_acc_opencl_config.clmems, - (void**)c_dbcsr_acc_opencl_config.storage + nhandles); + /* allocate and initialize memptr registry */ + c_dbcsr_acc_opencl_config.nmemptrs = nhandles; + c_dbcsr_acc_opencl_config.memptrs = (c_dbcsr_acc_opencl_info_memptr_t**)malloc( + sizeof(c_dbcsr_acc_opencl_info_memptr_t*) * nhandles); + c_dbcsr_acc_opencl_config.memptr_data = (c_dbcsr_acc_opencl_info_memptr_t*)malloc( + sizeof(c_dbcsr_acc_opencl_info_memptr_t) * nhandles); + if (NULL != c_dbcsr_acc_opencl_config.memptrs && NULL != c_dbcsr_acc_opencl_config.memptr_data) { + c_dbcsr_acc_opencl_pmalloc_init(sizeof(c_dbcsr_acc_opencl_info_memptr_t), &c_dbcsr_acc_opencl_config.nmemptrs, + (void**)c_dbcsr_acc_opencl_config.memptrs, c_dbcsr_acc_opencl_config.memptr_data); } else { - free(c_dbcsr_acc_opencl_config.clmems); - c_dbcsr_acc_opencl_config.clmems = NULL; - c_dbcsr_acc_opencl_config.nclmems = 0; + free(c_dbcsr_acc_opencl_config.memptrs); + free(c_dbcsr_acc_opencl_config.memptr_data); + c_dbcsr_acc_opencl_config.memptr_data = NULL; + c_dbcsr_acc_opencl_config.memptrs = NULL; + c_dbcsr_acc_opencl_config.nmemptrs = 0; result = EXIT_FAILURE; } -# else - c_dbcsr_acc_opencl_config.storage = malloc(sizeof(void*) * nhandles); -# endif + /* allocate and initialize streams registry */ + c_dbcsr_acc_opencl_config.nstreams = nhandles; + c_dbcsr_acc_opencl_config.streams = (c_dbcsr_acc_opencl_stream_t**)malloc( + sizeof(c_dbcsr_acc_opencl_stream_t*) * nhandles); + c_dbcsr_acc_opencl_config.stream_data = (c_dbcsr_acc_opencl_stream_t*)malloc( + sizeof(c_dbcsr_acc_opencl_stream_t) * nhandles); + if (NULL != c_dbcsr_acc_opencl_config.streams && NULL != c_dbcsr_acc_opencl_config.stream_data) { + c_dbcsr_acc_opencl_pmalloc_init(sizeof(c_dbcsr_acc_opencl_stream_t), &c_dbcsr_acc_opencl_config.nstreams, + (void**)c_dbcsr_acc_opencl_config.streams, c_dbcsr_acc_opencl_config.stream_data); + } + else { + free(c_dbcsr_acc_opencl_config.streams); + free(c_dbcsr_acc_opencl_config.stream_data); + c_dbcsr_acc_opencl_config.stream_data = NULL; + c_dbcsr_acc_opencl_config.streams = NULL; + c_dbcsr_acc_opencl_config.nstreams = 0; + result = EXIT_FAILURE; + } + /* allocate and initialize events registry */ c_dbcsr_acc_opencl_config.nevents = nhandles; - c_dbcsr_acc_opencl_config.events = (void**)malloc(sizeof(void*) * nhandles); - if (NULL != c_dbcsr_acc_opencl_config.events && NULL != c_dbcsr_acc_opencl_config.storage) { - libxsmm_pmalloc_init(sizeof(void*), &c_dbcsr_acc_opencl_config.nevents, c_dbcsr_acc_opencl_config.events, - c_dbcsr_acc_opencl_config.storage); + c_dbcsr_acc_opencl_config.events = (cl_event**)malloc(sizeof(cl_event*) * nhandles); + c_dbcsr_acc_opencl_config.event_data = malloc(sizeof(void*) * nhandles); + if (NULL != c_dbcsr_acc_opencl_config.events && NULL != c_dbcsr_acc_opencl_config.event_data) { + c_dbcsr_acc_opencl_pmalloc_init(sizeof(cl_event*), &c_dbcsr_acc_opencl_config.nevents, + (void**)c_dbcsr_acc_opencl_config.events, c_dbcsr_acc_opencl_config.event_data); } else { free(c_dbcsr_acc_opencl_config.events); + free(c_dbcsr_acc_opencl_config.event_data); + c_dbcsr_acc_opencl_config.event_data = NULL; c_dbcsr_acc_opencl_config.events = NULL; c_dbcsr_acc_opencl_config.nevents = 0; result = EXIT_FAILURE; } - if (EXIT_SUCCESS != result) { - free(c_dbcsr_acc_opencl_config.storage); - c_dbcsr_acc_opencl_config.storage = NULL; - } - } -# endif - if (EXIT_SUCCESS == result) { - const int nelements = c_dbcsr_acc_opencl_config.nthreads * c_dbcsr_acc_opencl_config.nstreams; - c_dbcsr_acc_opencl_config.streams = (void**)calloc(nelements, sizeof(void*)); /* allocate streams */ - if (NULL == c_dbcsr_acc_opencl_config.streams) result = EXIT_FAILURE; } } } @@ -609,10 +603,12 @@ int c_dbcsr_acc_finalize(void) { int i; assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT); if (0 != c_dbcsr_acc_opencl_config.verbosity) { - cl_device_id device; + cl_device_id device = NULL; int d; fprintf(stderr, "INFO ACC/OpenCL: pid=%u nthreads=%i", libxsmm_get_pid(), c_dbcsr_acc_opencl_config.nthreads); - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device(0, &device) && + if (NULL != c_dbcsr_acc_opencl_config.device.context && + EXIT_SUCCESS == + clGetContextInfo(c_dbcsr_acc_opencl_config.device.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL) && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_id(device, NULL /*devid*/, &d)) { fprintf(stderr, " device=%i", d); @@ -628,15 +624,10 @@ int c_dbcsr_acc_finalize(void) { if (EXIT_SUCCESS == result) result = libsmm_acc_finalize(); # endif libxsmm_finalize(); - if (NULL != c_dbcsr_acc_opencl_config.device) { - for (i = 0; i < c_dbcsr_acc_opencl_config.nthreads; ++i) { - const cl_context context = c_dbcsr_acc_opencl_config.device[i].context; - if (NULL != context) { - c_dbcsr_acc_opencl_config.device[i].context = NULL; - clReleaseContext(context); /* ignore return code */ - } - } - free(c_dbcsr_acc_opencl_config.device); /* release buffer */ + if (NULL != c_dbcsr_acc_opencl_config.device.context) { + const cl_context context = c_dbcsr_acc_opencl_config.device.context; + c_dbcsr_acc_opencl_config.device.context = NULL; + clReleaseContext(context); /* ignore return code */ } for (i = 0; i < ACC_OPENCL_DEVICES_MAXCOUNT; ++i) { const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[i]; @@ -648,10 +639,17 @@ int c_dbcsr_acc_finalize(void) { c_dbcsr_acc_opencl_config.devices[i] = NULL; } } + /* destroy locks */ + for (i = 0; i < ACC_OPENCL_NLOCKS; ++i) { + ACC_OPENCL_DESTROY((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE_NBYTES * i)); + } /* release/reset buffers */ - free(c_dbcsr_acc_opencl_config.events); - free(c_dbcsr_acc_opencl_config.storage); + free(c_dbcsr_acc_opencl_config.memptrs); + free(c_dbcsr_acc_opencl_config.memptr_data); free(c_dbcsr_acc_opencl_config.streams); + free(c_dbcsr_acc_opencl_config.stream_data); + free(c_dbcsr_acc_opencl_config.events); + free(c_dbcsr_acc_opencl_config.event_data); /* clear configuration */ memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config)); } @@ -692,27 +690,6 @@ int c_dbcsr_acc_get_ndevices(int* ndevices) { } -int c_dbcsr_acc_opencl_device(int thread_id, cl_device_id* device) { - int result = EXIT_SUCCESS; - assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); - assert(NULL != device); - if (NULL != c_dbcsr_acc_opencl_config.device) { - cl_context context = c_dbcsr_acc_opencl_config.device[thread_id].context; -# if defined(_OPENMP) - if (NULL == context && 0 < thread_id) { /* fallback to master's context */ - context = c_dbcsr_acc_opencl_config.device[/*main*/ 0].context; - } -# endif - if (NULL != context) { - result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), device, NULL); - } - else *device = NULL; - } - else *device = NULL; - return result; -} - - int c_dbcsr_acc_opencl_device_id(cl_device_id device, int* device_id, int* global_id) { int result = EXIT_SUCCESS, i; assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT); @@ -808,7 +785,7 @@ int c_dbcsr_acc_opencl_device_name( assert(NULL != name || NULL != platform); if (NULL != name && 0 != name_maxlen) { result_name = clGetDeviceInfo(device, CL_DEVICE_NAME, name_maxlen, name, NULL); - if (0 != cleanup && CL_SUCCESS == result_name) { + if (0 != cleanup && EXIT_SUCCESS == result_name) { char* const part = strchr(name, ':'); if (NULL != part) *part = '\0'; } @@ -816,7 +793,7 @@ int c_dbcsr_acc_opencl_device_name( if (NULL != platform && 0 != platform_maxlen) { cl_platform_id platform_id; result_platform = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, NULL); - if (CL_SUCCESS == result_platform) { + if (EXIT_SUCCESS == result_platform) { result_platform = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_maxlen, platform, NULL); } } @@ -826,12 +803,12 @@ int c_dbcsr_acc_opencl_device_name( int c_dbcsr_acc_opencl_device_level( cl_device_id device, int* level_major, int* level_minor, char cl_std[16], cl_device_type* type) { - cl_int result = EXIT_SUCCESS; + int result = EXIT_SUCCESS; assert(NULL != device && (NULL != level_major || NULL != level_minor || NULL != cl_std || NULL != type)); if (NULL != level_major || NULL != level_minor || NULL != cl_std) { char buffer[ACC_OPENCL_BUFFERSIZE]; result = clGetDeviceInfo(device, CL_DEVICE_VERSION, ACC_OPENCL_BUFFERSIZE, buffer, NULL); - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { unsigned int cl_std_level[2]; if (2 == sscanf(buffer, "OpenCL %u.%u", cl_std_level, cl_std_level + 1)) { if (NULL != level_major) *level_major = (int)cl_std_level[0]; @@ -848,7 +825,7 @@ int c_dbcsr_acc_opencl_device_level( } else { result = clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, ACC_OPENCL_BUFFERSIZE, buffer, NULL); - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { if (2 == sscanf(buffer, "OpenCL C %u.%u", cl_std_level, cl_std_level + 1)) { const int nchar = LIBXSMM_SNPRINTF(cl_std, 16, "-cl-std=CL%u.%u", cl_std_level[0], cl_std_level[1]); if (0 >= nchar || 16 <= nchar) result = EXIT_FAILURE; @@ -907,35 +884,28 @@ int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char* const extname } -int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) { +int c_dbcsr_acc_opencl_create_context(cl_device_id active_id, cl_context* context) { cl_platform_id platform = NULL; - cl_int result; - assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); - assert(NULL == c_dbcsr_acc_opencl_config.device[thread_id].context); + int result; assert(0 < c_dbcsr_acc_opencl_config.ndevices); - assert(NULL != active_id); + assert(NULL != active_id && NULL != context); result = clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL); - assert(CL_SUCCESS != result || NULL != platform); - if (CL_SUCCESS == result) { + assert(EXIT_SUCCESS != result || NULL != platform); + if (EXIT_SUCCESS == result) { void (*const notify)( const char*, const void*, size_t, void*) = (0 != c_dbcsr_acc_opencl_config.verbosity ? c_dbcsr_acc_opencl_notify : NULL); cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, 0 /*placeholder*/, 0 /* end of properties */ }; - cl_context context = NULL; + cl_context ctx = NULL; properties[1] = (long)platform; - context = clCreateContext(properties, 1 /*num_devices*/, &active_id, notify, NULL /* user_data*/, &result); - if (CL_SUCCESS != result && CL_INVALID_DEVICE != result) { /* retry */ - context = clCreateContext(NULL /*properties*/, 1 /*num_devices*/, &active_id, notify, NULL /* user_data*/, &result); + ctx = clCreateContext(properties, 1 /*num_devices*/, &active_id, notify, NULL /* user_data*/, &result); + if (EXIT_SUCCESS != result && CL_INVALID_DEVICE != result) { /* retry */ + ctx = clCreateContext(NULL /*properties*/, 1 /*num_devices*/, &active_id, notify, NULL /* user_data*/, &result); } - if (CL_SUCCESS == result) { - assert(NULL != context); - c_dbcsr_acc_opencl_config.device[thread_id].context = context; - if (0 != thread_id) { - /* apply context to master-thread if master's context is NULL */ - LIBXSMM_ATOMIC_CMPSWP(&c_dbcsr_acc_opencl_config.device[/*main*/ 0].context, NULL, context, LIBXSMM_ATOMIC_RELAXED); - assert(NULL != c_dbcsr_acc_opencl_config.device[/*main*/ 0].context); - } + if (EXIT_SUCCESS == result) { + assert(NULL != ctx); + *context = ctx; if (0 != c_dbcsr_acc_opencl_config.verbosity) { char buffer[ACC_OPENCL_BUFFERSIZE]; int global_id = 0; @@ -955,90 +925,79 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) { } } } - else if (CL_INVALID_DEVICE == result && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)) - { - fprintf(stderr, "WARN ACC/OpenCL: if MPI-ranks target the same device in exclusive mode,\n" - " SMI must be used to enable sharing the device.\n"); + else { + if (CL_INVALID_DEVICE == result && + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)) + { + fprintf(stderr, "WARN ACC/OpenCL: if MPI-ranks target the same device in exclusive mode,\n" + " SMI must be used to enable sharing the device.\n"); + } + *context = NULL; } } return result; } -int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) { +int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_id) { int result = EXIT_SUCCESS; - cl_device_id active_id; - assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); + cl_device_id active_id = NULL; assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT); if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) { - assert(NULL != c_dbcsr_acc_opencl_config.device); + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); active_id = c_dbcsr_acc_opencl_config.devices[device_id]; if (NULL != active_id) { -# if defined(_OPENMP) -# pragma omp critical(c_dbcsr_acc_set_active_device) -# endif - { - int inherit_id = 0; - const cl_context context = c_dbcsr_acc_opencl_device_context(active_id, &inherit_id); - const cl_context inherit = c_dbcsr_acc_opencl_config.device[inherit_id].context; - if (NULL != context) { - if (context != inherit) { - if (NULL != inherit) { - c_dbcsr_acc_opencl_config.device[inherit_id].context = NULL; - result = clReleaseContext(inherit); - } - else if (thread_id != inherit_id) { - c_dbcsr_acc_opencl_config.device[inherit_id].context = context; - result = clRetainContext(context); - } - } - } - else if (NULL == c_dbcsr_acc_opencl_config.device[thread_id].context) { - result = c_dbcsr_acc_opencl_create_context(thread_id, active_id); - if (EXIT_SUCCESS == result && NULL /*context*/ != inherit) { - c_dbcsr_acc_opencl_config.device[inherit_id].context = c_dbcsr_acc_opencl_config.device[thread_id].context; - result = clReleaseContext(inherit); - } + cl_context context = c_dbcsr_acc_opencl_config.device.context; + if (NULL != context) { + cl_device_id context_id = NULL; + result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &context_id, NULL); + if (EXIT_SUCCESS == result && active_id != context_id) { + assert(NULL != context_id); + result = clReleaseContext(context); + c_dbcsr_acc_opencl_config.device.context = NULL; } - if (EXIT_SUCCESS == result) { /* update/cache device-specific information */ - result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device[thread_id].level, - c_dbcsr_acc_opencl_config.device[thread_id].level + 1, NULL /*cl_std*/, - &c_dbcsr_acc_opencl_config.device[thread_id].type); - if (EXIT_SUCCESS == result) { - char devname[ACC_OPENCL_BUFFERSIZE]; + } + if (EXIT_SUCCESS == result && NULL == c_dbcsr_acc_opencl_config.device.context) { + result = c_dbcsr_acc_opencl_create_context(active_id, &context); + } + if (EXIT_SUCCESS == result) { /* update/cache device-specific information */ + assert(NULL != context); + c_dbcsr_acc_opencl_config.device.context = context; + result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device.level, + c_dbcsr_acc_opencl_config.device.level + 1, NULL /*cl_std*/, &c_dbcsr_acc_opencl_config.device.type); + if (EXIT_SUCCESS == result) { + char devname[ACC_OPENCL_BUFFERSIZE]; # if defined(CL_VERSION_2_0) - const char* const env_svm = getenv("ACC_OPENCL_SVM"); - c_dbcsr_acc_opencl_config.device[thread_id].svm_interop = - ((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device[thread_id].level) ? 0 : atoi(env_svm)); + const char* const env_svm = getenv("ACC_OPENCL_SVM"); + c_dbcsr_acc_opencl_config.device.svm_interop = + ((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device.level) ? 0 : atoi(env_svm)); # endif - if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), - &c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL)) - { - c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE; - } - if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, - 0 /*platform_maxlen*/, /*cleanup*/ 1) || - EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid)) - { - c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1; - } - c_dbcsr_acc_opencl_config.device[thread_id].intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor( - active_id, "intel", 0 /*use_platform_name*/)); - c_dbcsr_acc_opencl_config.device[thread_id].nv = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor( - active_id, "nvidia", 0 /*use_platform_name*/)); - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) || - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/)) + if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), + &c_dbcsr_acc_opencl_config.device.unified, NULL)) + { + c_dbcsr_acc_opencl_config.device.unified = CL_FALSE; + } + if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, + 0 /*platform_maxlen*/, /*cleanup*/ 1) || + EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device.uid)) + { + c_dbcsr_acc_opencl_config.device.uid = (cl_uint)-1; + } + c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS == + c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/)); + c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS == + c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)); + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) || + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/)) + { + char buffer[ACC_OPENCL_BUFFERSIZE]; + c_dbcsr_acc_opencl_config.device.amd = 1; + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, + 0 /*platform_maxlen*/, /*cleanup*/ 1)) { - char buffer[ACC_OPENCL_BUFFERSIZE]; - c_dbcsr_acc_opencl_config.device[thread_id].amd = 1; - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, - 0 /*platform_maxlen*/, /*cleanup*/ 1)) - { - const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx"); - if (NULL != gfxname && 90 <= atoi(gfxname + 3)) { - c_dbcsr_acc_opencl_config.device[thread_id].amd = 2; - } + const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx"); + if (NULL != gfxname && 90 <= atoi(gfxname + 3)) { + c_dbcsr_acc_opencl_config.device.amd = 2; } } } @@ -1046,6 +1005,7 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) { } } else result = EXIT_FAILURE; + if (NULL != lock) ACC_OPENCL_RELEASE(lock); } return result; } @@ -1060,58 +1020,7 @@ int c_dbcsr_acc_set_active_device(int device_id) { c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif assert(0 != c_dbcsr_acc_opencl_config.ndevices); - result = c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_OMP_TID(), device_id); -# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) - c_dbcsr_timestop(&routine_handle); -# endif - ACC_OPENCL_RETURN(result); -} - - -int c_dbcsr_acc_opencl_device_synchronize(int thread_id) { - void** const streams = c_dbcsr_acc_opencl_config.streams + thread_id * c_dbcsr_acc_opencl_config.nstreams; - int result = EXIT_SUCCESS; - int i = 0; - assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); - assert(NULL != c_dbcsr_acc_opencl_config.streams); - for (; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { - void* const stream = streams[i]; - if (NULL != stream) { - if (NULL != *ACC_OPENCL_STREAM(stream)) { /* soft-error? */ - result = c_dbcsr_acc_stream_sync(stream); - if (EXIT_SUCCESS != result) break; - } - } -# if defined(ACC_OPENCL_STREAM_COMPACT) - else break; -# endif - } - return result; -} - - -int c_dbcsr_acc_device_synchronize(void) { - int result = EXIT_SUCCESS; -# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) - int routine_handle; - static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; - static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; - c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); -# endif -# if defined(_OPENMP) - if (1 < omp_get_num_threads()) { - result = c_dbcsr_acc_opencl_device_synchronize(omp_get_thread_num()); - } - else { - int i; -# pragma omp parallel for private(i) - for (i = 0; i < c_dbcsr_acc_opencl_config.nthreads; ++i) { - ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_opencl_device_synchronize(i)); - } - } -# else - result = c_dbcsr_acc_opencl_device_synchronize(/*main*/ 0); -# endif + result = c_dbcsr_acc_opencl_set_active_device(c_dbcsr_acc_opencl_config.lock_main, device_id); # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif @@ -1151,14 +1060,19 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max } -int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind, - const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen) { +int c_dbcsr_acc_opencl_flags_atomics(const c_dbcsr_acc_opencl_device_t* devinfo, c_dbcsr_acc_opencl_atomic_fp_t kind, + const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen) { + cl_device_id device_id = NULL; int result = 0, ext1, ext2; - for (ext1 = 0; ext1 < exts_maxlen; ++ext1) + for (ext1 = 0; ext1 < exts_maxlen; ++ext1) { if (NULL == exts[ext1] || '\0' == *exts[ext1]) break; - for (ext2 = ext1 + 1; ext2 < exts_maxlen; ++ext2) + } + for (ext2 = ext1 + 1; ext2 < exts_maxlen; ++ext2) { if (NULL == exts[ext2] || '\0' == *exts[ext2]) break; - if (NULL != devinfo && ext2 < exts_maxlen) { + } + if (NULL != devinfo && ext2 < exts_maxlen && + EXIT_SUCCESS == clGetContextInfo(devinfo->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device_id, NULL)) + { const char* atomic_type = ""; switch (kind) { case c_dbcsr_acc_opencl_atomic_fp_64: { @@ -1219,8 +1133,9 @@ int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_ const int force_atomics = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics)); if (NULL == env_atomics || '\0' == *env_atomics || 0 != force_atomics) { cl_bitfield fp_atomics; - if (CL_SUCCESS == clGetDeviceInfo(device_id, (cl_device_info)(c_dbcsr_acc_opencl_atomic_fp_64 == kind ? 0x4232 : 0x4231), - sizeof(cl_bitfield), &fp_atomics, NULL) && + if (EXIT_SUCCESS == clGetDeviceInfo(device_id, + (cl_device_info)(c_dbcsr_acc_opencl_atomic_fp_64 == kind ? 0x4232 : 0x4231), sizeof(cl_bitfield), + &fp_atomics, NULL) && 0 != (/*add*/ (1 << 1) & fp_atomics)) { exts[ext2] = "cl_ext_float_atomics"; @@ -1317,17 +1232,18 @@ int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const char kernel_name[], const char build_params[], const char build_options[], const char try_build_options[], int* try_ok, const char* const extnames[], int num_exts, cl_kernel* kernel) { - char buffer[ACC_OPENCL_BUFFERSIZE] = "", cl_std[16]; - char buffer_name[ACC_OPENCL_MAXSTRLEN * 2]; - int tid = 0, ok = EXIT_SUCCESS, source_is_cl = 1, nchar, level_major, level_minor; - const cl_context context = c_dbcsr_acc_opencl_context(&tid); + char buffer[ACC_OPENCL_BUFFERSIZE] = "", buffer_name[ACC_OPENCL_MAXSTRLEN * 2], cl_std[16]; + int ok = EXIT_SUCCESS, source_is_cl = 1, nchar, level_major, level_minor; cl_device_id active_id = NULL; - cl_int result = ((NULL != source && NULL != kernel_name && '\0' != *kernel_name && NULL != kernel) - ? c_dbcsr_acc_opencl_device(tid, &active_id) - : EXIT_FAILURE); + int result = ((NULL != source && NULL != kernel_name && '\0' != *kernel_name) + ? clGetContextInfo( + c_dbcsr_acc_opencl_config.device.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &active_id, NULL) + : EXIT_FAILURE); cl_program program = NULL; FILE* file_src = NULL; size_t size_src = 0; + assert(NULL != kernel); + *kernel = NULL; if (EXIT_SUCCESS == result) { result = c_dbcsr_acc_opencl_device_level(active_id, &level_major, &level_minor, cl_std, NULL /*type*/); if (0 != source_is_file) file_src = fopen(source, "rb"); @@ -1488,20 +1404,20 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha } } # endif - program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result); - if (CL_SUCCESS == result) { + program = clCreateProgramWithSource(c_dbcsr_acc_opencl_config.device.context, 1 /*nlines*/, &ext_source, NULL, &result); + if (EXIT_SUCCESS == result) { assert(NULL != program); result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } - if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { + if (EXIT_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ - program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result); - assert(CL_SUCCESS != result || NULL != program); - if (CL_SUCCESS == result) { + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ + program = clCreateProgramWithSource(c_dbcsr_acc_opencl_config.device.context, 1 /*nlines*/, &ext_source, NULL, &result); + assert(EXIT_SUCCESS != result || NULL != program); + if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } } @@ -1513,58 +1429,35 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha libxsmm_free(p); } buffer[0] = '\0'; /* reset to empty */ - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { *kernel = clCreateKernel(program, kernel_name, &result); - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { assert(NULL != *kernel); if (NULL == file_src && (2 <= c_dbcsr_acc_opencl_config.dump || 0 > c_dbcsr_acc_opencl_config.dump)) { unsigned char* binary = NULL; size_t size; - binary = (unsigned char*)(CL_SUCCESS == clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL) + binary = (unsigned char*)(EXIT_SUCCESS == + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL) ? libxsmm_aligned_scratch(size, 0 /*auto-align*/) : NULL); if (NULL != binary) { result = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &binary, NULL); - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { /* successfully queried program binary */ FILE* file; nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s.dump", kernel_name); file = (0 < nchar && (int)sizeof(buffer) > nchar) ? fopen(buffer, "wb") : NULL; buffer[0] = '\0'; /* reset to empty */ if (NULL != file) { - if (size != fwrite(binary, 1, size, file)) { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseKernel(*kernel)); - result = EXIT_FAILURE; - } + if (size != fwrite(binary, 1, size, file)) result = EXIT_FAILURE; fclose(file); } - else { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseKernel(*kernel)); - result = EXIT_FAILURE; - } - } - else { /* error: querying program binary */ - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseKernel(*kernel)); + else result = EXIT_FAILURE; } libxsmm_free(binary); } - else { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseKernel(*kernel)); - result = EXIT_FAILURE; - } + else result = EXIT_FAILURE; } } - else { /* error: creating kernel */ - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - } - } - else { - ACC_OPENCL_EXPECT( - CL_SUCCESS == clGetProgramBuildInfo(program, active_id, CL_PROGRAM_BUILD_LOG, ACC_OPENCL_BUFFERSIZE, buffer, NULL)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); } } else if (source != ext_source) { /* error: creating program */ @@ -1575,61 +1468,52 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha } else if (EXIT_SUCCESS == result) { /* binary representation */ # if defined(CL_VERSION_2_1) - if (0 != c_dbcsr_acc_opencl_config.dump) program = clCreateProgramWithIL(context, source, size_src, &result); + if (0 != c_dbcsr_acc_opencl_config.dump) + program = clCreateProgramWithIL(c_dbcsr_acc_opencl_config.device.context, source, size_src, &result); else # endif { - program = clCreateProgramWithBinary( - context, 1, &active_id, &size_src, (const unsigned char**)&source, NULL /*binary_status*/, &result); + program = clCreateProgramWithBinary(c_dbcsr_acc_opencl_config.device.context, 1, &active_id, &size_src, + (const unsigned char**)&source, NULL /*binary_status*/, &result); } - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { assert(NULL != program); result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } - if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { + if (EXIT_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ # if defined(CL_VERSION_2_1) - if (0 != c_dbcsr_acc_opencl_config.dump) program = clCreateProgramWithIL(context, source, size_src, &result); + if (0 != c_dbcsr_acc_opencl_config.dump) + program = clCreateProgramWithIL(c_dbcsr_acc_opencl_config.device.context, source, size_src, &result); else # endif { - program = clCreateProgramWithBinary( - context, 1, &active_id, &size_src, (const unsigned char**)&source, NULL /*binary_status*/, &result); + program = clCreateProgramWithBinary(c_dbcsr_acc_opencl_config.device.context, 1, &active_id, &size_src, + (const unsigned char**)&source, NULL /*binary_status*/, &result); } - assert(CL_SUCCESS != result || NULL != program); - if (CL_SUCCESS == result) { + assert(EXIT_SUCCESS != result || NULL != program); + if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } } ok = EXIT_FAILURE; } - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { *kernel = clCreateKernel(program, kernel_name, &result); - assert(CL_SUCCESS != result || NULL != *kernel); - if (CL_SUCCESS != result) { /* error: creating kernel */ # if defined(CL_VERSION_1_2) - /* discover available kernels in program, and adopt the last kernel listed */ - if (CL_SUCCESS == clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, sizeof(char*), buffer, NULL) && '\0' != *buffer) { - const char *const semicolon = strrchr(buffer, ';'), *const name = (NULL == semicolon ? buffer : (semicolon + 1)); - *kernel = clCreateKernel(program, name, &result); - assert(CL_SUCCESS != result || NULL != *kernel); - if (CL_SUCCESS != result) ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - } - else -# endif - { - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); - } + /* error creating kernel: discover available kernels in program, and adopt the last kernel listed */ + if (EXIT_SUCCESS != result && + EXIT_SUCCESS == clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, sizeof(char*), buffer, NULL) && '\0' != *buffer) + { + const char *const semicolon = strrchr(buffer, ';'), *const name = (NULL == semicolon ? buffer : (semicolon + 1)); + *kernel = clCreateKernel(program, name, &result); } - } - else { - ACC_OPENCL_EXPECT( - CL_SUCCESS == clGetProgramBuildInfo(program, active_id, CL_PROGRAM_BUILD_LOG, ACC_OPENCL_BUFFERSIZE, buffer, NULL)); - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); +# endif + assert(EXIT_SUCCESS != result || NULL != *kernel); } } } @@ -1639,9 +1523,21 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha assert(0 != source_is_file); libxsmm_free(p); } -# if !defined(NDEBUG) - if (EXIT_SUCCESS != result && NULL != kernel) *kernel = NULL; -# endif + if (NULL != program) { + if (EXIT_SUCCESS != result && NULL != *kernel) { + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseKernel(*kernel)); + *kernel = NULL; + } + if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { + if (EXIT_SUCCESS == clGetProgramBuildInfo(program, active_id, CL_PROGRAM_BUILD_LOG, ACC_OPENCL_BUFFERSIZE, buffer, NULL)) { + const char* info = buffer; + while (NULL != strchr("\n\r\t ", *info)) ++info; /* remove trailing newline etc. */ + if ('\0' != *info) fprintf(stderr, "INFO ACC/OpenCL: %s\n", info); + } + else buffer[0] = '\0'; /* reset to empty */ + } + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseProgram(program)); /* release in any case (EXIT_SUCCESS) */ + } if (NULL != try_ok) *try_ok = result | ok; ACC_OPENCL_RETURN_CAUSE(result, buffer); } diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 58e1e4ea9e9..2bcc52ce964 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -34,10 +34,6 @@ # endif #endif -#if !defined(LIBXSMM_SYNC_NPAUSE) -# define LIBXSMM_SYNC_NPAUSE 0 -#endif - #if defined(__LIBXSMM) && !defined(LIBXSMM_DEFAULT_CONFIG) # include # if !defined(LIBXSMM_TIMER_H) @@ -59,12 +55,6 @@ LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH) #endif -#if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER -# define LIBXSMM_STRISTR libxsmm_stristr -#else -# define LIBXSMM_STRISTR strstr -#endif - #include "../acc.h" #if !defined(NDEBUG) # include @@ -75,6 +65,14 @@ #if !defined(ACC_OPENCL_CACHELINE_NBYTES) # define ACC_OPENCL_CACHELINE_NBYTES LIBXSMM_CACHELINE #endif +#if !defined(ACC_OPENCL_ATOMIC_KIND) +# define ACC_OPENCL_ATOMIC_KIND LIBXSMM_ATOMIC_SEQ_CST +#endif +#if defined(LIBXSMM_ATOMIC_LOCKTYPE) +# define ACC_OPENCL_ATOMIC_LOCKTYPE volatile LIBXSMM_ATOMIC_LOCKTYPE +#else +# define ACC_OPENCL_ATOMIC_LOCKTYPE volatile int +#endif #if !defined(ACC_OPENCL_MAXALIGN_NBYTES) # define ACC_OPENCL_MAXALIGN_NBYTES (2 << 20 /*2MB*/) #endif @@ -91,22 +89,10 @@ #if !defined(ACC_OPENCL_HANDLES_MAXCOUNT) # define ACC_OPENCL_HANDLES_MAXCOUNT 64 #endif -/** Counted on a per-thread basis! */ -#if !defined(ACC_OPENCL_STREAMS_MAXCOUNT) -# define ACC_OPENCL_STREAMS_MAXCOUNT 64 -#endif -#if !defined(ACC_OPENCL_OVERMALLOC) -# if defined(__DBCSR_ACC) || 1 -# define ACC_OPENCL_OVERMALLOC 0 -# else -# define ACC_OPENCL_OVERMALLOC 8192 -# endif -#endif /* First char is CSV-separator by default (w/o spaces) */ #if !defined(ACC_OPENCL_DELIMS) # define ACC_OPENCL_DELIMS ",;" #endif - #if !defined(ACC_OPENCL_LAZYINIT) && (defined(__DBCSR_ACC) || 1) # define ACC_OPENCL_LAZYINIT #endif @@ -115,46 +101,61 @@ # define ACC_OPENCL_STREAM_PRIORITIES # endif #endif -/** Streams are registered in compact/consecutive fashion */ -#if !defined(ACC_OPENCL_STREAM_COMPACT) && 1 -# define ACC_OPENCL_STREAM_COMPACT -#endif /** Stream-argument (ACC-interface) can be NULL (synchronous) */ #if !defined(ACC_OPENCL_STREAM_NULL) && 1 # define ACC_OPENCL_STREAM_NULL #endif - -/** Automatically determine cl_mem offset */ -#if !defined(ACC_OPENCL_MEM_OFFSET) && 1 -# define ACC_OPENCL_MEM_OFFSET +#if !defined(ACC_OPENCL_OMPLOCKS) && 0 +# define ACC_OPENCL_OMPLOCKS #endif - /** Use DBCSR's profile for detailed timings */ #if !defined(ACC_OPENCL_PROFILE) && 0 # define ACC_OPENCL_PROFILE #endif -/* attaching c_dbcsr_acc_opencl_info_stream_t is needed */ -#define ACC_OPENCL_STREAM(A) ((cl_command_queue*)(A)) +/* attaching c_dbcsr_acc_opencl_stream_t is needed */ +#define ACC_OPENCL_STREAM(A) ((const c_dbcsr_acc_opencl_stream_t*)(A)) /* incompatible with c_dbcsr_acc_event_record */ -#define ACC_OPENCL_EVENT(A) ((cl_event*)(A)) +#define ACC_OPENCL_EVENT(A) ((const cl_event*)(A)) #if defined(_OPENMP) # include # define ACC_OPENCL_OMP_TID() omp_get_thread_num() #else # define ACC_OPENCL_OMP_TID() (/*main*/ 0) +# undef ACC_OPENCL_OMPLOCKS #endif -#if 1 -# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER -# define ACC_OPENCL_EXPECT(EXPR) LIBXSMM_EXPECT(EXPR) -# else -# define ACC_OPENCL_EXPECT(EXPR) \ - if (0 == (EXPR)) assert(0); -# endif -#else /* elide */ -# define ACC_OPENCL_EXPECT(EXPR) (void)(EXPR) +#define ACC_OPENCL_ATOMIC_ACQUIRE(LOCK) \ + do { \ + LIBXSMM_ATOMIC_ACQUIRE(LOCK, 0 /*LIBXSMM_SYNC_NPAUSE*/, ACC_OPENCL_ATOMIC_KIND); \ + } while (0) +#define ACC_OPENCL_ATOMIC_RELEASE(LOCK) \ + do { \ + LIBXSMM_ATOMIC_RELEASE(LOCK, ACC_OPENCL_ATOMIC_KIND); \ + } while (0) + +#if defined(ACC_OPENCL_OMPLOCKS) +# define ACC_OPENCL_INIT(LOCK) omp_init_lock(LOCK) +# define ACC_OPENCL_DESTROY(LOCK) omp_destroy_lock(LOCK) +# define ACC_OPENCL_ACQUIRE(LOCK) omp_set_lock(LOCK) +# define ACC_OPENCL_RELEASE(LOCK) omp_unset_lock(LOCK) +# define ACC_OPENCL_LOCKTYPE omp_lock_t +#else +# define ACC_OPENCL_INIT(LOCK) (*(LOCK) = 0) +# define ACC_OPENCL_DESTROY(LOCK) +# define ACC_OPENCL_ACQUIRE(LOCK) ACC_OPENCL_ATOMIC_ACQUIRE(LOCK) +# define ACC_OPENCL_RELEASE(LOCK) ACC_OPENCL_ATOMIC_RELEASE(LOCK) +# define ACC_OPENCL_LOCKTYPE ACC_OPENCL_ATOMIC_LOCKTYPE +#endif + +#if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER +# define ACC_OPENCL_EXPECT(EXPR) LIBXSMM_EXPECT(EXPR) +# define LIBXSMM_STRISTR libxsmm_stristr +#else +# define ACC_OPENCL_EXPECT(EXPR) \ + if (0 == (EXPR)) assert(0); +# define LIBXSMM_STRISTR strstr #endif #if !defined(NDEBUG) && 1 @@ -163,15 +164,15 @@ if (EXIT_SUCCESS == (RESULT)) { \ (RESULT) = (EXPR); \ assert((MSG) && *(MSG)); \ - if (CL_SUCCESS != (RESULT)) { \ - assert(CL_SUCCESS == EXIT_SUCCESS); \ + if (EXIT_SUCCESS != (RESULT)) { \ + assert(EXIT_SUCCESS == EXIT_SUCCESS); \ if (-1001 != (RESULT)) { \ fprintf(stderr, "ERROR ACC/OpenCL: " MSG); \ if (EXIT_FAILURE != (RESULT)) { \ fprintf(stderr, " (code=%i)", RESULT); \ } \ fprintf(stderr, ".\n"); \ - assert(CL_SUCCESS != (RESULT)); \ + assert(EXIT_SUCCESS != (RESULT)); \ } \ else { \ fprintf(stderr, "ERROR ACC/OpenCL: incomplete installation (" MSG ").\n"); \ @@ -238,12 +239,31 @@ typedef struct c_dbcsr_acc_opencl_device_t { cl_int intel, amd, nv; } c_dbcsr_acc_opencl_device_t; +/** Information about host/device-memory pointer. */ +typedef struct c_dbcsr_acc_opencl_info_memptr_t { + cl_mem memory; + void* memptr; +} c_dbcsr_acc_opencl_info_memptr_t; + +/** Information about streams (c_dbcsr_acc_stream_create). */ +typedef struct c_dbcsr_acc_opencl_stream_t { + cl_command_queue queue; + int tid, priority; +} c_dbcsr_acc_opencl_stream_t; + /** Enumeration of timer kinds used for built-in execution-profile. */ typedef enum c_dbcsr_acc_opencl_timer_t { c_dbcsr_acc_opencl_timer_device, c_dbcsr_acc_opencl_timer_host } c_dbcsr_acc_opencl_timer_t; +/** Enumeration of FP-atomic kinds. */ +typedef enum c_dbcsr_acc_opencl_atomic_fp_t { + c_dbcsr_acc_opencl_atomic_fp_no = 0, + c_dbcsr_acc_opencl_atomic_fp_32 = 1, + c_dbcsr_acc_opencl_atomic_fp_64 = 2 +} c_dbcsr_acc_opencl_atomic_fp_t; + /** * Settings discovered/setup during c_dbcsr_acc_init (independent of the device) * and settings updated during c_dbcsr_acc_set_active_device (devinfo). @@ -252,13 +272,17 @@ typedef struct c_dbcsr_acc_opencl_config_t { /** Table of ordered viable/discovered devices (matching criterion). */ cl_device_id devices[ACC_OPENCL_DEVICES_MAXCOUNT]; /** Table of devices (thread-specific). */ - c_dbcsr_acc_opencl_device_t* device; + c_dbcsr_acc_opencl_device_t device; + /** Locks used by domain. */ + ACC_OPENCL_LOCKTYPE *lock_main, *lock_stream, *lock_memory, *lock_memset, *lock_memcpy; /** Handle-counter. */ - size_t nclmems, nevents; - /** All handles and related storage. */ - void **clmems, **events, *storage; + size_t nmemptrs, nstreams, nevents; + /** All memptrs and related storage. */ + c_dbcsr_acc_opencl_info_memptr_t **memptrs, *memptr_data; /** All created streams partitioned by thread-ID (thread-local slots). */ - void** streams; + c_dbcsr_acc_opencl_stream_t **streams, *stream_data; + /** All events and related storage. */ + cl_event **events, *event_data; /** Kind of timer used for built-in execution-profile. */ c_dbcsr_acc_opencl_timer_t timer; /* c_dbcsr_acc_opencl_device_t? */ /** Kernel-parameters are matched against device's UID */ @@ -269,8 +293,6 @@ typedef struct c_dbcsr_acc_opencl_config_t { cl_int ndevices; /** Maximum number of threads (omp_get_max_threads). */ cl_int nthreads; - /** Maximum number of streams per thread. */ - cl_int nstreams; /** How to apply/use stream priorities. */ cl_int priority; /** How to zero/copy device-side buffers. */ @@ -286,39 +308,25 @@ typedef struct c_dbcsr_acc_opencl_config_t { /** Global configuration setup in c_dbcsr_acc_init. */ extern c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config; -/** Contexts implement 1:1 relation with device. */ -cl_context c_dbcsr_acc_opencl_context(int* thread_id); -/** Share context for given device (start searching at optional thread_id), or return NULL). */ -cl_context c_dbcsr_acc_opencl_device_context(cl_device_id device, const int* thread_id); - -/** Information about host-memory pointer (c_dbcsr_acc_host_mem_allocate). */ -typedef struct c_dbcsr_acc_opencl_info_hostptr_t { - cl_mem memory; - void* mapped; -} c_dbcsr_acc_opencl_info_hostptr_t; -c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory); - -/** Determines cl_mem object and offset of memory. */ -void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset); - -/** Information about streams (c_dbcsr_acc_stream_create). */ -typedef struct c_dbcsr_acc_opencl_info_stream_t { - void* pointer; - int priority; - int tid; -} c_dbcsr_acc_opencl_info_stream_t; -c_dbcsr_acc_opencl_info_stream_t* c_dbcsr_acc_opencl_info_stream(void* stream); -const int* c_dbcsr_acc_opencl_stream_priority(const void* stream); - -void* c_dbcsr_acc_opencl_stream_default(void); - -/** Get host-pointer associated with device-memory (c_dbcsr_acc_dev_mem_allocate). */ -void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory); +/** Determines device-side value of device-memory. */ +int c_dbcsr_acc_opencl_get_ptr( + ACC_OPENCL_LOCKTYPE* lock, const c_dbcsr_acc_opencl_stream_t* stream, void** dev_mem, cl_mem memory, size_t offset); +/** Determines cl_mem object and storage pointer. */ +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory); +/** Determines cl_mem object and memory offset (device). */ +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr_lock( + ACC_OPENCL_LOCKTYPE* lock, const void* memory, size_t elsize, const size_t* amount, size_t* offset); +/** Determines cl_mem object and memory offset (device). */ +const c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr( + const void* memory, size_t elsize, const size_t* amount, size_t* offset); +/** Finds an existing stream for the given thread-ID (or NULL). */ +const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream(ACC_OPENCL_LOCKTYPE* lock, int thread_id); +/** Determines default-stream (see ACC_OPENCL_STREAM_NULL). */ +const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream_default(void); +/** Like c_dbcsr_acc_memset_zero, but supporting an arbitrary value used as initialization pattern. */ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nbytes, void* stream); /** Amount of device memory; local memory is only non-zero if separate from global. */ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total, size_t* mem_local, int* mem_unified); -/** Get device associated with thread-ID. */ -int c_dbcsr_acc_opencl_device(int thread_id, cl_device_id* device); /** Get device-ID for given device, and optionally global device-ID. */ int c_dbcsr_acc_opencl_device_id(cl_device_id device, int* device_id, int* global_id); /** Confirm the vendor of the given device. */ @@ -332,10 +340,10 @@ int c_dbcsr_acc_opencl_device_name( int c_dbcsr_acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor, char cl_std[16], cl_device_type* type); /** Check if given device supports the extensions. */ int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char* const extnames[], int num_exts); -/** Create context for given thread-ID and device. */ -int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id device_id); +/** Create context for given device. */ +int c_dbcsr_acc_opencl_create_context(cl_device_id device_id, cl_context* context); /** Internal variant of c_dbcsr_acc_set_active_device. */ -int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id); +int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_id); /** Get preferred multiple and max. size of workgroup (kernel- or device-specific). */ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max_value, size_t* preferred_multiple); /** @@ -347,25 +355,21 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha const char build_options[], const char try_build_options[], int* try_ok, const char* const extnames[], int num_exts, cl_kernel* kernel); /** Per-thread variant of c_dbcsr_acc_device_synchronize. */ -int c_dbcsr_acc_opencl_device_synchronize(int thread_id); -/** Create user-event if not created and sets initial state. */ -int c_dbcsr_acc_opencl_event_create(cl_event* event_p); - -/** Enumeration of FP-atomic kinds. */ -typedef enum c_dbcsr_acc_opencl_atomic_fp_t { - c_dbcsr_acc_opencl_atomic_fp_no = 0, - c_dbcsr_acc_opencl_atomic_fp_32 = 1, - c_dbcsr_acc_opencl_atomic_fp_64 = 2 -} c_dbcsr_acc_opencl_atomic_fp_t; - +int c_dbcsr_acc_opencl_device_synchronize(ACC_OPENCL_LOCKTYPE* lock, int thread_id); /** Assemble flags to support atomic operations. */ -int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind, - const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen); - +int c_dbcsr_acc_opencl_flags_atomics(const c_dbcsr_acc_opencl_device_t* devinfo, c_dbcsr_acc_opencl_atomic_fp_t kind, + const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen); /** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */ int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[], const char cl_std[], char buffer[], size_t buffer_size); +/** Support older LIBXSMM (libxsmm_pmalloc_init). */ +void c_dbcsr_acc_opencl_pmalloc_init(size_t size, size_t* num, void* pool[], void* storage); +/** Support older LIBXSMM (libxsmm_pmalloc). */ +void* c_dbcsr_acc_opencl_pmalloc(void* pool[], size_t* i); +/** Support older LIBXSMM (libxsmm_pfree). */ +void c_dbcsr_acc_opencl_pfree(const void* pointer, void* pool[], size_t* i); + #if defined(__cplusplus) } #endif diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index a755fa42741..c98f20f5230 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -9,17 +9,8 @@ #if defined(__OPENCL) # include "acc_opencl.h" -# if defined(CL_VERSION_1_2) -# define ACC_OPENCL_WAIT_EVENT(QUEUE, EVENT) clEnqueueMarkerWithWaitList(QUEUE, 1, EVENT, NULL) -# else -# define ACC_OPENCL_WAIT_EVENT(QUEUE, EVENT) clEnqueueWaitForEvents(QUEUE, 1, EVENT) -# endif - -# if !defined(ACC_OPENCL_EVENT_BARRIER) && 0 -# define ACC_OPENCL_EVENT_BARRIER -# endif -# if !defined(ACC_OPENCL_EVENT_CREATE) && 0 -# define ACC_OPENCL_EVENT_CREATE +# if !defined(ACC_OPENCL_EVENT_FLUSH) && 0 +# define ACC_OPENCL_EVENT_FLUSH # endif @@ -27,69 +18,18 @@ extern "C" { # endif -int c_dbcsr_acc_opencl_event_create(cl_event* event_p) { - int result; - assert(NULL != event_p); - if (NULL != *event_p) result = EXIT_SUCCESS; - else { - *event_p = clCreateUserEvent(c_dbcsr_acc_opencl_context(NULL /*tid*/), &result); - } - if (CL_SUCCESS == result) { - assert(NULL != *event_p); - /* an empty event (unrecorded) has no work to wait for; hence it is - * considered occurred and c_dbcsr_acc_event_synchronize must not block - */ - result = clSetUserEventStatus(*event_p, CL_COMPLETE); - if (CL_SUCCESS != result) { /* error: setting initial event state */ - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseEvent(*event_p)); - *event_p = NULL; - } - } - else { - *event_p = NULL; /* error: creating user-defined event */ - } - return result; -} - - int c_dbcsr_acc_event_create(void** event_p) { int result = EXIT_SUCCESS; - cl_event event = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif - assert(NULL != event_p); -# if defined(ACC_OPENCL_EVENT_CREATE) - result = c_dbcsr_acc_opencl_event_create(&event); - assert(NULL != event || EXIT_SUCCESS != result); - if (EXIT_SUCCESS == result) -# endif - { - assert(NULL == c_dbcsr_acc_opencl_config.events || sizeof(void*) >= sizeof(cl_event)); - *event_p = ( -# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && defined(ACC_OPENCL_HANDLES_MAXCOUNT) && \ - (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - NULL != c_dbcsr_acc_opencl_config.events - ? libxsmm_pmalloc(c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents) - : -# endif - malloc(sizeof(cl_event))); - if (NULL != *event_p) { - *(cl_event*)*event_p = event; - } - else { - if (NULL != event) ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseEvent(event)); - result = EXIT_FAILURE; - } - } -# if defined(ACC_OPENCL_EVENT_CREATE) - else { - *event_p = NULL; /* error: creating user-defined event */ - } -# endif + assert(NULL != c_dbcsr_acc_opencl_config.events && NULL != event_p); + *event_p = c_dbcsr_acc_opencl_pmalloc((void**)c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents); + if (NULL != *event_p) *(cl_event*)*event_p = NULL; + else result = EXIT_FAILURE; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif @@ -107,18 +47,9 @@ int c_dbcsr_acc_event_destroy(void* event) { # endif if (NULL != event) { const cl_event clevent = *ACC_OPENCL_EVENT(event); + assert(NULL != c_dbcsr_acc_opencl_config.events); + c_dbcsr_acc_opencl_pfree(event, (void**)c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents); if (NULL != clevent) result = clReleaseEvent(clevent); -# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && defined(ACC_OPENCL_HANDLES_MAXCOUNT) && \ - (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - if (NULL != c_dbcsr_acc_opencl_config.events) { - /**(cl_event*)event = NULL; assert(NULL == *ACC_OPENCL_EVENT(event));*/ - libxsmm_pfree(event, c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents); - } - else -# endif - { - free(event); - } } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -129,23 +60,29 @@ int c_dbcsr_acc_event_destroy(void* event) { int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an event (device-side) */ int result = EXIT_SUCCESS; - cl_event clevent; + const c_dbcsr_acc_opencl_stream_t* str = NULL; + cl_event clevent = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif - assert(NULL != stream && NULL != event); +# if defined(ACC_OPENCL_STREAM_NULL) + str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); +# else + str = ACC_OPENCL_STREAM(stream); +# endif + assert(NULL != str && NULL != str->queue && NULL != event); clevent = *ACC_OPENCL_EVENT(event); -# if defined(ACC_OPENCL_EVENT_CREATE) - assert(NULL != clevent); + if (NULL != clevent) { +# if defined(CL_VERSION_1_2) + result = clEnqueueBarrierWithWaitList(str->queue, 1, &clevent, NULL); # else - if (NULL != clevent) + result = clEnqueueWaitForEvents(str->queue, 1, &clevent); # endif - { - result = ACC_OPENCL_WAIT_EVENT(*ACC_OPENCL_STREAM(stream), &clevent); } + /*else result = EXIT_FAILURE;*/ # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif @@ -154,7 +91,8 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an ev int c_dbcsr_acc_event_record(void* event, void* stream) { - int result; + int result = EXIT_SUCCESS; + const c_dbcsr_acc_opencl_stream_t* str = NULL; cl_event clevent = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; @@ -162,18 +100,34 @@ int c_dbcsr_acc_event_record(void* event, void* stream) { static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif - assert(NULL != event && NULL != stream); -# if defined(ACC_OPENCL_EVENT_BARRIER) && defined(CL_VERSION_1_2) - result = clEnqueueBarrierWithWaitList(*ACC_OPENCL_STREAM(stream), 0, NULL, &clevent); -# elif defined(CL_VERSION_1_2) - result = clEnqueueMarkerWithWaitList(*ACC_OPENCL_STREAM(stream), 0, NULL, &clevent); +# if defined(ACC_OPENCL_STREAM_NULL) + str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); +# else + str = ACC_OPENCL_STREAM(stream); +# endif + assert(NULL != str && NULL != str->queue && NULL != event); + clevent = *ACC_OPENCL_EVENT(event); + if (NULL != clevent) { + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseEvent(clevent)); +# if !defined(NDEBUG) + clevent = NULL; +# endif + } +# if defined(CL_VERSION_1_2) + result = clEnqueueMarkerWithWaitList(str->queue, 0, NULL, &clevent); # else - result = clEnqueueMarker(*ACC_OPENCL_STREAM(stream), &clevent); + result = clEnqueueMarker(str->queue, &clevent); # endif - if (CL_SUCCESS == result) { + if (EXIT_SUCCESS == result) { assert(NULL != clevent); +# if defined(ACC_OPENCL_EVENT_FLUSH) + result = clFlush(str->queue); + *(cl_event*)event = (EXIT_SUCCESS == result ? clevent : NULL); +# else *(cl_event*)event = clevent; +# endif } + else *(cl_event*)event = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif @@ -192,13 +146,9 @@ int c_dbcsr_acc_event_query(void* event, c_dbcsr_acc_bool_t* has_occurred) { # endif assert(NULL != event && NULL != has_occurred); result = clGetEventInfo(*ACC_OPENCL_EVENT(event), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL); - if (CL_SUCCESS == result && 0 <= status) *has_occurred = (CL_COMPLETE == status ? 1 : 0); + if (EXIT_SUCCESS == result && 0 <= status) *has_occurred = (CL_COMPLETE == status ? 1 : 0); else { /* error state */ -# if defined(ACC_OPENCL_EVENT_CREATE) - if (CL_SUCCESS == result) result = EXIT_FAILURE; -# else - result = EXIT_SUCCESS; -# endif + result = EXIT_SUCCESS; /* soft-error */ *has_occurred = 1; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -209,7 +159,7 @@ int c_dbcsr_acc_event_query(void* event, c_dbcsr_acc_bool_t* has_occurred) { int c_dbcsr_acc_event_synchronize(void* event) { /* waits on the host-side */ - int result; + int result = EXIT_SUCCESS; cl_event clevent; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; @@ -219,13 +169,7 @@ int c_dbcsr_acc_event_synchronize(void* event) { /* waits on the host-side */ # endif assert(NULL != event); clevent = *ACC_OPENCL_EVENT(event); -# if !defined(ACC_OPENCL_EVENT_CREATE) - if (NULL == clevent) { - result = EXIT_SUCCESS; - } - else -# endif - { + if (NULL != clevent) { result = clWaitForEvents(1, &clevent); } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 9f4bf1e1fa3..c01afc1b2c4 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -19,18 +19,27 @@ # include # endif -# if !defined(ACC_OPENCL_MEM_DEBUG) && !defined(NDEBUG) && 0 -# define ACC_OPENCL_MEM_DEBUG -# endif # if !defined(ACC_OPENCL_MEM_ALIGNSCALE) # define ACC_OPENCL_MEM_ALIGNSCALE 8 # endif +# if !defined(ACC_OPENCL_MEM_DEBUG) && 0 +# define ACC_OPENCL_MEM_DEBUG +# endif +# if !defined(ACC_OPENCL_MEM_NPLOCKS) +# define ACC_OPENCL_MEM_NPLOCKS 16 +# endif +# if !defined(ACC_OPENCL_MEM_TLS) && 0 +# define ACC_OPENCL_MEM_TLS +# endif # if defined(__cplusplus) extern "C" { # endif +ACC_OPENCL_ATOMIC_LOCKTYPE c_dbcsr_acc_opencl_mem_plocks[ACC_OPENCL_MEM_NPLOCKS]; + + int c_dbcsr_acc_opencl_memalignment(size_t /*size*/); int c_dbcsr_acc_opencl_memalignment(size_t size) { int result; @@ -47,66 +56,121 @@ int c_dbcsr_acc_opencl_memalignment(size_t size) { } -void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory) { - void* result = NULL; - ACC_OPENCL_EXPECT(CL_SUCCESS == clGetMemObjectInfo(memory, CL_MEM_HOST_PTR, sizeof(void*), &result, NULL)); - return result; +void c_dbcsr_acc_opencl_pmalloc_init(size_t size, size_t* num, void* pool[], void* storage) { + const unsigned int hash = libxsmm_hash(pool, sizeof(void*), 0 /*seed*/); + char* p = (char*)storage; + ACC_OPENCL_ATOMIC_LOCKTYPE* lock; + size_t n, i = 0; + assert(0 < size && NULL != num && NULL != pool && NULL != storage); + lock = c_dbcsr_acc_opencl_mem_plocks + LIBXSMM_MOD2(hash, ACC_OPENCL_MEM_NPLOCKS); + ACC_OPENCL_ATOMIC_ACQUIRE(lock); + for (n = *num; i < n; ++i, p += size) pool[i] = p; + ACC_OPENCL_ATOMIC_RELEASE(lock); +} + + +void* c_dbcsr_acc_opencl_pmalloc(void* pool[], size_t* i) { + const unsigned int hash = libxsmm_hash(pool, sizeof(void*), 0 /*seed*/); + ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = c_dbcsr_acc_opencl_mem_plocks + LIBXSMM_MOD2(hash, ACC_OPENCL_MEM_NPLOCKS); + void* pointer; + assert(NULL != pool && NULL != i); + ACC_OPENCL_ATOMIC_ACQUIRE(lock); + assert(0 < *i && ((size_t)-1) != *i); + pointer = pool[--(*i)]; + ACC_OPENCL_ATOMIC_RELEASE(lock); + assert(NULL != pointer); + return pointer; +} + + +void c_dbcsr_acc_opencl_pfree(const void* pointer, void* pool[], size_t* i) { + assert(NULL != pool && NULL != i); + if (NULL != pointer) { + const unsigned int hash = libxsmm_hash(pool, sizeof(void*), 0 /*seed*/); + ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = c_dbcsr_acc_opencl_mem_plocks + LIBXSMM_MOD2(hash, ACC_OPENCL_MEM_NPLOCKS); + ACC_OPENCL_ATOMIC_ACQUIRE(lock); + LIBXSMM_ASSIGN127(pool + *i, &pointer); + ++(*i); + ACC_OPENCL_ATOMIC_RELEASE(lock); + } } -c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) { - assert(NULL == memory || sizeof(c_dbcsr_acc_opencl_info_hostptr_t) <= (uintptr_t)memory); - return (NULL != memory ? (c_dbcsr_acc_opencl_info_hostptr_t*)((uintptr_t)memory - sizeof(c_dbcsr_acc_opencl_info_hostptr_t)) - : (c_dbcsr_acc_opencl_info_hostptr_t*)NULL); +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) { + assert(NULL == memory || sizeof(c_dbcsr_acc_opencl_info_memptr_t) <= (uintptr_t)memory); + return (NULL != memory ? (c_dbcsr_acc_opencl_info_memptr_t*)((uintptr_t)memory - sizeof(c_dbcsr_acc_opencl_info_memptr_t)) + : (c_dbcsr_acc_opencl_info_memptr_t*)NULL); } -void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset) { - void* result = NULL; -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != memory && 0 < elsize) { - const char* const buffer = (const char*)memory; +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr_lock( + ACC_OPENCL_LOCKTYPE* lock, const void* memory, size_t elsize, const size_t* amount, size_t* offset) { + c_dbcsr_acc_opencl_info_memptr_t* result = NULL; + assert(0 < elsize); + if (NULL != memory) { + const char* const pointer = (const char*)memory; const size_t n = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; - size_t i = c_dbcsr_acc_opencl_config.nclmems, hit = (size_t)-1; - for (; i < n; ++i) { - void** const handle = c_dbcsr_acc_opencl_config.clmems[i]; - char* const mem = (char*)(NULL != handle ? *handle : NULL); - if (mem == buffer) { /* fast-path */ - if (NULL != offset) *offset = 0; - assert(NULL != mem); - result = handle; - break; - } - else if (NULL != mem && mem < buffer && NULL != offset) { - size_t d = buffer - mem, s = 0; - if (d < hit && CL_SUCCESS == clGetMemObjectInfo((cl_mem)mem, CL_MEM_SIZE, sizeof(size_t), &s, NULL) && - (1 == elsize || (0 == (d % elsize) && 0 == (s % elsize))) && (NULL == amount || (*amount + d) <= s)) - { - *offset = (1 == elsize ? d : (d / elsize)); - result = handle; - hit = d; + size_t hit = (size_t)-1, i; + assert(NULL != c_dbcsr_acc_opencl_config.memptrs); + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); + for (i = c_dbcsr_acc_opencl_config.nmemptrs; i < n; ++i) { + c_dbcsr_acc_opencl_info_memptr_t* const info = c_dbcsr_acc_opencl_config.memptrs[i]; + if (NULL != info) { + char* const memptr = (char*)info->memptr; + assert(NULL != memptr); + if (memptr == pointer) { /* fast-path */ + if (NULL != offset) *offset = 0; + result = info; + break; + } + else if (memptr < pointer && NULL != offset) { + size_t d = pointer - memptr, s = d; + assert(0 != d); + if (d < hit && +# if !defined(NDEBUG) + (EXIT_SUCCESS == clGetMemObjectInfo(info->memory, CL_MEM_SIZE, sizeof(size_t), &s, NULL)) && + (NULL == amount || (*amount * elsize + d) <= s) && +# endif + (1 == elsize || (0 == (d % elsize) && 0 == (s % elsize))) && d <= s) + { + *offset = (1 == elsize ? d : (d / elsize)); + result = info; + hit = d; + } } } + else break; } + if (NULL != lock) ACC_OPENCL_RELEASE(lock); } -# else - LIBXSMM_UNUSED(memory); - LIBXSMM_UNUSED(elsize); +# if defined(NDEBUG) LIBXSMM_UNUSED(amount); - LIBXSMM_UNUSED(offset); +# endif + return result; +} + + +const c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr( + const void* memory, size_t elsize, const size_t* amount, size_t* offset) { + const c_dbcsr_acc_opencl_info_memptr_t* result = c_dbcsr_acc_opencl_info_devptr_lock( + c_dbcsr_acc_opencl_config.lock_memory, memory, elsize, amount, offset); +# if defined(ACC_OPENCL_MEM_TLS) + if (NULL != result) { + static LIBXSMM_TLS c_dbcsr_acc_opencl_info_memptr_t info; + LIBXSMM_ASSIGN127(&info, result); + result = &info; + } # endif return result; } int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) { - c_dbcsr_acc_opencl_info_stream_t* const info = c_dbcsr_acc_opencl_info_stream(stream); - const size_t size_meminfo = sizeof(c_dbcsr_acc_opencl_info_hostptr_t); + const size_t size_meminfo = sizeof(c_dbcsr_acc_opencl_info_memptr_t); const int alignment = c_dbcsr_acc_opencl_memalignment(nbytes); void* host_ptr = NULL; cl_mem memory = NULL; - cl_int result; + int result = EXIT_SUCCESS; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; @@ -114,35 +178,35 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif nbytes += alignment + size_meminfo - 1; - assert(NULL != host_mem && NULL != info); + assert(NULL != host_mem); # if defined(CL_VERSION_2_0) - if (0 != c_dbcsr_acc_opencl_config.device[info->tid].svm_interop) { - host_ptr = clSVMAlloc( - c_dbcsr_acc_opencl_config.device[info->tid].context, CL_MEM_READ_WRITE, nbytes, sizeof(void*) /*minimal alignment*/); - if (NULL == host_ptr) c_dbcsr_acc_opencl_config.device[info->tid].svm_interop = 0; /* sanitize */ + if (0 != c_dbcsr_acc_opencl_config.device.svm_interop) { + host_ptr = clSVMAlloc(c_dbcsr_acc_opencl_config.device.context, CL_MEM_READ_WRITE, nbytes, sizeof(void*) /*minimal alignment*/); + if (NULL == host_ptr) c_dbcsr_acc_opencl_config.device.svm_interop = 0; /* sanitize */ } # endif - memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device[info->tid].context, - NULL == host_ptr ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_USE_HOST_PTR, nbytes, host_ptr, &result); - assert(CL_SUCCESS == result || NULL == memory); - if (CL_SUCCESS == result) { + memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, NULL == host_ptr ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_USE_HOST_PTR, + nbytes, host_ptr, &result); + assert(EXIT_SUCCESS == result || NULL == memory); + if (EXIT_SUCCESS == result) { # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) + : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); # endif void* const mapped = clEnqueueMapBuffer( - queue, memory, CL_TRUE /*blocking*/, CL_MAP_READ | CL_MAP_WRITE, 0 /*offset*/, nbytes, 0, NULL, NULL, &result); - assert(CL_SUCCESS == result || NULL == mapped); - if (CL_SUCCESS == result) { + str->queue, memory, CL_TRUE /*blocking*/, CL_MAP_READ | CL_MAP_WRITE, 0 /*offset*/, nbytes, 0, NULL, NULL, &result); + assert(EXIT_SUCCESS == result || NULL == mapped); + if (EXIT_SUCCESS == result) { const uintptr_t address = (uintptr_t)mapped; const uintptr_t aligned = LIBXSMM_UP2(address + size_meminfo, alignment); - c_dbcsr_acc_opencl_info_hostptr_t* meminfo; + c_dbcsr_acc_opencl_info_memptr_t* meminfo; assert(address + size_meminfo <= aligned); - meminfo = (c_dbcsr_acc_opencl_info_hostptr_t*)(aligned - size_meminfo); + meminfo = (c_dbcsr_acc_opencl_info_memptr_t*)(aligned - size_meminfo); if (NULL != meminfo) { meminfo->memory = memory; - meminfo->mapped = mapped; + meminfo->memptr = mapped; *host_mem = (void*)aligned; } else { /* error: buffer info */ @@ -151,12 +215,12 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) } # if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) { - result = c_dbcsr_acc_stream_sync(&queue); + result = clFinish(str->queue); } # endif } else { /* error: mapping host buffer */ - ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseMemObject(memory)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseMemObject(memory)); *host_mem = NULL; } } @@ -179,28 +243,26 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif if (NULL != host_mem) { - c_dbcsr_acc_opencl_info_hostptr_t* const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem); + c_dbcsr_acc_opencl_info_memptr_t* const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem); if (NULL != meminfo->memory) { - const c_dbcsr_acc_opencl_info_hostptr_t info = *meminfo; /* copy meminfo prior to unmap */ + const c_dbcsr_acc_opencl_info_memptr_t info = *meminfo; /* copy meminfo prior to unmap */ # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM( + NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); # endif int result_release; - result = clEnqueueUnmapMemObject(queue, info.memory, info.mapped, 0, NULL, NULL); + assert(NULL != str && NULL != str->queue); + result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, NULL); # if defined(CL_VERSION_2_0) - { - const c_dbcsr_acc_opencl_info_stream_t* const qinfo = c_dbcsr_acc_opencl_info_stream(stream); - assert(NULL != qinfo); - if (0 != c_dbcsr_acc_opencl_config.device[qinfo->tid].svm_interop) { - clSVMFree(c_dbcsr_acc_opencl_config.device[qinfo->tid].context, info.mapped); - } + if (0 != c_dbcsr_acc_opencl_config.device.svm_interop) { + clSVMFree(c_dbcsr_acc_opencl_config.device.context, info.memptr); } # endif # if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) { - result = c_dbcsr_acc_stream_sync(&queue); + result = clFinish(str->queue); } # endif result_release = clReleaseMemObject(info.memory); @@ -221,68 +283,81 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) { - cl_int result; - int tid = 0; - const cl_context context = c_dbcsr_acc_opencl_context(&tid); - const int devuid = c_dbcsr_acc_opencl_config.device[tid].uid, - try_flag = ((0 != c_dbcsr_acc_opencl_config.device[tid].unified || 0 == c_dbcsr_acc_opencl_config.device[tid].intel || + int result; + const int devuid = c_dbcsr_acc_opencl_config.device.uid, + try_flag = ((0 != c_dbcsr_acc_opencl_config.device.unified || 0 == c_dbcsr_acc_opencl_config.device.intel || (0x4905 != devuid && 0x020a != devuid && (0x0bd0 > devuid || 0x0bdb < devuid))) ? 0 : (1u << 22)); - cl_mem buffer; + cl_mem memory = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif - assert(NULL != dev_mem && 0 <= ACC_OPENCL_OVERMALLOC); - assert(sizeof(void*) >= sizeof(cl_mem)); - assert(NULL != context); - buffer = ( + assert(NULL != dev_mem && NULL != c_dbcsr_acc_opencl_config.device.context); + memory = ( # if defined(CL_VERSION_2_0) - 0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop - ? clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nbytes + ACC_OPENCL_OVERMALLOC, - clSVMAlloc( - context, (cl_mem_flags)(CL_MEM_READ_WRITE | try_flag), nbytes + ACC_OPENCL_OVERMALLOC, 0 /*default alignment*/), + 0 != c_dbcsr_acc_opencl_config.device.svm_interop + ? clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_USE_HOST_PTR, nbytes, + clSVMAlloc(c_dbcsr_acc_opencl_config.device.context, (cl_mem_flags)(CL_MEM_READ_WRITE | try_flag), nbytes, + 0 /*default alignment*/), &result) : # endif - clCreateBuffer( - context, (cl_mem_flags)(CL_MEM_READ_WRITE | try_flag), nbytes + ACC_OPENCL_OVERMALLOC, NULL /*host_ptr*/, &result)); - if (0 != try_flag && CL_SUCCESS != result) { /* retry without try_flag */ - buffer = ( + clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, (cl_mem_flags)(CL_MEM_READ_WRITE | try_flag), nbytes, + NULL /*host_ptr*/, &result)); + if (0 != try_flag && EXIT_SUCCESS != result) { /* retry without try_flag */ + memory = ( # if defined(CL_VERSION_2_0) - 0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop - ? clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nbytes + ACC_OPENCL_OVERMALLOC, - clSVMAlloc(context, CL_MEM_READ_WRITE, nbytes + ACC_OPENCL_OVERMALLOC, 0 /*default alignment*/), &result) + 0 != c_dbcsr_acc_opencl_config.device.svm_interop + ? clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_USE_HOST_PTR, nbytes, + clSVMAlloc(c_dbcsr_acc_opencl_config.device.context, CL_MEM_READ_WRITE, nbytes, 0 /*default alignment*/), &result) : # endif - clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes + ACC_OPENCL_OVERMALLOC, NULL /*host_ptr*/, &result)); + clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_READ_WRITE, nbytes, NULL /*host_ptr*/, &result)); } if (EXIT_SUCCESS == result) { - assert(NULL != buffer); -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - assert(NULL != c_dbcsr_acc_opencl_config.clmems); - { - void** handle = libxsmm_pmalloc(c_dbcsr_acc_opencl_config.clmems, &c_dbcsr_acc_opencl_config.nclmems); - if (NULL != handle) { - *handle = buffer; -# if defined(ACC_OPENCL_MEM_DEBUG) - printf("c_dbcsr_acc_dev_mem_allocate: %p size=%llu\n", buffer, (unsigned long long)nbytes); -# endif + void* memptr = NULL; + const c_dbcsr_acc_opencl_stream_t* const stream = c_dbcsr_acc_opencl_stream_default(); + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_memory); + result = c_dbcsr_acc_opencl_get_ptr(NULL /*lock*/, stream, &memptr, memory, 0 /*offset*/); + if (EXIT_SUCCESS == result) { + c_dbcsr_acc_opencl_info_memptr_t* info = (c_dbcsr_acc_opencl_info_memptr_t*)c_dbcsr_acc_opencl_pmalloc( + (void**)c_dbcsr_acc_opencl_config.memptrs, &c_dbcsr_acc_opencl_config.nmemptrs); + assert(NULL != memory && NULL != memptr); + if (NULL != info) { + info->memory = memory; + info->memptr = memptr; } else result = EXIT_FAILURE; - } - if (EXIT_SUCCESS != result) { - *dev_mem = NULL; /* TODO: clReleaseMemObject */ - } - else + if (EXIT_SUCCESS != result) { + *dev_mem = NULL; /* TODO: clReleaseMemObject */ + } + else { +# if defined(ACC_OPENCL_MEM_DEBUG) + size_t offset = 0, amount = nbytes / 2; + info = c_dbcsr_acc_opencl_info_devptr_lock( + NULL /*lock*/, (const char*)memptr + amount, 1 /*elsize*/, NULL /*&amount*/, &offset); + fprintf(stderr, "INFO ACC/OpenCL: memory=%p pointer=%p size=%llu allocated\n", memory, memptr, (unsigned long long)nbytes); + if (NULL != info && memory == info->memory && amount == offset) # endif - { - *dev_mem = (void*)buffer; + { + *dev_mem = (void*)memptr; + } +# if defined(ACC_OPENCL_MEM_DEBUG) + else { + result = EXIT_FAILURE; + *dev_mem = NULL; + } +# endif + } + } + else { + *dev_mem = NULL; /* error: querying device pointer */ } + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory); } else { *dev_mem = NULL; /* error: creating device buffer */ @@ -302,42 +377,32 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_memory); if (NULL != dev_mem) { - const cl_mem buffer = (cl_mem)dev_mem; - assert(sizeof(void*) >= sizeof(cl_mem)); -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - assert(NULL != c_dbcsr_acc_opencl_config.clmems); -# if defined(_OPENMP) -# pragma omp critical(c_dbcsr_acc_dev_mem_deallocate) -# endif - { - void** handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); - if (NULL != handle) { - void** const pfree = c_dbcsr_acc_opencl_config.clmems[c_dbcsr_acc_opencl_config.nclmems]; - libxsmm_pfree(pfree, c_dbcsr_acc_opencl_config.clmems, &c_dbcsr_acc_opencl_config.nclmems); - *handle = *pfree; -# if defined(ACC_OPENCL_MEM_DEBUG) - printf("c_dbcsr_acc_dev_mem_deallocate: %p\n", buffer); -# endif - } -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - } -# endif + c_dbcsr_acc_opencl_info_memptr_t* const info = c_dbcsr_acc_opencl_info_devptr_lock( + NULL, dev_mem, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); + if (NULL != info && info->memptr == dev_mem && NULL != info->memory) { + c_dbcsr_acc_opencl_info_memptr_t* const pfree = c_dbcsr_acc_opencl_config.memptrs[c_dbcsr_acc_opencl_config.nmemptrs]; # if defined(CL_VERSION_2_0) - { - const int tid = ACC_OPENCL_OMP_TID(); - if (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop) { - void* const ptr = (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL); - const cl_context context = c_dbcsr_acc_opencl_context(NULL /*thread_id*/); - clSVMFree(context, ptr); + if (0 != c_dbcsr_acc_opencl_config.device.svm_interop) { + void* ptr = NULL; + /* get host-pointer associated with device-memory (c_dbcsr_acc_dev_mem_allocate) */ + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clGetMemObjectInfo(info->memory, CL_MEM_HOST_PTR, sizeof(void*), &ptr, NULL)); + clSVMFree(c_dbcsr_acc_opencl_config.device.context, ptr); } +# endif + c_dbcsr_acc_opencl_pfree(pfree, (void**)c_dbcsr_acc_opencl_config.memptrs, &c_dbcsr_acc_opencl_config.nmemptrs); + ACC_OPENCL_CHECK(clReleaseMemObject(info->memory), "release device memory buffer", result); +# if defined(ACC_OPENCL_MEM_DEBUG) + fprintf(stderr, "INFO ACC/OpenCL: memory=%p pointer=%p deallocated\n", info->memory, dev_mem); +# endif + *info = *pfree; } +# if !defined(NDEBUG) + else result = EXIT_FAILURE; # endif - ACC_OPENCL_CHECK(clReleaseMemObject(buffer), "release device memory buffer", result); } + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory); # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif @@ -345,8 +410,8 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { } -int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) { - int result; +int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* memory, size_t offset) { + int result = EXIT_SUCCESS; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; @@ -354,12 +419,12 @@ int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) { c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif assert(NULL != dev_mem); - if (NULL != other || 0 == lb) { - *dev_mem = (char*)other + lb; - result = EXIT_SUCCESS; + if (NULL != memory) { + *dev_mem = (char*)*dev_mem + offset; } else { result = EXIT_FAILURE; + *dev_mem = NULL; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -368,6 +433,44 @@ int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) { } +int c_dbcsr_acc_opencl_get_ptr( + ACC_OPENCL_LOCKTYPE* lock, const c_dbcsr_acc_opencl_stream_t* stream, void** dev_mem, cl_mem memory, size_t offset) { + int result = EXIT_SUCCESS; + assert(NULL != dev_mem); + *dev_mem = NULL; + if (NULL != memory && NULL != stream && NULL != stream->queue) { + static cl_kernel kernel = NULL; + const size_t size = 1; + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); + if (NULL == kernel) { /* generate kernel */ + const char source[] = "kernel void memptr(global unsigned long* ptr, unsigned long offset) {\n" + " const size_t i = get_global_id(0);\n" + " const union {\n" + " global unsigned long* p;\n" + " unsigned long u;\n" + " } cast = { ptr };\n" + " ptr[i] = cast.u + offset + i;\n" + "}\n"; + assert(sizeof(size_t) == sizeof(cl_ulong)); + result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, source, "memptr" /*kernel_name*/, NULL /*build_params*/, + NULL /*build_options*/, NULL /*try_build_options*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &kernel); + } + /* TODO: backup/restore memory */ + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memory), "set pointer-argument of memptr kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset), "set offset-argument of memptr kernel", result); + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel( + stream->queue, kernel, 1 /*work_dim*/, NULL /*offset*/, &size, NULL /*local_work_size*/, 0, NULL, NULL), + "launch memptr kernel", result); + ACC_OPENCL_CHECK(/* TODO: investigate issue with blocking_read=CL_TRUE */ + clEnqueueReadBuffer(stream->queue, memory, CL_TRUE, 0, sizeof(void*), dev_mem, 0, NULL, NULL), "transfer memptr", result); + if (NULL != lock) ACC_OPENCL_RELEASE(lock); + assert(EXIT_SUCCESS != result || NULL != *dev_mem); + } + else result = EXIT_FAILURE; + ACC_OPENCL_RETURN(result); +} + + int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream) { int result = EXIT_SUCCESS; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -378,36 +481,27 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v # endif assert((NULL != host_mem || 0 == nbytes) && (NULL != dev_mem || 0 == nbytes)); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { - cl_mem buffer = (cl_mem)dev_mem; size_t offset = 0; -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - assert(NULL != c_dbcsr_acc_opencl_config.clmems); - { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); - if (NULL != handle) buffer = *(cl_mem*)handle; -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - } -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - if (EXIT_SUCCESS == result) -# endif -# endif - { + const c_dbcsr_acc_opencl_info_memptr_t* const info = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); + assert(NULL != info); + if (NULL != info) { # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM( + NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); # endif + assert(NULL != str && NULL != str->queue && NULL != info->memory); result = clEnqueueWriteBuffer( - queue, buffer, 0 == (1 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL); + str->queue, info->memory, 0 == (1 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL); # if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) { - result = c_dbcsr_acc_stream_sync(&queue); + result = clFinish(str->queue); } # endif + assert(EXIT_SUCCESS == result); } + else result = EXIT_FAILURE; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -426,45 +520,35 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v # endif assert((NULL != dev_mem || 0 == nbytes) && (NULL != host_mem || 0 == nbytes)); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { - cl_mem buffer = NULL; size_t offset = 0; - LIBXSMM_ASSIGN127(&buffer, &dev_mem); -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - assert(NULL != c_dbcsr_acc_opencl_config.clmems); - { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); - if (NULL != handle) buffer = *(cl_mem*)handle; -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - } -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - if (EXIT_SUCCESS == result) -# endif -# endif - { + const c_dbcsr_acc_opencl_info_memptr_t* const info = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); + assert(NULL != info); + if (NULL != info) { # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM( + NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); -# endif - result = clEnqueueReadBuffer( - queue, buffer, 0 == (2 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL); - if (CL_SUCCESS == result) { -# if defined(ACC_OPENCL_STREAM_NULL) - if (NULL == stream) result = c_dbcsr_acc_stream_sync(&queue); -# endif - } - else { /* synchronous */ - const int result_sync = clEnqueueReadBuffer(queue, buffer, CL_TRUE, offset, nbytes, host_mem, 0, NULL, NULL); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); +# endif + assert(NULL != str && NULL != str->queue && NULL != info->memory); + if (EXIT_SUCCESS != clEnqueueReadBuffer(str->queue, info->memory, 0 == (2 & c_dbcsr_acc_opencl_config.async), offset, nbytes, + host_mem, 0, NULL, NULL) && + 0 != (2 & c_dbcsr_acc_opencl_config.async)) + { /* synchronous */ + const int result_sync = clEnqueueReadBuffer(str->queue, info->memory, CL_TRUE, offset, nbytes, host_mem, 0, NULL, NULL); c_dbcsr_acc_opencl_config.async |= 2; /* retract feature */ if (0 != c_dbcsr_acc_opencl_config.verbosity) { fprintf(stderr, "WARN ACC/OpenCL: falling back to synchronous readback (code=%i).\n", result); } result = result_sync; } +# if defined(ACC_OPENCL_STREAM_NULL) + if (NULL == stream && EXIT_SUCCESS == result) { + result = clFinish(str->queue); + } +# endif } + else result = EXIT_FAILURE; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -483,69 +567,59 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt # endif assert((NULL != devmem_src || 0 == nbytes) && (NULL != devmem_dst || 0 == nbytes)); if (NULL != devmem_src && NULL != devmem_dst && 0 != nbytes) { - cl_mem src = NULL, dst = (cl_mem)devmem_dst; - size_t src_offset = 0, dst_offset = 0; - LIBXSMM_ASSIGN127(&src, &devmem_src); -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - assert(NULL != c_dbcsr_acc_opencl_config.clmems); - { - void* const handle_src = c_dbcsr_acc_opencl_info_devptr(devmem_src, 1 /*elsize*/, &nbytes, &src_offset); - void* const handle_dst = c_dbcsr_acc_opencl_info_devptr(devmem_dst, 1 /*elsize*/, &nbytes, &dst_offset); - if (NULL != handle_src) src = *(cl_mem*)handle_src; -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - if (NULL != handle_dst) dst = *(cl_mem*)handle_dst; -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - } -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - if (EXIT_SUCCESS == result) -# endif -# endif - { + size_t offset_src = 0, offset_dst = 0; + const c_dbcsr_acc_opencl_info_memptr_t* const info_src = c_dbcsr_acc_opencl_info_devptr( + devmem_src, 1 /*elsize*/, &nbytes, &offset_src); + const c_dbcsr_acc_opencl_info_memptr_t* const info_dst = c_dbcsr_acc_opencl_info_devptr( + devmem_dst, 1 /*elsize*/, &nbytes, &offset_dst); + assert(NULL != info_src && NULL != info_dst); + if (NULL != info_src && NULL != info_dst) { # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM( + NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); # endif + assert(NULL != str && NULL != str->queue && NULL != info_src->memory && NULL != info_dst->memory); if (0 == (2 & c_dbcsr_acc_opencl_config.devcopy)) { - result = clEnqueueCopyBuffer(queue, src, dst, src_offset, dst_offset, nbytes, 0, NULL, NULL); + result = clEnqueueCopyBuffer(str->queue, info_src->memory, info_dst->memory, offset_src, offset_dst, nbytes, 0, NULL, NULL); + assert(EXIT_SUCCESS == result); } - else { - static volatile int lock; /* creating cl_kernel and clSetKernelArg must be synchronized */ + else { /* creating cl_kernel and clSetKernelArg must be synchronized */ static cl_kernel kernel = NULL; - LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_memcpy); if (NULL == kernel) { /* generate kernel */ - const char source[] = "kernel void memcpy_d2d(\n" - " global uchar *restrict dst, size_t dst_offset,\n" - " global uchar *restrict src, size_t src_offset)\n" + const char source[] = "kernel void memcpy(\n" + " global const uchar *restrict src, unsigned long offset_src,\n" + " global uchar *restrict dst, unsigned long offset_dst)\n" "{\n" " const size_t i = get_global_id(0);\n" - " dst[i+dst_offset] = src[i+src_offset];\n" + " dst[i+offset_dst] = src[i+offset_src];\n" "}\n"; - result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, source, "memcpy_d2d" /*kernel_name*/, NULL /*build_params*/, + assert(sizeof(size_t) == sizeof(cl_ulong)); + result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, source, "memcpy" /*kernel_name*/, NULL /*build_params*/, NULL /*build_options*/, NULL /*try_build_options*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &kernel); } if (EXIT_SUCCESS == result) { - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &dst), "set src argument of memcpy_d2d kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_offset), "set dst-offset of memcpy_d2d kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &src), "set dst argument of memcpy_d2d kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &src_offset), "set src-offset of memcpy_d2d kernel", result); + ACC_OPENCL_CHECK( + clSetKernelArg(kernel, 0, sizeof(cl_mem), &info_src->memory), "set src argument of memcpy kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset_src), "set src-offset of memcpy kernel", result); + ACC_OPENCL_CHECK( + clSetKernelArg(kernel, 2, sizeof(cl_mem), &info_dst->memory), "set dst argument of memcpy kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset_dst), "set dst-offset of memcpy kernel", result); ACC_OPENCL_CHECK(clEnqueueNDRangeKernel( - queue, kernel, 1 /*work_dim*/, NULL /*offset*/, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL), - "launch memcpy_d2d kernel", result); + str->queue, kernel, 1 /*work_dim*/, NULL /*offset*/, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL), + "launch memcpy kernel", result); } - LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memcpy); } # if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) { - result = c_dbcsr_acc_stream_sync(&queue); + result = clFinish(str->queue); } # endif } + else result = EXIT_FAILURE; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -564,61 +638,51 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb # endif assert(NULL != dev_mem || 0 == nbytes); if (0 != nbytes) { - cl_mem buffer = (cl_mem)dev_mem; -# if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ - defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - if (0 == offset && NULL != c_dbcsr_acc_opencl_config.clmems) { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); - if (NULL != handle) buffer = *(cl_mem*)handle; -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - else result = EXIT_FAILURE; -# endif - } -# if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) - if (EXIT_SUCCESS == result) -# endif -# endif - { + size_t offset_info = 0; + const c_dbcsr_acc_opencl_info_memptr_t* const info = c_dbcsr_acc_opencl_info_devptr( + (char*)dev_mem + offset, 1 /*elsize*/, &nbytes, &offset_info); + assert(NULL != info && offset <= offset_info); + if (NULL != info && offset <= offset_info) { # if defined(ACC_OPENCL_STREAM_NULL) - cl_command_queue queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM( + NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); # else - cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); # endif + assert(NULL != str && NULL != str->queue && NULL != info->memory); if (0 == (1 & c_dbcsr_acc_opencl_config.devcopy)) { - static LIBXSMM_TLS cl_long pattern = 0; - size_t size_of_pattern = 1; - pattern = value; /* fill with value */ - if (0 == LIBXSMM_MOD2(nbytes, sizeof(cl_long))) size_of_pattern = sizeof(cl_long); - else if (0 == LIBXSMM_MOD2(nbytes, 4)) size_of_pattern = 4; - else if (0 == LIBXSMM_MOD2(nbytes, 2)) size_of_pattern = 2; - result = clEnqueueFillBuffer(queue, buffer, &pattern, size_of_pattern, offset, nbytes, 0, NULL, NULL); + size_t size_of_value = 1; + if (0 == LIBXSMM_MOD2(nbytes, 4)) size_of_value = 4; + else if (0 == LIBXSMM_MOD2(nbytes, 2)) size_of_value = 2; + result = clEnqueueFillBuffer(str->queue, info->memory, &value, size_of_value, offset_info, nbytes, 0, NULL, NULL); } - else { - static volatile int lock; /* creating cl_kernel and clSetKernelArg must be synchronized */ + else { /* creating cl_kernel and clSetKernelArg must be synchronized */ static cl_kernel kernel = NULL; - LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_memset); if (NULL == kernel) { /* generate kernel */ - const char source[] = "kernel void memset(global uchar *restrict buffer, uchar value) {\n" + const char source[] = "kernel void memset(global uchar* buffer, uchar value) {\n" " buffer[get_global_id(0)] = value;\n" "}\n"; result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, source, "memset" /*kernel_name*/, NULL /*build_params*/, NULL /*build_options*/, NULL /*try_build_options*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &kernel); } if (EXIT_SUCCESS == result) { - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer), "set buffer argument of memset-kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_uchar), &value), "set value argument of memset-kernel", result); ACC_OPENCL_CHECK( - clEnqueueNDRangeKernel(queue, kernel, 1 /*work_dim*/, &offset, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL), + clSetKernelArg(kernel, 0, sizeof(cl_mem), &info->memory), "set buffer argument of memset-kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_uchar), &value), "set value argument of memset-kernel", result); + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel( + str->queue, kernel, 1 /*work_dim*/, &offset_info, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL), "launch memset-kernel", result); } - LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memset); } # if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) { - result = c_dbcsr_acc_stream_sync(&queue); + result = clFinish(str->queue); } # endif } + else result = EXIT_FAILURE; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -707,7 +771,9 @@ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total) { cl_device_id active_id = NULL; - int result = 0 < c_dbcsr_acc_opencl_config.ndevices ? c_dbcsr_acc_opencl_device(ACC_OPENCL_OMP_TID(), &active_id) : EXIT_FAILURE; + int result = 0 < c_dbcsr_acc_opencl_config.ndevices ? clGetContextInfo(c_dbcsr_acc_opencl_config.device.context, + CL_CONTEXT_DEVICES, sizeof(cl_device_id), &active_id, NULL) + : EXIT_FAILURE; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index cc1df74160c..78e9c2c0f48 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -27,41 +27,38 @@ int c_dbcsr_acc_opencl_stream_counter_base; int c_dbcsr_acc_opencl_stream_counter; -c_dbcsr_acc_opencl_info_stream_t* c_dbcsr_acc_opencl_info_stream(void* stream) { - assert(NULL == stream || sizeof(c_dbcsr_acc_opencl_info_stream_t) <= (uintptr_t)stream); - return ( - NULL != stream ? ((c_dbcsr_acc_opencl_info_stream_t*)((uintptr_t)stream - sizeof(c_dbcsr_acc_opencl_info_stream_t))) : NULL); -} - - -const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) { - const int* result; -# if !defined(ACC_OPENCL_STREAM_PRIORITIES) - LIBXSMM_UNUSED(stream); -# else - const c_dbcsr_acc_opencl_info_stream_t* const info = c_dbcsr_acc_opencl_info_stream((void*)stream); - if (NULL != info) { - result = &info->priority; +const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream(ACC_OPENCL_LOCKTYPE* lock, int thread_id) { + const c_dbcsr_acc_opencl_stream_t *result = NULL, *result_main = NULL; + const size_t n = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; + size_t i; + assert(NULL != c_dbcsr_acc_opencl_config.streams); + assert(thread_id < c_dbcsr_acc_opencl_config.nthreads); + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); + for (i = c_dbcsr_acc_opencl_config.nstreams; i < n; ++i) { + const c_dbcsr_acc_opencl_stream_t* const str = c_dbcsr_acc_opencl_config.streams[i]; + if (NULL != str && NULL != str->queue) { + if (str->tid == thread_id || 0 > thread_id) { /* hit */ + result = str; + break; + } + else if (NULL == result_main && 0 == str->tid) { + result_main = str; + } + } + else break; /* error */ } - else -# endif - result = NULL; + if (0 != thread_id && NULL == result) { /* fallback */ + result = result_main; + } + if (NULL != lock) ACC_OPENCL_RELEASE(lock); return result; } -void* c_dbcsr_acc_opencl_stream_default(void) { - const int tid = ACC_OPENCL_OMP_TID(), base = tid * c_dbcsr_acc_opencl_config.nstreams; - void* result = NULL; - int i = base; - assert(tid < c_dbcsr_acc_opencl_config.nthreads); - assert(NULL != c_dbcsr_acc_opencl_config.streams); - for (; i < (base + c_dbcsr_acc_opencl_config.nstreams); ++i) { - if (NULL != c_dbcsr_acc_opencl_config.streams[i]) { - result = c_dbcsr_acc_opencl_config.streams + i; - break; - } - } +const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream_default(void) { + const c_dbcsr_acc_opencl_stream_t* result = NULL; + result = c_dbcsr_acc_opencl_stream(c_dbcsr_acc_opencl_config.lock_stream, ACC_OPENCL_OMP_TID()); + assert(NULL != result); return result; } @@ -72,7 +69,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { }; int result, i, tid = 0, offset = 0; cl_command_queue queue = NULL; - cl_context context = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; @@ -106,42 +102,34 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { properties[4] = 0; /* terminator */ } # endif + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_stream); # if defined(_OPENMP) if (1 < omp_get_num_threads()) { assert(0 < c_dbcsr_acc_opencl_config.nthreads); -# if (201107 /*v3.1*/ <= _OPENMP) -# pragma omp atomic capture -# else -# pragma omp critical(c_dbcsr_acc_opencl_stream) -# endif i = c_dbcsr_acc_opencl_stream_counter++; tid = (i < c_dbcsr_acc_opencl_config.nthreads ? i : (i % c_dbcsr_acc_opencl_config.nthreads)); - if (NULL != c_dbcsr_acc_opencl_config.device) { /* inherit master's context if current context is NULL */ - LIBXSMM_ATOMIC_CMPSWP(&c_dbcsr_acc_opencl_config.device[tid].context, NULL, - c_dbcsr_acc_opencl_config.device[/*main*/ 0].context, LIBXSMM_ATOMIC_RELAXED); - } } else offset = c_dbcsr_acc_opencl_stream_counter_base++; # endif - if (NULL != c_dbcsr_acc_opencl_config.device) context = c_dbcsr_acc_opencl_config.device[tid].context; - if (NULL != context) { + if (NULL != c_dbcsr_acc_opencl_config.device.context) { cl_device_id device = NULL; - result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); - if (CL_SUCCESS == result) { - if (0 != c_dbcsr_acc_opencl_config.device[tid].intel) { + result = clGetContextInfo(c_dbcsr_acc_opencl_config.device.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); + if (EXIT_SUCCESS == result) { + if (0 != c_dbcsr_acc_opencl_config.device.intel) { const int xhints = ((1 == c_dbcsr_acc_opencl_config.xhints || 0 > c_dbcsr_acc_opencl_config.xhints) - ? (0 != c_dbcsr_acc_opencl_config.device[tid].intel ? 1 : 0) + ? (0 != c_dbcsr_acc_opencl_config.device.intel ? 1 : 0) : (c_dbcsr_acc_opencl_config.xhints >> 1)); if (0 != (1 & xhints)) { /* attempt to enable command aggregation */ const ACC_OPENCL_STREAM_PROPERTIES_TYPE props[4] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */ }; - const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, props, &result); - if (CL_SUCCESS == result) { + const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE( + c_dbcsr_acc_opencl_config.device.context, device, props, &result); + if (EXIT_SUCCESS == result) { c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host; /* force host-timer */ clReleaseCommandQueue(q); } - else result = CL_SUCCESS; + else result = EXIT_SUCCESS; } if (0 != (2 & xhints)) { /* attempt to enable queue families */ struct { @@ -151,8 +139,8 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { char name[64 /*CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL*/]; } intel_qfprops[16]; size_t nbytes = 0, i; - if (CL_SUCCESS == clGetDeviceInfo(device, 0x418B /*CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL*/, sizeof(intel_qfprops), - intel_qfprops, &nbytes)) + if (EXIT_SUCCESS == clGetDeviceInfo(device, 0x418B /*CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL*/, sizeof(intel_qfprops), + intel_qfprops, &nbytes)) { for (i = 0; (i * sizeof(*intel_qfprops)) < nbytes; ++i) { if (0 /*CL_QUEUE_DEFAULT_CAPABILITIES_INTEL*/ == intel_qfprops[i].capabilities && 1 < intel_qfprops[i].count) { @@ -173,51 +161,29 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { { properties[1] = CL_QUEUE_PROFILING_ENABLE; } - queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, properties, &result); + queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(c_dbcsr_acc_opencl_config.device.context, device, properties, &result); } } else { result = EXIT_FAILURE; } -# if defined(_OPENMP) && 0 -# pragma omp critical(c_dbcsr_acc_opencl_stream) + if (EXIT_SUCCESS == result) { /* register stream */ + assert(NULL != c_dbcsr_acc_opencl_config.streams && NULL != queue); + *stream_p = c_dbcsr_acc_opencl_pmalloc((void**)c_dbcsr_acc_opencl_config.streams, &c_dbcsr_acc_opencl_config.nstreams); + if (NULL != *stream_p) { + c_dbcsr_acc_opencl_stream_t* const str = (c_dbcsr_acc_opencl_stream_t*)*stream_p; +# if !defined(NDEBUG) + LIBXSMM_MEMZERO127(str); # endif - if (EXIT_SUCCESS == result) { - void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams; - for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { - if (NULL == streams[i]) break; - } - if (i < c_dbcsr_acc_opencl_config.nstreams) { /* register stream */ - const size_t size_info = sizeof(c_dbcsr_acc_opencl_info_stream_t); - const size_t size = sizeof(cl_command_queue) + sizeof(void*) + size_info - 1; - void* const handle = malloc(size); - assert(NULL != queue); - if (NULL != handle) { - const uintptr_t address = (uintptr_t)handle; - const uintptr_t aligned = LIBXSMM_UP2(address + size_info, sizeof(void*)); - c_dbcsr_acc_opencl_info_stream_t* const info = (c_dbcsr_acc_opencl_info_stream_t*)(aligned - size_info); - assert(address + size_info <= aligned && NULL != info); - info->pointer = (void*)address; - info->priority = priority; - info->tid = tid; - *(cl_command_queue*)aligned = queue; - streams[i] = *stream_p = (void*)aligned; - assert(queue == *ACC_OPENCL_STREAM(streams[i])); - assert(queue == *ACC_OPENCL_STREAM(*stream_p)); - } - else { - clReleaseCommandQueue(queue); - result = EXIT_FAILURE; - *stream_p = NULL; - } - } - else { - clReleaseCommandQueue(queue); - result = EXIT_FAILURE; - *stream_p = NULL; + str->queue = queue; + str->priority = priority; + str->tid = tid; } + else result = EXIT_FAILURE; } - else { + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_stream); + if (EXIT_SUCCESS != result && NULL != queue) { + clReleaseCommandQueue(queue); *stream_p = NULL; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -234,43 +200,13 @@ int c_dbcsr_acc_stream_destroy(void* stream) { static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); -# endif -# if defined(_OPENMP) -# pragma omp critical(c_dbcsr_acc_opencl_stream) # endif if (NULL != stream) { - const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); - if (NULL != queue) { - const int result_release = clReleaseCommandQueue(queue); /* soft-error */ - int tid = 0, i = c_dbcsr_acc_opencl_config.nstreams; - assert(NULL != c_dbcsr_acc_opencl_config.streams); - for (; tid < c_dbcsr_acc_opencl_config.nthreads; ++tid) { /* unregister */ - void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams; - for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { - if (stream == streams[i]) { - int k = i; -# if defined(ACC_OPENCL_STREAM_COMPACT) - const int j = i + 1; - if (j < c_dbcsr_acc_opencl_config.nstreams && NULL != streams[j]) { /* compacting streams is not thread-safe */ - k = c_dbcsr_acc_opencl_config.nstreams - j; - memmove(streams + i, streams + j, sizeof(void*) * k); - } -# endif - streams[k] = NULL; - tid = c_dbcsr_acc_opencl_config.nthreads; /* leave outer loop */ - result = result_release; /* promote */ - break; - } -# if defined(ACC_OPENCL_STREAM_COMPACT) - else if (NULL == streams[i]) { /* compact streams */ - break; - } -# endif - } - } - } - c_dbcsr_acc_opencl_stream_counter_base = c_dbcsr_acc_opencl_stream_counter = 0; /* reset */ - free(c_dbcsr_acc_opencl_info_stream(stream)->pointer); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); + const cl_command_queue queue = str->queue; + assert(NULL != c_dbcsr_acc_opencl_config.streams); + c_dbcsr_acc_opencl_pfree(stream, (void**)c_dbcsr_acc_opencl_config.streams, &c_dbcsr_acc_opencl_config.nstreams); + if (NULL != queue) result = clReleaseCommandQueue(queue); } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -295,7 +231,8 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { cl_platform_id platform = NULL; cl_device_id active_id = NULL; if (EXIT_SUCCESS == result) { - result = c_dbcsr_acc_opencl_device(ACC_OPENCL_OMP_TID(), &active_id); + result = clGetContextInfo( + c_dbcsr_acc_opencl_config.device.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &active_id, NULL); } ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL), "retrieve platform associated with active device", result); @@ -321,11 +258,8 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { int c_dbcsr_acc_stream_sync(void* stream) { - cl_command_queue queue = NULL; + const c_dbcsr_acc_opencl_stream_t* str = NULL; int result = EXIT_SUCCESS; -# if defined(ACC_OPENCL_STREAM_PRIORITIES) - const int* const priority = NULL; -# endif # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; @@ -333,11 +267,62 @@ int c_dbcsr_acc_stream_sync(void* stream) { c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif # if defined(ACC_OPENCL_STREAM_NULL) - queue = *ACC_OPENCL_STREAM(NULL != stream ? stream : c_dbcsr_acc_opencl_stream_default()); + str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); +# else + str = ACC_OPENCL_STREAM(stream); +# endif + assert(NULL != str && NULL != str->queue); + result = clFinish(str->queue); +# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) + c_dbcsr_timestop(&routine_handle); +# endif + ACC_OPENCL_RETURN(result); +} + + +int c_dbcsr_acc_opencl_device_synchronize(ACC_OPENCL_LOCKTYPE* lock, int thread_id) { + int result = EXIT_SUCCESS; + const size_t n = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; + size_t i; + assert(thread_id < c_dbcsr_acc_opencl_config.nthreads); + assert(NULL != c_dbcsr_acc_opencl_config.streams); + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); + for (i = c_dbcsr_acc_opencl_config.nstreams; i < n; ++i) { + const c_dbcsr_acc_opencl_stream_t* const str = c_dbcsr_acc_opencl_config.streams[i]; + if (NULL != str && NULL != str->queue) { + if (str->tid == thread_id || 0 > thread_id) { /* hit */ + result = clFinish(str->queue); + if (EXIT_SUCCESS != result) break; + } + } + else { /* error */ + result = EXIT_FAILURE; + break; + } + } + if (NULL != lock) ACC_OPENCL_RELEASE(lock); + return result; +} + + +int c_dbcsr_acc_device_synchronize(void) { + int result = EXIT_SUCCESS; +# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) + int routine_handle; + static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; + static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; + c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); +# endif +# if defined(_OPENMP) + if (1 == omp_get_num_threads()) { + result = c_dbcsr_acc_opencl_device_synchronize(c_dbcsr_acc_opencl_config.lock_stream, -1 /*all*/); + } + else { + result = c_dbcsr_acc_opencl_device_synchronize(NULL /*lock*/, omp_get_thread_num()); + } # else - queue = *ACC_OPENCL_STREAM(stream); + result = c_dbcsr_acc_opencl_device_synchronize(NULL /*lock*/, /*main*/ 0); # endif - result = clFinish(queue); # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif diff --git a/src/acc/opencl/common/opencl_common.h b/src/acc/opencl/common/opencl_common.h index 16326725c76..1443be58b74 100644 --- a/src/acc/opencl/common/opencl_common.h +++ b/src/acc/opencl/common/opencl_common.h @@ -11,8 +11,10 @@ #if (200 /*CL_VERSION_2_0*/ <= __OPENCL_VERSION__) || defined(__NV_CL_C_VERSION) # define UNROLL_FORCE(N) __attribute__((opencl_unroll_hint(N))) +# define UNROLL_AUTO __attribute__((opencl_unroll_hint)) #else # define UNROLL_FORCE(N) +# define UNROLL_AUTO #endif #if !defined(MIN) @@ -28,11 +30,13 @@ #if !defined(LU) || (-1 == LU) # define UNROLL_OUTER(N) # define UNROLL(N) -#else -# if (1 <= LU) +#else /* (-2) full, (-1) no hints, (0) inner, (1) outer-dehint, (2) block-m */ +# if (1 <= LU) /* outer-dehint */ # define UNROLL_OUTER(N) UNROLL_FORCE(1) -# else +# elif (-1 > LU) /* full */ # define UNROLL_OUTER(N) UNROLL_FORCE(N) +# else /* inner */ +# define UNROLL_OUTER(N) # endif # define UNROLL(N) UNROLL_FORCE(N) #endif diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index d3771e37ac2..4e120c6af56 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -422,7 +422,7 @@ int libsmm_acc_init(void) { int result = EXIT_SUCCESS; # endif /* multiple calls to libsmm_acc_init are not considered as an error */ - if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) { + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&opencl_libsmm_initialized, 1, ACC_OPENCL_ATOMIC_KIND)) { # if !defined(__DBCSR_ACC) /* DBCSR shall call c_dbcsr_acc_init as well as libsmm_acc_init (since both interfaces are used). * Also, libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface). @@ -501,7 +501,8 @@ int libsmm_acc_init(void) { cl_device_id active_id = NULL; unsigned int active_uid; int active_match = -1; - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device(ACC_OPENCL_OMP_TID(), &active_id) && + if (EXIT_SUCCESS == clGetContextInfo(c_dbcsr_acc_opencl_config.device.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), + &active_id, NULL) && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, bufname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1) && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_uid(active_id, bufname, &active_uid)) @@ -599,7 +600,7 @@ int libsmm_acc_init(void) { fprintf(stderr, "INFO ACC/LIBSMM: PARAMS in %u set%s loaded targeting ", ntuned, 1 != ntuned ? "s" : ""); if (0 != c_dbcsr_acc_opencl_config.devmatch) { fprintf(stderr, "%i device%s\n", ndevices_params, 1 != ndevices_params ? "s" : ""); - if (1 < c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { + if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { unsigned int i = 0; for (; i < (unsigned int)ndevices_params; ++i) { fprintf(stderr, "INFO ACC/LIBSMM: PARAMS -> \"%s\"\n", OPENCL_KERNELS_DEVICES[i]); @@ -708,7 +709,7 @@ int libsmm_acc_finalize(void) { int result = EXIT_SUCCESS; # endif /* multiple calls to libsmm_acc_finalize are not considered as an error */ - if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) { + if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&opencl_libsmm_initialized, 1, ACC_OPENCL_ATOMIC_KIND)) { # if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER char fname[ACC_OPENCL_MAXSTRLEN]; const void* regentry = libxsmm_get_registry_begin(LIBXSMM_KERNEL_KIND_USER, NULL /*key*/); @@ -717,19 +718,19 @@ int libsmm_acc_finalize(void) { cl_kernel kernel = *(const cl_kernel*)regentry; if (NULL == kernel) kernel = ((const opencl_libsmm_smm_t*)regentry)->kernel[1]; if (NULL != kernel) { /* only consider user-entry if clGetKernelInfo succeeded */ - cl_int result_entry = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(fname), fname, NULL); - if (CL_SUCCESS == result_entry) { + int result_entry = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(fname), fname, NULL); + if (EXIT_SUCCESS == result_entry) { if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_TRANS)) { /* trans-kernel */ result_entry = clReleaseKernel(kernel); } else if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_SMM)) { /* SMM-kernel */ result_entry = clReleaseKernel(kernel); - if (CL_SUCCESS == result_entry && kernel != ((const opencl_libsmm_smm_t*)regentry)->kernel[1]) { + if (EXIT_SUCCESS == result_entry && kernel != ((const opencl_libsmm_smm_t*)regentry)->kernel[1]) { kernel = ((const opencl_libsmm_smm_t*)regentry)->kernel[1]; /* release 2nd kernel */ if (NULL != kernel) result_entry = clReleaseKernel(kernel); } } - if (CL_SUCCESS != result_entry) result = result_entry; + if (EXIT_SUCCESS != result_entry) result = result_entry; } } } @@ -762,12 +763,16 @@ c_dbcsr_acc_bool_t libsmm_acc_is_thread_safe(void) { int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, void* dev_data, libsmm_acc_data_t datatype, int m, int n, int max_kernel_dim, void* stream) { + const c_dbcsr_acc_opencl_info_memptr_t* const info_stack = c_dbcsr_acc_opencl_info_devptr( + dev_trs_stack, sizeof(int), NULL /*amount*/, NULL /*offset*/); + const c_dbcsr_acc_opencl_info_memptr_t* const info_mdata = c_dbcsr_acc_opencl_info_devptr( + dev_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); int result = EXIT_SUCCESS; + const int mn = m * n; # if !defined(OPENCL_KERNELS_SOURCE_TRANSPOSE) result = EXIT_FAILURE; # else - const int mn = m * n; - assert((NULL != dev_trs_stack && NULL != stream && NULL != dev_data && 0 <= offset && 0 <= stack_size) || 0 == stack_size); + assert((NULL != info_stack && NULL != stream && NULL != info_mdata && 0 <= offset && 0 <= stack_size) || 0 == stack_size); if (( # if defined(OPENCL_LIBSMM_F64) dbcsr_type_real_8 == datatype @@ -783,7 +788,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v ) && 0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim) { - const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); opencl_libsmm_trans_t* config; opencl_libsmm_transkey_t key; # if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) @@ -807,8 +812,8 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v # endif if (0 < nchar && (int)sizeof(fname) > nchar) { cl_device_id active_device; - result = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL); - if (CL_SUCCESS == result) { + result = clGetCommandQueueInfo(str->queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL); + if (EXIT_SUCCESS == result) { const char *const env_cl = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"), *const env_bm = getenv("OPENCL_LIBSMM_TRANS_BM"); const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant"); const char* const param_format = "-DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s"; @@ -907,7 +912,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v void* scratch = NULL; int* stack = NULL; size_t data_size; - if (CL_SUCCESS == clGetMemObjectInfo((cl_mem)dev_data, CL_MEM_SIZE, sizeof(size_t), &data_size, NULL)) { + if (EXIT_SUCCESS == clGetMemObjectInfo(info_mdata->memory, CL_MEM_SIZE, sizeof(size_t), &data_size, NULL)) { const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/ + data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/ + 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/; @@ -929,24 +934,24 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v # endif assert(!(OPENCL_LIBSMM_NLOCKS_TRANS & (OPENCL_LIBSMM_NLOCKS_TRANS - 1))); /* POT */ { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */ - static volatile int locks[OPENCL_LIBSMM_NLOCKS_TRANS]; + static ACC_OPENCL_ATOMIC_LOCKTYPE locks[OPENCL_LIBSMM_NLOCKS_TRANS]; # if (1 < OPENCL_LIBSMM_NLOCKS_TRANS) const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975 /*seed*/); const unsigned int lidx = LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_TRANS); - volatile int* const lock = locks + lidx; + ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = locks + lidx; # else - volatile int* const lock = locks; + ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = locks; # endif /* calling clSetKernelArg must be consistent across host-threads */ - LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ATOMIC_ACQUIRE(lock); ACC_OPENCL_CHECK( clSetKernelArg(config->kernel, 0, sizeof(int), &offset), "set offset argument of transpose kernel", result); - ACC_OPENCL_CHECK( - clSetKernelArg(config->kernel, 1, sizeof(cl_mem), &dev_trs_stack), "set batch-list argument of transpose kernel", result); - ACC_OPENCL_CHECK( - clSetKernelArg(config->kernel, 2, sizeof(cl_mem), &dev_data), "set matrix-data argument of transpose kernel", result); - ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel, 1 /*work_dim*/, NULL /*offset*/, &work_size, &config->wgsize, - 0, NULL, perf_event), + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 1, sizeof(cl_mem), &info_stack->memory), + "set batch-list argument of transpose kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), &info_mdata->memory), + "set matrix-data argument of transpose kernel", result); + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(str->queue, config->kernel, 1 /*work_dim*/, NULL /*offset*/, &work_size, + &config->wgsize, 0, NULL, perf_event), "launch transpose kernel", result); /* eventually update performance counters inside of locked region */ # if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) @@ -961,25 +966,23 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v duration = 1E-9 * LIBXSMM_DELTA(begin, end); /* Nanoseconds->seconds */ } else { - clFinish(queue); + clFinish(str->queue); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); /* seconds */ } if (EXIT_SUCCESS == result) { const double membw = (1ULL * stack_size * (typesize * m * n)) / (duration * (1ULL << 30)); - const int* const priority = c_dbcsr_acc_opencl_stream_priority(stream); LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "INFO ACC/LIBSMM: TRANS-kernel "); opencl_libsmm_write_trans_params( stderr, 1 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); fprintf(stderr, "="); opencl_libsmm_write_trans_params(stderr, 1 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); - fprintf(stderr, " prio=%i ss=%i cur=%.1f GB/s dur=%.2g ms\n", NULL != priority ? *priority : -1, stack_size, membw, - 1E3 * duration); + fprintf(stderr, " prio=%i ss=%i cur=%.1f GB/s dur=%.2g ms\n", str->priority, stack_size, membw, 1E3 * duration); LIBXSMM_STDIO_RELEASE(); } } # endif - LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ATOMIC_RELEASE(lock); } # if defined(OPENCL_LIBSMM_VALIDATE_TRANS) ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result); @@ -1116,14 +1119,22 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable( int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, libsmm_acc_data_t datatype, const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim, c_dbcsr_acc_bool_t def_mnk, void* stream, void* c_stream) { + const c_dbcsr_acc_opencl_info_memptr_t* const info_stack = c_dbcsr_acc_opencl_info_devptr( + dev_param_stack, sizeof(int), NULL /*amount*/, NULL /*offset*/); + const c_dbcsr_acc_opencl_info_memptr_t* const info_adata = c_dbcsr_acc_opencl_info_devptr( + dev_a_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); + const c_dbcsr_acc_opencl_info_memptr_t* const info_bdata = c_dbcsr_acc_opencl_info_devptr( + dev_b_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); + const c_dbcsr_acc_opencl_info_memptr_t* const info_cdata = c_dbcsr_acc_opencl_info_devptr( + dev_c_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); int result = EXIT_SUCCESS; const int nparams = 3; # if !defined(OPENCL_KERNELS_SOURCE_MULTIPLY) result = EXIT_FAILURE; # else LIBXSMM_UNUSED(c_stream); /* TODO */ - assert(0 == stack_size || (NULL != dev_a_data && NULL != dev_b_data && NULL != dev_c_data)); - assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack)); + assert(0 == stack_size || (NULL != info_adata && NULL != info_bdata && NULL != info_cdata)); + assert(0 == stack_size || (NULL != host_param_stack && NULL != info_stack)); assert(0 < nparams && 0 < max_kernel_dim && NULL != stream); assert(0 <= stack_size && 0 <= m_max && 0 <= n_max && 0 <= k_max); if (0 != libsmm_acc_process_suitable(def_mnk, datatype, stack_size, m_max, n_max, k_max, max_kernel_dim)) { @@ -1132,29 +1143,27 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, double duration; const libxsmm_timer_tickint start = libxsmm_timer_tick(); # endif - const c_dbcsr_acc_opencl_info_stream_t* const qinfo = c_dbcsr_acc_opencl_info_stream(stream); - const c_dbcsr_acc_opencl_device_t* const devinfo = c_dbcsr_acc_opencl_config.device + qinfo->tid; - const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */ key.devuid = ((1 != c_dbcsr_acc_opencl_config.devmatch && ((unsigned int)-1) != c_dbcsr_acc_opencl_config.devmatch) ? c_dbcsr_acc_opencl_config.devmatch - : devinfo->uid); + : c_dbcsr_acc_opencl_config.device.uid); key.type = datatype; key.m = m_max; key.n = n_max; key.k = k_max; - if (CL_SUCCESS == result) { - static volatile int locks[OPENCL_LIBSMM_NLOCKS_SMM]; /* OpenCL is thread-safe except for clSetKernelArg */ + if (EXIT_SUCCESS == result) { + static ACC_OPENCL_ATOMIC_LOCKTYPE locks[OPENCL_LIBSMM_NLOCKS_SMM]; /* OpenCL is thread-safe except for clSetKernelArg */ const char *const env_s = getenv("OPENCL_LIBSMM_SMM_S"), *const env_bs = getenv("OPENCL_LIBSMM_SMM_BS"); const int s = ((NULL == env_s || '\0' == *env_s) ? OPENCL_LIBSMM_SMM_S : atoi(env_s)); int kernel_idx = 0, bs = ((NULL == env_bs || '\0' == *env_bs) ? 0 : atoi(env_bs)); opencl_libsmm_smm_t* config; - volatile int* lock = locks; + ACC_OPENCL_ATOMIC_LOCKTYPE* lock = locks; # if (1 < OPENCL_LIBSMM_NLOCKS_SMM) assert(!(OPENCL_LIBSMM_NLOCKS_SMM & (OPENCL_LIBSMM_NLOCKS_SMM - 1))); /* POT */ lock += LIBXSMM_MOD2(libxsmm_hash(&key, sizeof(key), 25071975 /*seed*/), OPENCL_LIBSMM_NLOCKS_SMM); # endif - LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ATOMIC_ACQUIRE(lock); config = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key)); if (0 >= bs) bs = ((NULL != config && 0 < config->bs) ? config->bs : OPENCL_LIBSMM_DEFAULT_BS); /* determine kernel-kind (mini-batch vs. mini-kernel) */ @@ -1170,7 +1179,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, c_dbcsr_timeset(LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR, LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR, &routine_handle); # endif result = ((0 < nchar && (int)sizeof(fname) > nchar) - ? clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL) + ? clGetCommandQueueInfo(str->queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL) : EXIT_FAILURE); if (EXIT_SUCCESS == result) { c_dbcsr_acc_opencl_atomic_fp_t tkind = c_dbcsr_acc_opencl_atomic_fp_no; @@ -1190,7 +1199,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } if (NULL != tname) { const char *extensions[] = {NULL, NULL}, *const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID"); - const unsigned int devuid = (NULL == env_devid || '\0' == *env_devid) ? devinfo->uid + const unsigned int devuid = (NULL == env_devid || '\0' == *env_devid) ? c_dbcsr_acc_opencl_config.device.uid : (unsigned int)strtoul(env_devid, NULL, 0); size_t wgsize_max, wgsize_prf, sgs = 0; opencl_libsmm_smm_t new_config; @@ -1212,7 +1221,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC"); const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS"); const char* const intel_xf = "-cl-intel-256-GRF-per-thread"; - const int default_lu = (0 != devinfo->intel ? -1 : 0); + const int default_lu = (0 != c_dbcsr_acc_opencl_config.device.intel ? -1 : 0); const int unroll = LIBXSMM_MAX(-2, (NULL == env_lu || '\0' == *env_lu) ? (0 == kernel_idx ? (NULL == config ? default_lu : config->lu) : default_lu) : atoi(env_lu)); /* populate only lower bound */ @@ -1254,7 +1263,9 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, 0, 1); new_config.al = LIBXSMM_CLMP( (NULL == env_al || '\0' == *env_al) - ? (0 == devinfo->amd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1) + ? (0 == c_dbcsr_acc_opencl_config.device.amd + ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) + : 1) : atoi(env_al), 0, 1); new_config.tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb) @@ -1285,8 +1296,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, : atoi(env_ac), 0, 1); if (NULL == env_xf || '\0' == *env_xf) { - if (0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type || NULL == env_cl || - NULL == strstr(env_cl, intel_xf)) + if (0 == c_dbcsr_acc_opencl_config.device.intel || CL_DEVICE_TYPE_GPU != c_dbcsr_acc_opencl_config.device.type || + NULL == env_cl || NULL == strstr(env_cl, intel_xf)) { new_config.flags = (NULL == config ? /*default*/ 0 : config->flags); } @@ -1379,10 +1390,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, "-DT=%s -DINTEL=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i " "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i " "%s %s %s %s %s %s %s %s ", /* space! */ - tname, 0 != devinfo->intel ? devinfo->uid : 0, cmem, (int)new_config.wgsize[kernel_idx], (int)sgs, fname, - NULL == env_nrepeat ? 1 : atoi(env_nrepeat), new_config.lu, m_max, n_max, k_max, bs, OPENCL_LIBSMM_VMIN, - bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk, - 0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "", + tname, 0 != c_dbcsr_acc_opencl_config.device.intel ? c_dbcsr_acc_opencl_config.device.uid : 0, cmem, + (int)new_config.wgsize[kernel_idx], (int)sgs, fname, NULL == env_nrepeat ? 1 : atoi(env_nrepeat), new_config.lu, + m_max, n_max, k_max, bs, OPENCL_LIBSMM_VMIN, bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, + new_config.bk, 0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "", 0 == new_config.nz ? "" : "-DATOMIC_INC_NZ", 0 == new_config.al ? "" : "-DAL", 0 == new_config.ap ? "" : "-DSLM_P", 0 == new_config.aa ? "" : (1 == slm_a ? "-DSLM_A=1" : (0 != slm_a ? "-DSLM_A=2" : "-DREG_A")), @@ -1390,19 +1401,24 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, 0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2")); /* apply support for FP-atomics */ if (0 < nchar && (int)sizeof(build_params) > nchar) { - nchar = c_dbcsr_acc_opencl_flags_atomics(active_device, tkind, devinfo, extensions, + nchar = c_dbcsr_acc_opencl_flags_atomics(&c_dbcsr_acc_opencl_config.device, tkind, extensions, sizeof(extensions) / sizeof(*extensions), build_params + nchar, sizeof(build_params) - nchar); } else result = EXIT_FAILURE; if (0 < nchar && (int)sizeof(build_params) > nchar) { const char* const cl_debug = ( # if !defined(NDBGDEV) - (0 != devinfo->intel && CL_DEVICE_TYPE_CPU != devinfo->type) ? "-gline-tables-only" : + (0 != c_dbcsr_acc_opencl_config.device.intel && CL_DEVICE_TYPE_CPU != c_dbcsr_acc_opencl_config.device.type) + ? "-gline-tables-only" + : # endif - ""); + ""); nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero %s", - (0 == new_config.flags || 0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type) ? "" : intel_xf, cl_debug, - NULL == env_cl ? "" : env_cl); + (0 == new_config.flags || 0 == c_dbcsr_acc_opencl_config.device.intel || + CL_DEVICE_TYPE_GPU != c_dbcsr_acc_opencl_config.device.type) + ? "" + : intel_xf, + cl_debug, NULL == env_cl ? "" : env_cl); if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE; } else result = EXIT_FAILURE; @@ -1515,10 +1531,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, libxsmm_xmmfunction kernel_cpu = {NULL}; size_t psize, asize, bsize, csize; void* scratch = NULL; - if (CL_SUCCESS == clGetMemObjectInfo((cl_mem)dev_param_stack, CL_MEM_SIZE, sizeof(size_t), &psize, NULL) && - CL_SUCCESS == clGetMemObjectInfo((cl_mem)dev_a_data, CL_MEM_SIZE, sizeof(size_t), &asize, NULL) && - CL_SUCCESS == clGetMemObjectInfo((cl_mem)dev_b_data, CL_MEM_SIZE, sizeof(size_t), &bsize, NULL) && - CL_SUCCESS == clGetMemObjectInfo((cl_mem)dev_c_data, CL_MEM_SIZE, sizeof(size_t), &csize, NULL)) + if (EXIT_SUCCESS == clGetMemObjectInfo(info_stack->memory, CL_MEM_SIZE, sizeof(size_t), &psize, NULL) && + EXIT_SUCCESS == clGetMemObjectInfo(info_adata->memory, CL_MEM_SIZE, sizeof(size_t), &asize, NULL) && + EXIT_SUCCESS == clGetMemObjectInfo(info_bdata->memory, CL_MEM_SIZE, sizeof(size_t), &bsize, NULL) && + EXIT_SUCCESS == clGetMemObjectInfo(info_cdata->memory, CL_MEM_SIZE, sizeof(size_t), &csize, NULL)) { libxsmm_descriptor_blob blob; libxsmm_gemm_descriptor* const desc = OPENCL_LIBSMM_DESCINIT( @@ -1558,13 +1574,13 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, /* adjust launchsize according to intra-kernel batchsize */ work_size = ((stack_size + bs - 1) / bs) * config->wgsize[kernel_idx]; /* calling clSetKernelArg must be consistent across host-threads */ - ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 0, sizeof(cl_mem), &dev_c_data), + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 0, sizeof(cl_mem), &info_cdata->memory), "set C-matrix argument of SMM-kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 1, sizeof(cl_mem), &dev_a_data), + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 1, sizeof(cl_mem), &info_adata->memory), "set A-matrix argument of SMM-kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 2, sizeof(cl_mem), &dev_b_data), + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 2, sizeof(cl_mem), &info_bdata->memory), "set B-matrix argument of SMM-kernel", result); - ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 3, sizeof(cl_mem), &dev_param_stack), + ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 3, sizeof(cl_mem), &info_stack->memory), "set batch-list argument of SMM-kernel", result); if (0 == kernel_idx) { assert(bs <= config->bs); @@ -1573,7 +1589,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ACC_OPENCL_CHECK( clSetKernelArg(config->kernel[kernel_idx], 5, sizeof(int), &bs), "set minibatch argument of SMM-kernel", result); } - ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL /*offset*/, &work_size, + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(str->queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL /*offset*/, &work_size, config->wgsize + kernel_idx, 0, NULL, perf_event), "launch SMM-kernel", result); /* eventually update performance counters inside of locked region */ @@ -1589,7 +1605,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, duration = 1E-9 * LIBXSMM_DELTA(begin, end); /* Nanoseconds->seconds */ } else { - clFinish(queue); + clFinish(str->queue); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); /* seconds */ } if (EXIT_SUCCESS == result) { @@ -1597,14 +1613,13 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const double est = (dbcsr_type_real_8 == datatype ? (OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(double)) * opencl_libsmm_dacc) : (OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(float)) * opencl_libsmm_sacc)); - const int* const priority = c_dbcsr_acc_opencl_stream_priority(stream); LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "INFO ACC/LIBSMM: SMM-kernel "); opencl_libsmm_write_smm_params( stderr, 1 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); fprintf(stderr, "="); opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); - fprintf(stderr, " prio=%i ss=%i cur=%.1f", NULL != priority ? *priority : -1, stack_size, gflops); + fprintf(stderr, " prio=%i ss=%i cur=%.1f", str->priority, stack_size, gflops); if (0 < est) fprintf(stderr, " est=%.1f", est); fprintf(stderr, " GFLOPS/s dur=%.2g ms\n", 1E3 * duration); LIBXSMM_STDIO_RELEASE(); @@ -1681,7 +1696,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, LIBXSMM_UNUSED(nparams); # endif } - LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED); + ACC_OPENCL_ATOMIC_RELEASE(lock); } } else if (0 < stack_size) { /* inhomogeneous, large kernel, or unsupported datatype */