Clean up device memory and cuda error checks

brian-team · Mar 16, 2018 · 688c14f · 688c14f
1 parent 215fd13
commit 688c14f
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 103 deletions.
diff --git a/brian2cuda/brianlib/cuda_utils.h b/brian2cuda/brianlib/cuda_utils.h
@@ -3,40 +3,43 @@
 
 // Define this to turn on error checking
 #define BRIAN2CUDA_ERROR_CHECK
-// Define this to make kernel calls block CPU execution
+// Define this to synchronize device before checking errors
 //#define BRIAN2CUDA_ERROR_CHECK_BLOCKING
 
 // Define this to turn on memory checking
 //#define BRIAN2CUDA_MEMORY_CHECK
 // Define this to synchronize device before checking memory
 //#define BRIAN2CUDA_MEMORY_CHECK_BLOCKING
 
+
 // partly adapted from https://gist.github.com/ashwin/2652488
-#define CudaSafeCall(err)       __cudaSafeCall(err, __FILE__, __LINE__)
-#define CudaCheckError()        __cudaCheckError(__FILE__, __LINE__)
-#define CudaCheckMemory(param)  __cudaCheckMemory(param, __FILE__, __LINE__)
+#define CUDA_SAFE_CALL(err)  __cudaSafeCall(err, __FILE__, __LINE__)
+#define CUDA_CHECK_ERROR()   __cudaCheckError(__FILE__, __LINE__)
+#define CUDA_CHECK_MEMORY()  __cudaCheckMemory(__FILE__, __LINE__)
+
 
 inline void __cudaSafeCall(cudaError err, const char *file, const int line)
 {
 #ifdef BRIAN2CUDA_ERROR_CHECK
     if (cudaSuccess != err)
     {
-        fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
+        fprintf(stderr, "ERROR: CUDA_SAFE_CALL() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString(err));
-        exit( -1 );
+        exit(-1);
     }
 #endif
 
     return;
 }
 
+
 inline void __cudaCheckError(const char *file, const int line)
 {
 #ifdef BRIAN2CUDA_ERROR_CHECK
     cudaError err = cudaGetLastError();
     if (cudaSuccess != err)
     {
-        fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
+        fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString(err));
         exit(-1);
     }
@@ -46,7 +49,7 @@ inline void __cudaCheckError(const char *file, const int line)
     err = cudaDeviceSynchronize();
     if(cudaSuccess != err)
     {
-        fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
+        fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() with sync failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString(err));
         exit(-1);
     }
@@ -56,52 +59,29 @@ inline void __cudaCheckError(const char *file, const int line)
     return;
 }
 
-#endif
-
 
-// Report cuda device memory usage
-inline void __cudaCheckMemory(const char *msg, const char *file, const int line)
+// Report device memory usage. The memory diff is reported with respect to the
+// global brian::used_device_memory as reference, which was set in the last
+// __cudaCheckMemory call.
+inline void __cudaCheckMemory(const char *file, const int line)
 {
 #ifdef BRIAN2CUDA_MEMORY_CHECK
 #ifdef BRIAN2CUDA_MEMORY_CHECK_BLOCKING
     cudaDeviceSynchronize();
 #endif
     const double to_MB = 1.0 / (1024.0 * 1024.0);
-    size_t avail;
-    size_t total;
+    size_t avail, total, used, diff;
     cudaMemGetInfo(&avail, &total);
-    size_t used = total - avail;
-    printf("INFO: cuda device memory usage in %s:%i (%s)\n"
-           "\t used:  \t %f MB\n"
-           "\t avail: \t %f MB\n"
-           "\t total: \t %f MB\n",
-           file, line, msg,
-           double(used) * to_MB,
-           double(avail) * to_MB,
-           double(total) * to_MB);
-#endif
-}
-
-
-// In this version the memory difference is always reported with respect to
-// some reference memory, e.g. from a previous cudaMemGetInfo call
-inline void __cudaCheckMemory(size_t &reference_memory, const char *file, const int line)
-{
-#ifdef BRIAN2CUDA_MEMORY_CHECK
-#ifdef BRIAN2CUDA_MEMORY_CHECK_BLOCKING
-    cudaDeviceSynchronize();
-#endif
-    const double to_MB = 1.0 / (1024.0 * 1024.0);
-    size_t avail;
-    size_t total;
-    cudaMemGetInfo(&avail, &total);
-    size_t used = total - avail;
-    size_t diff = used - reference_memory;
+    used = total - avail;
+    diff = used - brian::used_device_memory;
+    // print memory information only if device memory usage changed
+    // NOTE: Device memory is allocated in chunks. When allocating only little
+    // memory, the memory usage reported by cudaMemGetInfo might not change if
+    // the previously allocated chunk has enough free memory to be used for the
+    // newly requested allocation.
     if (diff > 0)
     {
-        if (reference_memory == 0)
-            diff = NAN;
-        printf("INFO: cuda device memory usage in %s:%i\n"
+        fprintf(stdout, "INFO: cuda device memory usage in %s:%i\n"
                "\t used:  \t %f MB\n"
                "\t avail: \t %f MB\n"
                "\t total: \t %f MB\n"
@@ -111,7 +91,9 @@ inline void __cudaCheckMemory(size_t &reference_memory, const char *file, const
                double(avail) * to_MB,
                double(total) * to_MB,
                double(diff) * to_MB, diff);
-        reference_memory = used;
+        brian::used_device_memory = used;
     }
 #endif
 }
+
+#endif
diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu
@@ -3,6 +3,7 @@
 #include "objects.h"
 #include "synapses_classes.h"
 #include "brianlib/clocks.h"
+#include "brianlib/cuda_utils.h"
 #include "network.h"
 #include <stdint.h>
 #include <iostream>
@@ -14,6 +15,8 @@
 #include <thrust/device_vector.h>
 #include <curand.h>
 
+size_t brian::used_device_memory = 0;
+
 //////////////// clocks ///////////////////
 {% for clock in clocks | sort(attribute='name') %}
 Clock brian::{{clock.name}};
@@ -154,6 +157,11 @@ void _init_arrays()
 {
 	using namespace brian;
 
+	std::clock_t start_timer = std::clock();
+
+	CUDA_CHECK_MEMORY();
+	size_t used_device_memory_start = used_device_memory;
+
 	cudaDeviceProp props;
 	cudaGetDeviceProperties(&props, 0);
 
@@ -267,6 +275,15 @@ void _init_arrays()
 	cudaMalloc((void**)&dev{{varname}}[0], sizeof({{c_data_type(var.dtype)}})*_num_{{varname}});
 	{{varname}} = new {{c_data_type(var.dtype)}}[{{var.size}}];
 	{% endfor %}
+
+	CUDA_CHECK_MEMORY();
+	const double to_MB = 1.0 / (1024.0 * 1024.0);
+	double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB;
+	double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC;
+	std::cout << "INFO: _init_arrays() took " <<  time_passed << "s";
+	if (tot_memory_MB > 0)
+		std::cout << " and used " << tot_memory_MB << "MB of device memory.";
+	std::cout << std::endl;
 }
 
 void _load_arrays()
@@ -498,6 +515,8 @@ typedef {{curand_float_type}} randomNumber_t;  // random number type
 
 namespace brian {
 
+extern size_t used_device_memory;
+
 //////////////// clocks ///////////////////
 {% for clock in clocks %}
 extern Clock {{clock.name}};

diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu
@@ -31,7 +31,9 @@ kernel_{{codeobj_name}}(
 
 	unsigned int tid = threadIdx.x;
 	unsigned int bid = blockIdx.x + bid_offset;
-	//TODO: do we need _idx here? if now, get also rid of scoping after scalar code
+	//TODO: do we need _idx here? if no, get also rid of scoping after scalar code
+	// scalar_code can depend on _idx (e.g. if the state update depends on a
+	// subexpression that is the same for all synapses, ?)
 	unsigned int _idx = bid * THREADS_PER_BLOCK + tid;
 	unsigned int _vectorisation_idx = _idx;
 	%KERNEL_VARIABLES%

diff --git a/brian2cuda/templates/synapses_create_array.cu b/brian2cuda/templates/synapses_create_array.cu
@@ -3,6 +3,7 @@
 {% block extra_headers %}
 {{ super() }}
 #include<map>
+#include "brianlib/cuda_utils.h"
 {% endblock %}
 
 {% block kernel %}
@@ -24,9 +25,21 @@
 {% endblock %}
 
 {% block profiling_start %}
+std::clock_t start_timer = std::clock();
+
+CUDA_CHECK_MEMORY();
+size_t used_device_memory_start = used_device_memory;
 {% endblock %}
 
 {% block profiling_stop %}
+CUDA_CHECK_MEMORY();
+const double to_MB = 1.0 / (1024.0 * 1024.0);
+double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB;
+double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC;
+std::cout << "INFO: {{owner.name}} creation took " <<  time_passed << "s";
+if (tot_memory_MB > 0)
+    std::cout << " and used " << tot_memory_MB << "MB of device memory.";
+std::cout << std::endl;
 {% endblock %}
 
 {% block extra_maincode %}
@@ -76,6 +89,7 @@ const int32_t newsize = {{_dynamic__synaptic_pre}}.size();
 		{{varname}}.resize(newsize);
 	{% endif %}
 {% endfor %}
+CUDA_CHECK_MEMORY();
 
 // update the total number of synapses
 {{N}} = newsize;

diff --git a/brian2cuda/templates/synapses_create_generator.cu b/brian2cuda/templates/synapses_create_generator.cu
@@ -5,6 +5,7 @@
 #include<iostream>
 #include<curand.h>
 #include<brianlib/curand_buffer.h>
+#include "brianlib/cuda_utils.h"
 #include<map>
 {% endblock %}
 
@@ -27,9 +28,21 @@
 {% endblock %}
 
 {% block profiling_start %}
+std::clock_t start_timer = std::clock();
+
+CUDA_CHECK_MEMORY();
+size_t used_device_memory_start = used_device_memory;
 {% endblock %}
 
 {% block profiling_stop %}
+CUDA_CHECK_MEMORY();
+const double to_MB = 1.0 / (1024.0 * 1024.0);
+double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB;
+double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC;
+std::cout << "INFO: {{owner.name}} creation took " <<  time_passed << "s";
+if (tot_memory_MB > 0)
+    std::cout << " and used " << tot_memory_MB << "MB of memory.";
+std::cout << std::endl;
 {% endblock %}
 
 {% block extra_maincode %}