Merge pull request #138 from brian-team/single_precision_support
denisalevi authored Jul 16, 2018
2 parents b00b7fd + 29911fd commit c3eee3b
Showing 13 changed files with 105 additions and 106 deletions.
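The headline change in this merge is single-precision support: the dedicated curand float-type preference is dropped and the generated CUDA code follows Brian2's `core.default_float_dtype` preference instead. A minimal usage sketch, assuming brian2cuda is installed and registers the `cuda_standalone` device on import; the model, initial values and build directory are made up for illustration:

```python
from numpy import float32

from brian2 import NeuronGroup, prefs, run, set_device, ms, mV
import brian2cuda  # assumption: importing brian2cuda registers 'cuda_standalone'

set_device('cuda_standalone', directory='single_precision_demo')

# After this merge, floating-point state variables and the generated
# curand random-number buffers follow this one preference:
prefs['core.default_float_dtype'] = float32

G = NeuronGroup(1000, 'dv/dt = -v / (10*ms) : volt')
G.v = '(i / 1000.0) * mV'  # made-up initial condition
run(100*ms)                # triggers the standalone build
```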
6 changes: 3 additions & 3 deletions .gitmodules
@@ -1,9 +1,9 @@
[submodule "brian2_frozen"]
path = frozen_repos/brian2
url = https://github.com/brian-team/brian2.git
[submodule "genn_frozen"]
path = frozen_repos/genn
url = https://github.com/genn-team/genn.git
[submodule "brian2genn_frozen"]
path = frozen_repos/brian2genn
url = https://github.com/brian-team/brian2genn.git
[submodule "frozen_repos/brian2"]
path = frozen_repos/brian2
url = https://github.com/denisalevi/brian2.git
3 changes: 1 addition & 2 deletions brian2cuda/brianlib/spikequeue.h
@@ -14,7 +14,6 @@ using namespace std;
// variables (delays, dt) are assumed to use the same data type
typedef int32_t DTYPE_int;

template <class scalar>
class CudaSpikeQueue
{
private:
@@ -74,7 +73,7 @@ class CudaSpikeQueue
int tid,
int num_threads,
int _num_blocks,
scalar _dt,
double _dt,
int _neuron_N,
int _syn_N,
int _num_queues,
18 changes: 9 additions & 9 deletions brian2cuda/cuda_generator.py
@@ -33,8 +33,8 @@
types is not type safe. And convertion from 64bit integral types to double precision (64bit)
floating-point types neither. In those cases the closest higher or lower (implementation
defined) representable value will be selected.''',
validator=lambda v: v in ['single_precision', 'double_precision'],
default='double_precision')
validator=lambda v: v in ['float32', 'float64'],
default='float64')
)


@@ -258,7 +258,7 @@ def translate_one_statement_sequence(self, statements, scalar=False):
brian_funcs = re.search('_brian_(' + '|'.join(functions_C99) + ')', line)
if brian_funcs is not None:
for identifier in get_identifiers(line):
if convertion_pref == 'double_precision':
if convertion_pref == 'float64':
# 64bit integer to floating-point conversions are not type safe
int64_type = re.search(r'\bu?int64_t\s*{}\b'.format(identifier), code)
if int64_type is not None:
@@ -269,20 +269,20 @@ def translate_one_statement_sequence(self, statements, scalar=False):
"statement:\n\t{}\nGenerated from abstract code statements:\n\t{}\n".format(line, statements),
once=True)
self.warned_integral_convertion = True
self.previous_convertion_pref = 'double_precision'
else: # convertion_pref = 'single_precision'
self.previous_convertion_pref = 'float64'
else: # convertion_pref = 'float32'
# 32bit and 64bit integer to floating-point conversions are not type safe
int32_64_type = re.search(r'\bu?int(32|64)_t\s*{}\b'.format(identifier), code)
if int32_64_type is not None:
logger.warn("Detected code statement with default function and 32bit or 64bit integer type in the same line and the "
"preference for default_functions_integral_convertion is 'single_precision'. "
"preference for default_functions_integral_convertion is 'float32'. "
"Using 32bit or 64bit integer types as default function arguments is not type safe due to convertion of "
"integer to single-precision floating-point types in device code. (relevant functions: sin, cos, tan, sinh, "
"cosh, tanh, exp, log, log10, sqrt, ceil, floor, arcsin, arccos, arctan)\nDetected code "
"statement:\n\t{}\nGenerated from abstract code statements:\n\t{}\n".format(line, statements),
once=True)
self.warned_integral_convertion = True
self.previous_convertion_pref = 'single_precision'
self.previous_convertion_pref = 'float32'
return stripped_deindented_lines(code)

def denormals_to_zero_code(self):
@@ -374,10 +374,10 @@ def determine_keywords(self):
support_code = ''
hash_defines = ''
# set convertion types for standard C99 functions in device code
if prefs.codegen.generators.cuda.default_functions_integral_convertion == 'double_precision':
if prefs.codegen.generators.cuda.default_functions_integral_convertion == 'float64':
default_func_type = 'double'
other_func_type = 'float'
else: # 'single_precision'
else: # 'float32'
default_func_type = 'float'
other_func_type = 'double'
for varname, variable in self.variables.items():
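The value rename above aligns the `default_functions_integral_convertion` preference with the numpy dtype names used elsewhere in Brian2 ('float32'/'float64' instead of 'single_precision'/'double_precision'). A hedged sketch of setting it; the preference name is taken verbatim from the diff, and I assume the CUDA code-generator preferences become available once brian2cuda is imported:

```python
from brian2 import prefs
import brian2cuda  # assumption: importing brian2cuda registers the CUDA codegen preferences

# Default is 'float64'. With 'float32', applying a C99 default function such as
# sin/exp/sqrt to an int32/int64 variable logs the type-safety warning quoted
# above, because the argument is converted to single precision in device code.
prefs['codegen.generators.cuda.default_functions_integral_convertion'] = 'float32'
```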
43 changes: 18 additions & 25 deletions brian2cuda/device.py
@@ -69,13 +69,6 @@
validator=lambda v: isinstance(v, int) and v >= 0,
default=128),

curand_float_type=BrianPreference(
docs='''
Floating point type of generated random numbers (float/double).
''',
validator=lambda v: v in ['float', 'double'],
default='float'),

launch_bounds=BrianPreference(
docs='''
Weather or not to use `__launch_bounds__` to optimise register usage in kernels.
@@ -266,9 +259,9 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
for varname in variables.iterkeys():
if varname in read_write:
idx = variable_indices[varname]
if idx == '_presynaptic_idx':
if idx == '_presynaptic_idx' or varname == 'i':
self.delete_synaptic_pre[synaptic_pre_array_name] = False
if idx == '_postsynaptic_idx':
if idx == '_postsynaptic_idx' or varname == 'j':
self.delete_synaptic_post[synaptic_post_array_name] = False
if template_name == "synapses":
prepost = template_kwds['pathway'].prepost
@@ -343,7 +336,7 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
num_parallel_blocks=num_parallel_blocks,
curand_generator_type=curand_generator_type,
curand_generator_ordering=curand_generator_ordering,
curand_float_type=prefs['devices.cuda_standalone.curand_float_type'],
curand_float_type=c_data_type(prefs['core.default_float_dtype']),
eventspace_arrays=self.eventspace_arrays,
multisynaptic_idx_vars=multisyn_vars,
profiled_codeobjects=self.profiled_codeobjects)
@@ -531,13 +524,13 @@ def generate_codeobj_source(self, writer):
cudaMalloc((void**)&dev_array_randn, sizeof({dtype})*{number_elements}*{codeobj.randn_calls})
);
curandGenerateNormal{curand_suffix}(curand_generator, dev_array_randn, {number_elements}*{codeobj.randn_calls}, 0, 1);
'''.format(number_elements=number_elements, codeobj=codeobj, dtype=prefs['devices.cuda_standalone.curand_float_type'],
curand_suffix='Double' if prefs['devices.cuda_standalone.curand_float_type']=='double' else '')
'''.format(number_elements=number_elements, codeobj=codeobj, dtype=c_data_type(prefs['core.default_float_dtype']),
curand_suffix='Double' if prefs['core.default_float_dtype']=='float64' else '')
additional_code.append(code_snippet)
line = "{dtype}* par_array_{name}_randn".format(dtype=prefs['devices.cuda_standalone.curand_float_type'], name=codeobj.name)
line = "{dtype}* par_array_{name}_randn".format(dtype=c_data_type(prefs['core.default_float_dtype']), name=codeobj.name)
device_parameters_lines.append(line)
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_randn = par_array_{name}_randn;".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
name=codeobj.name))
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_randn = par_array_{name}_randn;".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))
host_parameters_lines.append("dev_array_randn")
elif k == "_python_rand" and codeobj.runs_every_tick == False and codeobj.template_name != "synapses_create_generator":
code_snippet = '''
@@ -547,13 +540,13 @@ def generate_codeobj_source(self, writer):
cudaMalloc((void**)&dev_array_rand, sizeof({dtype})*{number_elements}*{codeobj.rand_calls})
);
curandGenerateUniform{curand_suffix}(curand_generator, dev_array_rand, {number_elements}*{codeobj.rand_calls});
'''.format(number_elements=number_elements, codeobj=codeobj, dtype=prefs['devices.cuda_standalone.curand_float_type'],
curand_suffix='Double' if prefs['devices.cuda_standalone.curand_float_type']=='double' else '')
'''.format(number_elements=number_elements, codeobj=codeobj, dtype=c_data_type(prefs['core.default_float_dtype']),
curand_suffix='Double' if prefs['core.default_float_dtype']=='float64' else '')
additional_code.append(code_snippet)
line = "{dtype}* par_array_{name}_rand".format(dtype=prefs['devices.cuda_standalone.curand_float_type'], name=codeobj.name)
line = "{dtype}* par_array_{name}_rand".format(dtype=c_data_type(prefs['core.default_float_dtype']), name=codeobj.name)
device_parameters_lines.append(line)
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_rand = par_array_{name}_rand;".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
name=codeobj.name))
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_rand = par_array_{name}_rand;".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))
host_parameters_lines.append("dev_array_rand")
elif isinstance(v, ArrayVariable):
if k in ['t', 'timestep', '_clock_t', '_clock_timestep', '_source_t', '_source_timestep'] and v.scalar: # monitors have not scalar t variables
@@ -605,15 +598,15 @@ def generate_codeobj_source(self, writer):
# TODO can we just include this in the k == '_python_rand' test above?
if codeobj.rand_calls >= 1 and codeobj.runs_every_tick:
host_parameters_lines.append("dev_{name}_rand".format(name=codeobj.name))
device_parameters_lines.append("{dtype}* par_array_{name}_rand".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
device_parameters_lines.append("{dtype}* par_array_{name}_rand".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_rand = par_array_{name}_rand;".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_rand = par_array_{name}_rand;".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))
if codeobj.randn_calls >= 1 and codeobj.runs_every_tick:
host_parameters_lines.append("dev_{name}_randn".format(name=codeobj.name))
device_parameters_lines.append("{dtype}* par_array_{name}_randn".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
device_parameters_lines.append("{dtype}* par_array_{name}_randn".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_randn = par_array_{name}_randn;".format(dtype=prefs['devices.cuda_standalone.curand_float_type'],
kernel_variables_lines.append("{dtype}* _ptr_array_{name}_randn = par_array_{name}_randn;".format(dtype=c_data_type(prefs['core.default_float_dtype']),
name=codeobj.name))

# Sometimes an array is referred to by to different keys in our
@@ -663,7 +656,7 @@ def generate_rand_source(self, writer):
codeobj_with_rand=codeobj_with_rand,
codeobj_with_randn=codeobj_with_randn,
profiled=self.enable_profiling,
curand_float_type=prefs['devices.cuda_standalone.curand_float_type'])
curand_float_type=c_data_type(prefs['core.default_float_dtype']))
writer.write('rand.*', rand_tmp)

def copy_source_files(self, writer, directory):
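In `device.py`, every use of the removed `devices.cuda_standalone.curand_float_type` preference is replaced by deriving the C type from `core.default_float_dtype` via `c_data_type`, and the curand generator suffix is chosen from the same preference. A rough sketch of that mapping; the import path of `c_data_type` is my assumption, the diff only shows the call:

```python
from numpy import float32, float64
from brian2.codegen.generators.cpp_generator import c_data_type  # assumed import path

for dtype in (float32, float64):
    c_type = c_data_type(dtype)                    # expected: 'float' / 'double'
    curand_suffix = 'Double' if dtype is float64 else ''
    print(c_type,
          'curandGenerateUniform{}'.format(curand_suffix),
          'curandGenerateNormal{}'.format(curand_suffix))
```

curandGenerateUniform/curandGenerateNormal produce single-precision numbers and the Double-suffixed variants produce double-precision numbers, which is why the suffix is tied to the selected dtype.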
10 changes: 3 additions & 7 deletions brian2cuda/templates/objects.cu
@@ -75,7 +75,6 @@ const int brian::_num_{{name}} = {{N}};
//////////////// synapses /////////////////
{% for S in synapses | sort(attribute='name') %}
// {{S.name}}
Synapses<double> brian::{{S.name}}({{S.source|length}}, {{S.target|length}});
int32_t {{S.name}}_source_start_index;
int32_t {{S.name}}_source_stop_index;
bool brian::{{S.name}}_multiple_pre_post = false;
@@ -95,7 +94,7 @@ __device__ int32_t** brian::{{path.name}}_synapse_ids_by_pre;
__device__ int32_t* brian::{{path.name}}_synapse_ids;
__device__ int* brian::{{path.name}}_unique_delay_start_idcs;
__device__ int* brian::{{path.name}}_unique_delays_offset_by_pre;
__device__ SynapticPathway<double> brian::{{path.name}};
__device__ SynapticPathway brian::{{path.name}};
int brian::{{path.name}}_eventspace_idx = 0;
int brian::{{path.name}}_delay;
bool brian::{{path.name}}_scalar_delay;
@@ -113,7 +112,6 @@ int brian::num_threads_per_warp;
__global__ void {{path.name}}_init(
int Nsource,
int Ntarget,
double* delays,
int32_t* sources,
int32_t* targets,
double dt,
@@ -123,7 +121,7 @@ __global__ void {{path.name}}_init(
{
using namespace brian;

{{path.name}}.init(Nsource, Ntarget, delays, sources, targets, dt, start, stop);
{{path.name}}.init(Nsource, Ntarget, sources, targets, dt, start, stop);
}
{% endfor %}
{% endfor %}
@@ -186,7 +184,6 @@ void _init_arrays()
{{path.name}}_init<<<1,1>>>(
{{path.source|length}},
{{path.target|length}},
thrust::raw_pointer_cast(&dev{{dynamic_array_specs[path.variables['delay']]}}[0]),
thrust::raw_pointer_cast(&dev{{dynamic_array_specs[path.synapse_sources]}}[0]),
thrust::raw_pointer_cast(&dev{{dynamic_array_specs[path.synapse_targets]}}[0]),
0, //was dt, maybe irrelevant?
@@ -551,7 +548,6 @@ extern const int _num_{{name}};
//////////////// synapses /////////////////
{% for S in synapses | sort(attribute='name') %}
// {{S.name}}
extern Synapses<double> {{S.name}};
extern bool {{S.name}}_multiple_pre_post;
{% for path in S._pathways | sort(attribute='name') %}
extern __device__ int* {{path.name}}_num_synapses_by_pre;
@@ -568,7 +564,7 @@ extern __device__ int32_t** {{path.name}}_synapse_ids_by_pre;
extern __device__ int32_t* {{path.name}}_synapse_ids;
extern __device__ int* {{path.name}}_unique_delay_start_idcs;
extern __device__ int* {{path.name}}_unique_delays_offset_by_pre;
extern __device__ SynapticPathway<double> {{path.name}};
extern __device__ SynapticPathway {{path.name}};
extern int {{path.name}}_eventspace_idx;
extern int {{path.name}}_delay;
extern bool {{path.name}}_scalar_delay;
6 changes: 4 additions & 2 deletions brian2cuda/templates/ratemonitor.cu
@@ -41,8 +41,10 @@ __launch_bounds__(1024, {{sm_multiplier}})
{% endif %}
kernel_{{codeobj_name}}(
int32_t current_iteration,
double* ratemonitor_rate,
double* ratemonitor_t,
{% set c_type = c_data_type(variables['rate'].dtype) %}
{{c_type}}* ratemonitor_rate,
{% set c_type = c_data_type(variables['t'].dtype) %}
{{c_type}}* ratemonitor_t,
///// DEVICE_PARAMETERS /////
%DEVICE_PARAMETERS%
)
39 changes: 5 additions & 34 deletions brian2cuda/templates/synapses_classes.cu
@@ -11,18 +11,13 @@

#include "brianlib/spikequeue.h"

template<class scalar> class Synapses;
template<class scalar> class SynapticPathway;

template <class scalar>
class SynapticPathway
{
public:
// total number of neurons in source and target NeuronGroup / Subgroup
int Nsource;
int Ntarget;

scalar* dev_delay;
int32_t* dev_sources;
int32_t* dev_targets;

@@ -31,23 +26,22 @@ public:
int spikes_start;
int spikes_stop;

scalar dt;
CudaSpikeQueue<scalar>* queue;
double dt;
CudaSpikeQueue* queue;
bool no_or_const_delay_mode;

//our real constructor
__device__ void init(int _Nsource, int _Ntarget, scalar* d_delay, int32_t* _sources,
int32_t* _targets, scalar _dt, int _spikes_start, int _spikes_stop)
__device__ void init(int _Nsource, int _Ntarget, int32_t* _sources,
int32_t* _targets, double _dt, int _spikes_start, int _spikes_stop)
{
Nsource = _Nsource;
Ntarget = _Ntarget;
dev_delay = d_delay;
dev_sources = _sources;
dev_targets = _targets;
dt = _dt;
spikes_start = _spikes_start;
spikes_stop = _spikes_stop;
queue = new CudaSpikeQueue<scalar>;
queue = new CudaSpikeQueue;
};

//our real destructor
@@ -57,29 +51,6 @@ public:
delete queue;
}
};

template <class scalar>
class Synapses
{
public:
int _N_value;
inline double _N() { return _N_value;};
int Nsource;
int Ntarget;
std::vector< std::vector<int> > _pre_synaptic;
std::vector< std::vector<int> > _post_synaptic;

Synapses(int _Nsource, int _Ntarget)
: Nsource(_Nsource), Ntarget(_Ntarget)
{
for(int i=0; i<Nsource; i++)
_pre_synaptic.push_back(std::vector<int>());
for(int i=0; i<Ntarget; i++)
_post_synaptic.push_back(std::vector<int>());
_N_value = 0;
};
};

#endif

{% endmacro %}
4 changes: 0 additions & 4 deletions brian2cuda/tests/features/cuda_configuration.py
@@ -163,10 +163,6 @@ class CUDAStandaloneConfigurationNoAssert(CUDAStandaloneConfigurationBase):
name = 'CUDA standalone (asserts disabled)'
device_kwargs = {'disable_asserts': True}

class CUDAStandaloneConfigurationCurandDouble(CUDAStandaloneConfigurationBase):
name = 'CUDA standalone (curand_float_type = double)'
extra_prefs = {'devices.cuda_standalone.curand_float_type': 'double'}

class CUDAStandaloneConfigurationNoCudaOccupancyAPI(CUDAStandaloneConfigurationBase):
name = 'CUDA standalone (not using cuda occupancy API)'
extra_prefs = {'devices.cuda_standalone.calc_occupancy': False}
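With the curand-specific benchmark configuration removed, an equivalent single-precision configuration would now be expressed through `core.default_float_dtype`. The class below is a hypothetical sketch for illustration, not part of this commit:

```python
from numpy import float32

from brian2cuda.tests.features.cuda_configuration import CUDAStandaloneConfigurationBase


class CUDAStandaloneConfigurationSinglePrecision(CUDAStandaloneConfigurationBase):
    # Hypothetical: runs the standard benchmarks with single-precision state variables.
    name = 'CUDA standalone (single precision)'
    extra_prefs = {'core.default_float_dtype': float32}
```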