From 9ed439b97c7a09f4ef6a603b356cf1c6a22e5ef1 Mon Sep 17 00:00:00 2001 From: Routhleck <1310722434@qq.com> Date: Sun, 10 Dec 2023 19:43:23 +0800 Subject: [PATCH] Optimize event csr matvec with taichi customized op and Add taichi event csr matvec benchmark --- brainpy/_src/math/event/_csr_matvec_taichi.py | 193 ++++++----- .../event_csrmv_taichi_VS_event_csrmv.py | 306 ++++++++++++++++++ 2 files changed, 426 insertions(+), 73 deletions(-) create mode 100644 brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py diff --git a/brainpy/_src/math/event/_csr_matvec_taichi.py b/brainpy/_src/math/event/_csr_matvec_taichi.py index de5bb7c3a..7505e5623 100644 --- a/brainpy/_src/math/event/_csr_matvec_taichi.py +++ b/brainpy/_src/math/event/_csr_matvec_taichi.py @@ -19,6 +19,7 @@ 'csrmv_taichi' ] +### CPU @ti.kernel def _event_csr_matvec_transpose_bool_cpu(values: ti.types.ndarray(ndim=1), @@ -115,93 +116,114 @@ def _event_csr_matvec_cpu(values: ti.types.ndarray(ndim=1), r += values[j] out[row_i] = r +### GPU +# homo @ti.kernel -def _event_csr_matvec_transpose_bool_gpu(values: ti.types.ndarray(ndim=1), +def _event_csr_matvec_transpose_bool_homo_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for row_i in ti.ndrange(indptr.shape[0] - 1): + if events[row_i]: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += value + + +@ti.kernel +def _event_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1), indices: ti.types.ndarray(ndim=1), indptr: ti.types.ndarray(ndim=1), events: ti.types.ndarray(ndim=1), out: ti.types.ndarray(ndim=1)): - if values.shape[0] == 1: - value = values[0] - for row_i in range(indptr.shape[0] - 1): - if events[row_i]: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += value - - else: - for row_i in range(indptr.shape[0] - 1): - if events[row_i]: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += values[j] + value = values[0] + for row_i in ti.ndrange(indptr.shape[0] - 1): + if events[row_i] > 0.: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += value @ti.kernel -def _event_csr_matvec_transpose_gpu(values: ti.types.ndarray(ndim=1), +def _event_csr_matvec_bool_homo_gpu(values: ti.types.ndarray(ndim=1), indices: ti.types.ndarray(ndim=1), indptr: ti.types.ndarray(ndim=1), events: ti.types.ndarray(ndim=1), out: ti.types.ndarray(ndim=1)): - if values.shape[0] == 1: - value = values[0] - for row_i in range(indptr.shape[0] - 1): - if events[row_i] > 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += value - - else: - for row_i in range(indptr.shape[0] - 1): - if events[row_i] > 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += values[j] - + value = values[0] + for row_i in ti.ndrange(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]]: + r += value + out[row_i] = r @ti.kernel -def _event_csr_matvec_bool_gpu(values: ti.types.ndarray(ndim=1), +def _event_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1), indices: ti.types.ndarray(ndim=1), indptr: ti.types.ndarray(ndim=1), events: ti.types.ndarray(ndim=1), out: ti.types.ndarray(ndim=1)): - if values.shape[0] == 1: - value = values[0] - for row_i in range(indptr.shape[0] - 1): - r = 0. 
- for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]]: - r += value - out[row_i] = r + value = values[0] + for row_i in ti.ndrange(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]] > 0.: + r += value + out[row_i] = r - else: - for row_i in range(indptr.shape[0] - 1): - r = 0. +# heter + +@ti.kernel +def _event_csr_matvec_transpose_bool_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for row_i in ti.ndrange(indptr.shape[0] - 1): + if events[row_i]: for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]]: - r += values[j] - out[row_i] = r + out[indices[j]] += values[j] @ti.kernel -def _event_csr_matvec_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - if values.shape[0] == 1: - value = values[0] - for row_i in range(indptr.shape[0] - 1): - r = 0. +def _event_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for row_i in ti.ndrange(indptr.shape[0] - 1): + if events[row_i] > 0.: for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]] > 0.: - r += value - out[row_i] = r + out[indices[j]] += values[j] - else: - for row_i in range(indptr.shape[0] - 1): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]] > 0.: - r += values[j] - out[row_i] = r + +@ti.kernel +def _event_csr_matvec_bool_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for row_i in ti.ndrange(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]]: + r += values[j] + out[row_i] = r + +@ti.kernel +def _event_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for row_i in ti.ndrange(indptr.shape[0] - 1): + r = 0. 
+ for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]] > 0.: + r += values[j] + out[row_i] = r def _event_csr_matvec_jvp_values(val_dot, values, indices, indptr, events, *, outs, transpose, shape): @@ -313,14 +335,26 @@ def csrmv_taichi( if transpose: if events.dtype == jnp.bool_: - prim = _event_csrmv_transpose_bool_p + if events.shape[0] == 1: + prim = _event_csrmv_transpose_bool_homo_p + else: + prim = _event_csrmv_transpose_bool_heter_p else: - prim = _event_csrmv_transpose_p + if events.shape[0] == 1: + prim = _event_csrmv_transpose_homo_p + else: + prim = _event_csrmv_transpose_heter_p else: if events.dtype == jnp.bool_: - prim = _event_csrmv_bool_p + if events.shape[0] == 1: + prim = _event_csrmv_bool_homo_p + else: + prim = _event_csrmv_bool_heter_p else: - prim = _event_csrmv_p + if events.shape[0] == 1: + prim = _event_csrmv_homo_p + else: + prim = _event_csrmv_heter_p # computing return prim(data, @@ -339,14 +373,27 @@ def _define_op(cpu_kernel, gpu_kernel): return prim -# transpose bool -_event_csrmv_transpose_bool_p = _define_op(_event_csr_matvec_transpose_bool_cpu, _event_csr_matvec_transpose_bool_gpu) +# transpose bool homo +_event_csrmv_transpose_bool_homo_p = _define_op(_event_csr_matvec_transpose_bool_cpu, _event_csr_matvec_transpose_bool_homo_gpu) + +# transpose homo +_event_csrmv_transpose_homo_p = _define_op(_event_csr_matvec_transpose_cpu, _event_csr_matvec_transpose_homo_gpu) + +# not transpose bool homo +_event_csrmv_bool_homo_p = _define_op(_event_csr_matvec_bool_cpu, _event_csr_matvec_bool_homo_gpu) + +# not transpose homo +_event_csrmv_homo_p = _define_op(_event_csr_matvec_cpu, _event_csr_matvec_homo_gpu) + +# transpose bool heter +_event_csrmv_transpose_bool_heter_p = _define_op(_event_csr_matvec_transpose_bool_cpu, _event_csr_matvec_transpose_bool_heter_gpu) + +# transpose heter +_event_csrmv_transpose_heter_p = _define_op(_event_csr_matvec_transpose_cpu, _event_csr_matvec_transpose_heter_gpu) -# transpose -_event_csrmv_transpose_p = _define_op(_event_csr_matvec_transpose_cpu, _event_csr_matvec_transpose_gpu) +# not transpose bool heter +_event_csrmv_bool_heter_p = _define_op(_event_csr_matvec_bool_cpu, _event_csr_matvec_bool_heter_gpu) -# not transpose bool -_event_csrmv_bool_p = _define_op(_event_csr_matvec_bool_cpu, _event_csr_matvec_bool_gpu) +# not transpose heter +_event_csrmv_heter_p = _define_op(_event_csr_matvec_cpu, _event_csr_matvec_heter_gpu) -# not transpose -_event_csrmv_p = _define_op(_event_csr_matvec_cpu, _event_csr_matvec_gpu) diff --git a/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py b/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py new file mode 100644 index 000000000..a61ef4d25 --- /dev/null +++ b/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py @@ -0,0 +1,306 @@ +# from jax_taichi import jax_taichi_call + +import time +from functools import partial +import os + +import brainpy as bp +import brainpy.math as bm +import jax +import jax.numpy as jnp +import numpy as np +import pandas as pd +import taichi as ti + +bm.set_platform('cpu') + +s = [1000, 2500, 5000, 10000, 25000, 50000] +p = [0.1, 0.2, 0.3, 0.4, 0.5] +values_type = ['homo', 'heter'] +events_type = ['bool', 'float'] + +print(bm.get_platform()) + + +def test_event_ell_cpu(s, p, values_type, events_type): + print('s: ', s, 'p: ', p) + k = int(s * p) + bm.random.seed(1234) + rng = bm.random.RandomState(seed=1234) + # init + indices = bm.random.randint(0, s, (s, k)) + vector = bm.random.rand(s) < 0.5 + weight 
= jnp.array([1.0]) + csr_indices = indices.flatten() + csr_indptr = np.cumsum(np.insert(np.ones(s, dtype=int) * k, 0, 0)) + + pre_indices = np.repeat(np.arange(s), k) + dense = np.zeros((s, s)) + dense[pre_indices, csr_indices] = 1.0 + + if events_type == 'float': + vector = vector.astype(np.float32) + vector[vector == 1.0] = bm.random.rand(bm.sum(vector == 1.0)) + if values_type == 'heter': + heter_data = bm.as_jax(rng.random(csr_indices.shape)) + weight = heter_data + + # groundtruth = bm.as_jax(vector, dtype=float) @ bm.as_jax(dense) + + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + # time.sleep(2) + + time0 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time1 = time.time() + # time.sleep(2) + + time2 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time3 = time.time() + # time.sleep(2) + + time4 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time5 = time.time() + # time.sleep(2) + + time6 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time7 = time.time() + + time8 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time9 = time.time() + + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) +# print(result1[0]) +# print(result2) +# print(groundtruth - result1[0]) +# print(groundtruth - result2) + + # print(result1[0] - result2) + # print(bm.allclose(groundtruth, result1[0])) + # print(bm.allclose(groundtruth, result2)) + # assert bm.allclose(result1[0], result2) + + time12 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time13 = time.time() + # time.sleep(2) + + time14 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time15 = time.time() + # time.sleep(2) + + time16 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time17 = time.time() + # time.sleep(2) + + time18 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time19 = time.time() + + time20 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time21 = time.time() + + taichi_aot_time1 = (time1 - time0) * 1000 + taichi_aot_time2 = (time3 - time2) * 1000 + taichi_aot_time3 = (time5 - time4) * 1000 + taichi_aot_time4 = (time7 - time6) * 1000 + taichi_aot_time5 = (time9 - time8) * 1000 + brainpy_time1 = (time13 - time12) * 1000 + brainpy_time2 = (time15 - time14) * 1000 + brainpy_time3 = (time17 - time16) * 1000 + brainpy_time4 = (time19 - time18) * 1000 + brainpy_time5 = (time21 - time20) * 1000 + + print('taichi_aot_1: ', taichi_aot_time1, 'ms') + print('taichi_aot_2: ', taichi_aot_time2, 'ms') + print('taichi_aot_3: ', taichi_aot_time3, 'ms') + print('taichi_aot_4: ', taichi_aot_time4, 'ms') + 
print('taichi_aot_5: ', taichi_aot_time5, 'ms') + print('brainpylib_cpu_1: ', brainpy_time1, 'ms') + print('brainpylib_cpu_2: ', brainpy_time2, 'ms') + print('brainpylib_cpu_3: ', brainpy_time3, 'ms') + print('brainpylib_cpu_4: ', brainpy_time4, 'ms') + print('brainpylib_cpu_5: ', brainpy_time5, 'ms') + assert(jnp.allclose(result1[0], result2)) + + speedup = (brainpy_time1 + brainpy_time2 + brainpy_time3 + brainpy_time4 + brainpy_time5) / \ + (taichi_aot_time1 + taichi_aot_time2 + taichi_aot_time3 + taichi_aot_time4 + taichi_aot_time5) - 1 + + return taichi_aot_time1, taichi_aot_time2, taichi_aot_time3, taichi_aot_time4, taichi_aot_time5,\ + brainpy_time1, brainpy_time2, brainpy_time3, brainpy_time4, brainpy_time5, speedup + +def test_event_ell_gpu(s, p, values_type, events_type): + print('s: ', s, 'p: ', p) + k = int(s * p) + bm.random.seed(1234) + rng = bm.random.RandomState(seed=1234) + # init + indices = bm.random.randint(0, s, (s, k)) + vector = bm.random.rand(s) < 0.5 + weight = jnp.array([1.0]) + csr_indices = indices.flatten() + csr_indptr = np.cumsum(np.insert(np.ones(s, dtype=int) * k, 0, 0)) + pre_indices = np.repeat(np.arange(s), k) + dense = np.zeros((s, s)) + dense[pre_indices, csr_indices] = 1.0 + + if events_type == 'float': + vector = vector.astype(np.float32) + vector[vector == 1.0] = bm.random.rand(bm.sum(vector == 1.0)) + if values_type == 'heter': + heter_data = bm.as_jax(rng.random(csr_indices.shape)) + weight = heter_data + + groundtruth = bm.as_jax(vector, dtype=float) @ bm.as_jax(dense) + + + + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + # time.sleep(2) + + time0 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time1 = time.time() + # time.sleep(2) + + time2 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time3 = time.time() + # time.sleep(2) + + time4 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time5 = time.time() + # time.sleep(2) + + time6 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time7 = time.time() + + time8 = time.time() + result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time9 = time.time() + + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + # print('--------------------result1[0]------------------') + # print(result1[0]) + # print('--------------------result2------------------') + # print(result2) + # print('--------------------gt - result1[0]------------------') + # print(groundtruth - result1[0]) + # print('--------------------gt - result2------------------') + # print(groundtruth - result2) + + # print(result1[0] - result2) + # print(bm.allclose(groundtruth, result1[0])) + # print(bm.allclose(groundtruth, result2)) + # assert bm.allclose(result1[0], result2) + + time12 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time13 = time.time() + # time.sleep(2) + + time14 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, 
csr_indptr, vector, shape=(s, s), transpose=True)) + time15 = time.time() + # time.sleep(2) + + time16 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time17 = time.time() + # time.sleep(2) + + time18 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time19 = time.time() + + time20 = time.time() + result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True)) + time21 = time.time() + + taichi_aot_time1 = (time1 - time0) * 1000 + taichi_aot_time2 = (time3 - time2) * 1000 + taichi_aot_time3 = (time5 - time4) * 1000 + taichi_aot_time4 = (time7 - time6) * 1000 + taichi_aot_time5 = (time9 - time8) * 1000 + brainpy_time1 = (time13 - time12) * 1000 + brainpy_time2 = (time15 - time14) * 1000 + brainpy_time3 = (time17 - time16) * 1000 + brainpy_time4 = (time19 - time18) * 1000 + brainpy_time5 = (time21 - time20) * 1000 + + print('taichi_aot_1: ', taichi_aot_time1, 'ms') + print('taichi_aot_2: ', taichi_aot_time2, 'ms') + print('taichi_aot_3: ', taichi_aot_time3, 'ms') + print('taichi_aot_4: ', taichi_aot_time4, 'ms') + print('taichi_aot_5: ', taichi_aot_time5, 'ms') + print('brainpylib_gpu_1: ', brainpy_time1, 'ms') + print('brainpylib_gpu_2: ', brainpy_time2, 'ms') + print('brainpylib_gpu_3: ', brainpy_time3, 'ms') + print('brainpylib_gpu_4: ', brainpy_time4, 'ms') + print('brainpylib_gpu_5: ', brainpy_time5, 'ms') + + # assert(jnp.allclose(result1[0], result2)) + + speedup = (brainpy_time1 + brainpy_time2 + brainpy_time3 + brainpy_time4 + brainpy_time5) / \ + (taichi_aot_time1 + taichi_aot_time2 + taichi_aot_time3 + taichi_aot_time4 + taichi_aot_time5) - 1 + + return taichi_aot_time1, taichi_aot_time2, taichi_aot_time3, taichi_aot_time4, taichi_aot_time5,\ + brainpy_time1, brainpy_time2, brainpy_time3, brainpy_time4, brainpy_time5, speedup + +bm.set_platform('cpu') +block_dim = 64 +# init dataframe +df = pd.DataFrame(columns=['s', 'p', 'backend', 'values type', 'events type' + 'taichi aot time1(ms)', 'taichi aot time2(ms)', 'taichi aot time3(ms)', 'taichi aot time4(ms)', 'taichi aot time5(ms)', + 'brainpy time1(ms)', 'brainpy time2(ms)', 'brainpy time3(ms)', 'brainpy time4(ms)', 'brainpy time5(ms)', + 'speedup']) + +if (bm.get_platform() == 'cpu'): + for _s in s: + for _p in p: + for _values_type in values_type: + for _events_type in events_type: + taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\ + brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_cpu(_s, _p, _values_type, _events_type) + # append to dataframe + df.loc[df.shape[0]] = [_s, _p, 'cpu', _values_type, _events_type, + taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5, + brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup] + df.to_csv('event_csrmv_cpu.csv', index=False) + +if (bm.get_platform() == 'gpu'): + for _s in s: + for _p in p: + for _values_type in values_type: + for _events_type in events_type: + taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\ + brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_gpu(_s, _p, _values_type, _events_type) + # append to dataframe + df.loc[df.shape[0]] = [_s, _p, 'gpu', _values_type, 
_events_type,
+                               taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,
+                               brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup]
+    df.to_csv('event_csrmv_gpu.csv', index=False)
+
+# if (bm.get_platform() == 'gpu'):
+#   for _s in s:
+#     for _p in p:
+#       taichi_aot_avg_time = test_event_ell_gpu_taichi(_s, _p)
+#       df.loc[df.shape[0]] = [_s, _p, 'gpu', block_dim, taichi_aot_avg_time, 0]
+#   df.to_csv('event_ell_gpu.csv', index=False)
+
+  # df = pd.read_csv('event_ell_gpu.csv')
+  # for _s in s:
+  #   for _p in p:
+  #     brainpy_avg_time = test_event_ell_gpu_brainpylib(_s, _p)
+  #     # find the matching row
+  #     df.loc[(df['s'] == _s) & (df['p'] == _p) & (df['backend'] == 'gpu'), 'brainpy avg time(ms)'] = brainpy_avg_time
+  # df.to_csv('event_ell_gpu.csv', index=False)
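
For readers tracing the kernel variants in the patch above: the CPU kernels and the eight GPU kernels all compute the same event-driven CSR matrix-vector product, differing only in whether the events are bool or float, whether the weight is a single shared value ("homo", values.shape[0] == 1) or one value per connection ("heter"), and whether the matrix is applied transposed. The following pure-NumPy sketch spells out that shared semantics; it is illustrative only, and the name csr_event_matvec_ref is invented here rather than taken from the patch.

import numpy as np

def csr_event_matvec_ref(values, indices, indptr, events, *, shape, transpose):
    """Reference for the event-driven CSR SpMV computed by the Taichi kernels.

    values : (nnz,) per-connection weights, or (1,) for a homogeneous weight.
    indices, indptr : CSR structure of a matrix of `shape` (m, n).
    events : bool or float spike vector; a float entry counts when > 0.
    """
    m, n = shape
    active = events if events.dtype == np.bool_ else (events > 0.)
    homo = values.shape[0] == 1
    out = np.zeros(n if transpose else m)
    for i in range(m):
        lo, hi = indptr[i], indptr[i + 1]
        if transpose:
            # scatter row i into the output only when the presynaptic event fires
            if active[i]:
                for j in range(lo, hi):
                    out[indices[j]] += values[0] if homo else values[j]
        else:
            # gather contributions from the active columns of row i
            r = 0.
            for j in range(lo, hi):
                if active[indices[j]]:
                    r += values[0] if homo else values[j]
            out[i] = r
    return out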
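
The benchmark script builds a fixed-fanout connectivity in CSR form: each of the s presynaptic rows gets exactly k random postsynaptic indices, so the row pointer is simply the multiples of k. A small worked example of that layout, assuming nothing beyond NumPy and using tiny sizes purely for illustration:

import numpy as np

s, k = 4, 3
indices = np.random.randint(0, s, (s, k))        # (s, k) postsynaptic ids per row
csr_indices = indices.flatten()                  # (s*k,) column ids, row-major
csr_indptr = np.cumsum(np.insert(np.ones(s, dtype=int) * k, 0, 0))
assert csr_indptr.tolist() == [0, 3, 6, 9, 12]   # row i owns slots [i*k, (i+1)*k)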
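
The five timing repetitions in the benchmark are written out by hand (time0 through time21): one untimed warm-up call to trigger compilation, then five timed calls wrapped in jax.block_until_ready. If one wanted to factor that pattern, a sketch like the following would take the same measurements; measure and n_repeat are hypothetical names introduced only for this sketch and are not part of the patch.

import time
import jax

def measure(fn, n_repeat=5):
    jax.block_until_ready(fn())               # warm-up run (compilation)
    times_ms = []
    for _ in range(n_repeat):
        t0 = time.time()
        jax.block_until_ready(fn())           # timed run, synced before stopping the clock
        times_ms.append((time.time() - t0) * 1000)
    return times_ms

# Usage mirroring the benchmark's calls:
# taichi_ms  = measure(lambda: bm.event.csrmv_taichi(weight, csr_indices, csr_indptr,
#                                                    vector, shape=(s, s), transpose=True))
# brainpy_ms = measure(lambda: bm.event.csrmv(weight, csr_indices, csr_indptr,
#                                             vector, shape=(s, s), transpose=True))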