From 66724b1ef71a1d55561b0191084843fe0c006665 Mon Sep 17 00:00:00 2001
From: routhleck <1310722434@qq.com>
Date: Tue, 12 Dec 2023 13:55:02 +0800
Subject: [PATCH] Add more benchmarks

---
 brainpy/_src/math/event/_csr_matvec_taichi.py | 59 +++++++++++-
 .../event_csrmv_taichi_VS_event_csrmv.py      | 91 ++++++++++---------
 .../sparse/tests/csrmv_taichi_VS_csrmv.py     | 59 ++++++------
 3 files changed, 134 insertions(+), 75 deletions(-)

diff --git a/brainpy/_src/math/event/_csr_matvec_taichi.py b/brainpy/_src/math/event/_csr_matvec_taichi.py
index 829b343aa..4c48f27d6 100644
--- a/brainpy/_src/math/event/_csr_matvec_taichi.py
+++ b/brainpy/_src/math/event/_csr_matvec_taichi.py
@@ -126,12 +126,19 @@ def _event_csr_matvec_transpose_bool_homo_gpu(values: ti.types.ndarray(ndim=1),
                                               events: ti.types.ndarray(ndim=1),
                                               out: ti.types.ndarray(ndim=1)):
   value = values[0]
+  # total_rows = indptr.shape[0] - 1
+  # for i in range(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   if events[row_i]:
+  #     for j in range(indptr[row_i], indptr[row_i + 1]):
+  #       if j % 32 == index:
+  #         out[indices[j]] += value
   for row_i in ti.ndrange(indptr.shape[0] - 1):
     if events[row_i]:
       for j in range(indptr[row_i], indptr[row_i + 1]):
         out[indices[j]] += value
 
-
 @ti.kernel
 def _event_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1),
                                          indices: ti.types.ndarray(ndim=1),
@@ -139,6 +146,14 @@ def _event_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1),
                                          events: ti.types.ndarray(ndim=1),
                                          out: ti.types.ndarray(ndim=1)):
   value = values[0]
+  # total_rows = indptr.shape[0] - 1
+  # for i in range(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   if events[row_i] > 0.:
+  #     for j in range(indptr[row_i], indptr[row_i + 1]):
+  #       if j % 32 == index:
+  #         out[indices[j]] += value
   for row_i in ti.ndrange(indptr.shape[0] - 1):
     if events[row_i] > 0.:
       for j in range(indptr[row_i], indptr[row_i + 1]):
@@ -152,7 +167,16 @@ def _event_csr_matvec_bool_homo_gpu(values: ti.types.ndarray(ndim=1),
                                     events: ti.types.ndarray(ndim=1),
                                     out: ti.types.ndarray(ndim=1)):
   value = values[0]
-  for row_i in ti.ndrange(indptr.shape[0] - 1):
+  # total_rows = indptr.shape[0] - 1
+  # for i in ti.ndrange(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   r = 0.
+  #   for j in range(indptr[row_i], indptr[row_i + 1]):
+  #       if j % 32 == index and events[indices[j]]:
+  #         r += value
+  #   out[row_i] += r
+  for row_i in range(indptr.shape[0] - 1):
     r = 0.
     for j in range(indptr[row_i], indptr[row_i + 1]):
       if events[indices[j]]:
@@ -166,7 +190,7 @@ def _event_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1),
                                events: ti.types.ndarray(ndim=1),
                                out: ti.types.ndarray(ndim=1)):
   value = values[0]
-  for row_i in ti.ndrange(indptr.shape[0] - 1):
+  for row_i in range(indptr.shape[0] - 1):
     r = 0.
     for j in range(indptr[row_i], indptr[row_i + 1]):
       if events[indices[j]] > 0.:
@@ -181,6 +205,14 @@ def _event_csr_matvec_transpose_bool_heter_gpu(values: ti.types.ndarray(ndim=1),
                                               indptr: ti.types.ndarray(ndim=1),
                                               events: ti.types.ndarray(ndim=1),
                                               out: ti.types.ndarray(ndim=1)):
+  # total_rows = indptr.shape[0] - 1
+  # for i in range(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   if events[row_i]:
+  #     for j in range(indptr[row_i], indptr[row_i + 1]):
+  #       if j % 32 == index:
+  #         out[indices[j]] += values[j]
   for row_i in ti.ndrange(indptr.shape[0] - 1):
     if events[row_i]:
       for j in range(indptr[row_i], indptr[row_i + 1]):
@@ -193,6 +225,14 @@ def _event_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1),
                                          indptr: ti.types.ndarray(ndim=1),
                                          events: ti.types.ndarray(ndim=1),
                                          out: ti.types.ndarray(ndim=1)):
+  # total_rows = indptr.shape[0] - 1
+  # for i in range(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   if events[row_i] > 0.:
+  #     for j in range(indptr[row_i], indptr[row_i + 1]):
+  #       if j % 32 == index:
+  #         out[indices[j]] += values[j]
   for row_i in ti.ndrange(indptr.shape[0] - 1):
     if events[row_i] > 0.:
       for j in range(indptr[row_i], indptr[row_i + 1]):
@@ -205,7 +245,16 @@ def _event_csr_matvec_bool_heter_gpu(values: ti.types.ndarray(ndim=1),
                                     indptr: ti.types.ndarray(ndim=1),
                                     events: ti.types.ndarray(ndim=1),
                                     out: ti.types.ndarray(ndim=1)):
-  for row_i in ti.ndrange(indptr.shape[0] - 1):
+  # total_rows = indptr.shape[0] - 1
+  # for i in ti.ndrange(total_rows * 32):
+  #   row_i = ti.cast(ti.floor(i / 32), ti.i32)
+  #   index = i % 32
+  #   r = 0.
+  #   for j in range(indptr[row_i], indptr[row_i + 1]):
+  #     if j % 32 == index and events[indices[j]]:
+  #       r += values[j]
+  #   out[row_i] += r
+  for row_i in range(indptr.shape[0] - 1):
     r = 0.
     for j in range(indptr[row_i], indptr[row_i + 1]):
       if events[indices[j]]:
@@ -218,7 +267,7 @@ def _event_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1),
                                indptr: ti.types.ndarray(ndim=1),
                                events: ti.types.ndarray(ndim=1),
                                out: ti.types.ndarray(ndim=1)):
-  for row_i in ti.ndrange(indptr.shape[0] - 1):
+  for row_i in range(indptr.shape[0] - 1):
     r = 0.
     for j in range(indptr[row_i], indptr[row_i + 1]):
       if events[indices[j]] > 0.:
diff --git a/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py b/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py
index 8a8c9a067..e81d2ea1e 100644
--- a/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py
+++ b/brainpy/_src/math/event/tests/event_csrmv_taichi_VS_event_csrmv.py
@@ -16,13 +16,18 @@
 
 s = [1000, 2500, 5000, 10000, 25000, 50000]
 p = [0.1, 0.2, 0.3, 0.4, 0.5]
-values_type = ['homo', 'heter']
-events_type = ['bool', 'float']
+values_type = ['homo', 
+               'heter']
+events_type = ['bool', 
+               'float',
+               ]
+transpose = [True, 
+             False]
 
 print(bm.get_platform())
 
 
-def test_event_ell_cpu(s, p, values_type, events_type):
+def test_event_ell_cpu(s, p, values_type, events_type, transpose):
   print('s: ', s, 'p: ', p)
   k = int(s * p)
   bm.random.seed(1234)
@@ -39,43 +44,42 @@ def test_event_ell_cpu(s, p, values_type, events_type):
   dense[pre_indices, csr_indices] = 1.0
 
   if events_type == 'float':
-    vector = vector.astype(np.float32)
-    vector[vector == 1.0] = bm.random.rand(bm.sum(vector == 1.0))
+    vector = vector.astype(bm.float32)
   if values_type == 'heter':
     heter_data = bm.as_jax(rng.random(csr_indices.shape))
     weight = heter_data
 
   # groundtruth = bm.as_jax(vector, dtype=float) @ bm.as_jax(dense)
 
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # time.sleep(2)
 
   time0 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time1 = time.time()
   # time.sleep(2)
 
   time2 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time3 = time.time()
   # time.sleep(2)
 
   time4 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time5 = time.time()
   # time.sleep(2)
 
   time6 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time7 = time.time()
 
   time8 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time9 = time.time()
 
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
-#   print(result1[0])
-#   print(result2)
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
+  # print(result1[0])
+  # print(result2)
 #   print(groundtruth - result1[0])
 #   print(groundtruth - result2)
   
@@ -85,26 +89,26 @@ def test_event_ell_cpu(s, p, values_type, events_type):
   # assert bm.allclose(result1[0], result2)
 
   time12 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time13 = time.time()
   # time.sleep(2)
 
   time14 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time15 = time.time()
   # time.sleep(2)
 
   time16 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time17 = time.time()
   # time.sleep(2)
 
   time18 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time19 = time.time()
 
   time20 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time21 = time.time()
 
   taichi_aot_time1 = (time1 - time0) * 1000
@@ -136,7 +140,7 @@ def test_event_ell_cpu(s, p, values_type, events_type):
   return taichi_aot_time1, taichi_aot_time2, taichi_aot_time3, taichi_aot_time4, taichi_aot_time5,\
       brainpy_time1, brainpy_time2, brainpy_time3, brainpy_time4, brainpy_time5, speedup
 
-def test_event_ell_gpu(s, p, values_type, events_type):
+def test_event_ell_gpu(s, p, values_type, events_type, transpose):
   print('s: ', s, 'p: ', p)
   k = int(s * p)
   bm.random.seed(1234)
@@ -152,8 +156,7 @@ def test_event_ell_gpu(s, p, values_type, events_type):
   dense[pre_indices, csr_indices] = 1.0
 
   if events_type == 'float':
-    vector = vector.astype(np.float32)
-    vector[vector == 1.0] = bm.random.rand(bm.sum(vector == 1.0))
+    vector = vector.astype(bm.float32)
   if values_type == 'heter':
     heter_data = bm.as_jax(rng.random(csr_indices.shape))
     weight = heter_data
@@ -162,37 +165,39 @@ def test_event_ell_gpu(s, p, values_type, events_type):
 
 
 
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # time.sleep(2)
 
   time0 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time1 = time.time()
   # time.sleep(2)
 
   time2 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time3 = time.time()
   # time.sleep(2)
 
   time4 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time5 = time.time()
   # time.sleep(2)
 
   time6 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time7 = time.time()
 
   time8 = time.time()
-  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.event.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time9 = time.time()
 
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # print('--------------------result1[0]------------------')
   # print(result1[0])
   # print('--------------------result2------------------')
   # print(result2)
+  # print('--------------------gt------------------')
+  # print(groundtruth)
   # print('--------------------gt - result1[0]------------------')
   # print(groundtruth - result1[0])
   # print('--------------------gt - result2------------------')
@@ -204,26 +209,26 @@ def test_event_ell_gpu(s, p, values_type, events_type):
   # assert bm.allclose(result1[0], result2)
 
   time12 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time13 = time.time()
   # time.sleep(2)
 
   time14 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time15 = time.time()
   # time.sleep(2)
 
   time16 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time17 = time.time()
   # time.sleep(2)
 
   time18 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time19 = time.time()
 
   time20 = time.time()
-  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.event.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time21 = time.time()
 
   taichi_aot_time1 = (time1 - time0) * 1000
@@ -236,7 +241,7 @@ def test_event_ell_gpu(s, p, values_type, events_type):
   brainpy_time3 = (time17 - time16) * 1000
   brainpy_time4 = (time19 - time18) * 1000
   brainpy_time5 = (time21 - time20) * 1000
-
+  print('s: ', s, 'p: ', p, 'values_type: ', values_type, 'events_type: ', events_type, 'transpose: ', transpose)
   print('taichi_aot_1: ', taichi_aot_time1, 'ms')
   print('taichi_aot_2: ', taichi_aot_time2, 'ms')
   print('taichi_aot_3: ', taichi_aot_time3, 'ms')
@@ -257,7 +262,7 @@ def test_event_ell_gpu(s, p, values_type, events_type):
       brainpy_time1, brainpy_time2, brainpy_time3, brainpy_time4, brainpy_time5, speedup
 
 # init dataframe
-df = pd.DataFrame(columns=['s', 'p', 'backend', 'values type', 'events type',
+df = pd.DataFrame(columns=['s', 'p', 'backend', 'values type', 'events type', 'transpose',
                            'taichi aot time1(ms)', 'taichi aot time2(ms)', 'taichi aot time3(ms)', 'taichi aot time4(ms)', 'taichi aot time5(ms)',
                            'brainpy time1(ms)', 'brainpy time2(ms)', 'brainpy time3(ms)', 'brainpy time4(ms)', 'brainpy time5(ms)',
                            'speedup'])
@@ -267,10 +272,11 @@ def test_event_ell_gpu(s, p, values_type, events_type):
       for _p in p:
           for _values_type in values_type:
              for _events_type in events_type:
-              taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
-                  brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_cpu(_s, _p, _values_type, _events_type)
+              for _transpose in transpose:
+                taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
+                    brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_cpu(_s, _p, _values_type, _events_type, _transpose)
               # append to dataframe
-              df.loc[df.shape[0]] = [_s, _p, 'cpu', _values_type, _events_type,
+                df.loc[df.shape[0]] = [_s, _p, 'cpu', _values_type, _events_type, _transpose,
                                     taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,
                                     brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup]
   df.to_csv('event_csrmv_cpu.csv', index=False)
@@ -280,10 +286,11 @@ def test_event_ell_gpu(s, p, values_type, events_type):
       for _p in p:
           for _values_type in values_type:
              for _events_type in events_type:
-              taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
-                  brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_gpu(_s, _p, _values_type, _events_type)
+              for _transpose in transpose:
+                taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
+                    brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_gpu(_s, _p, _values_type, _events_type, _transpose)
               # append to dataframe
-              df.loc[df.shape[0]] = [_s, _p, 'gpu', _values_type, _events_type,
+                df.loc[df.shape[0]] = [_s, _p, 'gpu', _values_type, _events_type, _transpose,
                                     taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,
                                     brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup]
   df.to_csv('event_csrmv_gpu.csv', index=False)
diff --git a/brainpy/_src/math/sparse/tests/csrmv_taichi_VS_csrmv.py b/brainpy/_src/math/sparse/tests/csrmv_taichi_VS_csrmv.py
index 887d501be..b11e98abc 100644
--- a/brainpy/_src/math/sparse/tests/csrmv_taichi_VS_csrmv.py
+++ b/brainpy/_src/math/sparse/tests/csrmv_taichi_VS_csrmv.py
@@ -18,11 +18,12 @@
 p = [0.1, 0.2, 0.3, 0.4, 0.5]
 values_type = ['homo', 'heter']
 events_type = ['float']
+transpose = [True, False]
 
 print(bm.get_platform())
 
 
-def test_event_ell_cpu(s, p, values_type, events_type):
+def test_event_ell_cpu(s, p, values_type, events_type, transpose):
   print('s: ', s, 'p: ', p)
   k = int(s * p)
   rng = bm.random.RandomState(seed=1234)
@@ -43,33 +44,33 @@ def test_event_ell_cpu(s, p, values_type, events_type):
 
   # groundtruth = bm.as_jax(vector, dtype=float) @ bm.as_jax(dense)
 
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # time.sleep(2)
 
   time0 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time1 = time.time()
   # time.sleep(2)
 
   time2 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time3 = time.time()
   # time.sleep(2)
 
   time4 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time5 = time.time()
   # time.sleep(2)
 
   time6 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time7 = time.time()
 
   time8 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time9 = time.time()
 
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
 #   print(result1[0])
 #   print(result2)
 #   print(groundtruth - result1[0])
@@ -81,26 +82,26 @@ def test_event_ell_cpu(s, p, values_type, events_type):
   # assert bm.allclose(result1[0], result2)
 
   time12 = time.time()
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time13 = time.time()
   # time.sleep(2)
 
   time14 = time.time()
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time15 = time.time()
   # time.sleep(2)
 
   time16 = time.time()
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time17 = time.time()
   # time.sleep(2)
 
   time18 = time.time()
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time19 = time.time()
 
   time20 = time.time()
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time21 = time.time()
 
   taichi_aot_time1 = (time1 - time0) * 1000
@@ -132,7 +133,7 @@ def test_event_ell_cpu(s, p, values_type, events_type):
   return taichi_aot_time1, taichi_aot_time2, taichi_aot_time3, taichi_aot_time4, taichi_aot_time5,\
       brainpy_time1, brainpy_time2, brainpy_time3, brainpy_time4, brainpy_time5, speedup
 
-def test_event_ell_gpu(s, p, values_type, events_type):
+def test_event_ell_gpu(s, p, values_type, events_type, transpose):
   print('s: ', s, 'p: ', p)
   k = int(s * p)
   bm.random.seed(1234)
@@ -155,33 +156,33 @@ def test_event_ell_gpu(s, p, values_type, events_type):
 
 
 
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # time.sleep(2)
 
   time0 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time1 = time.time()
   # time.sleep(2)
 
   time2 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time3 = time.time()
   # time.sleep(2)
 
   time4 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time5 = time.time()
   # time.sleep(2)
 
   time6 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time7 = time.time()
 
   time8 = time.time()
-  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result1 = jax.block_until_ready(bm.sparse.csrmv_taichi(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   time9 = time.time()
 
-  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=True))
+  result2 = jax.block_until_ready(bm.sparse.csrmv(weight, csr_indices, csr_indptr, vector, shape=(s, s), transpose=transpose))
   # print('--------------------result1[0]------------------')
   # print(result1[0])
   # print('--------------------result2------------------')
@@ -252,7 +253,7 @@ def test_event_ell_gpu(s, p, values_type, events_type):
 bm.set_platform('cpu')
 block_dim = 64
 # init dataframe
-df = pd.DataFrame(columns=['s', 'p', 'backend', 'values type', 'events type',
+df = pd.DataFrame(columns=['s', 'p', 'backend', 'values type', 'events type', 'transpose',
                            'taichi aot time1(ms)', 'taichi aot time2(ms)', 'taichi aot time3(ms)', 'taichi aot time4(ms)', 'taichi aot time5(ms)',
                            'brainpy time1(ms)', 'brainpy time2(ms)', 'brainpy time3(ms)', 'brainpy time4(ms)', 'brainpy time5(ms)',
                            'speedup'])
@@ -262,10 +263,11 @@ def test_event_ell_gpu(s, p, values_type, events_type):
       for _p in p:
           for _values_type in values_type:
              for _events_type in events_type:
-              taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
-                  brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_cpu(_s, _p, _values_type, _events_type)
+              for _transpose in transpose:
+                taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
+                    brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_cpu(_s, _p, _values_type, _events_type, _transpose)
               # append to dataframe
-              df.loc[df.shape[0]] = [_s, _p, 'cpu', _values_type, _events_type,
+                df.loc[df.shape[0]] = [_s, _p, 'cpu', _values_type, _events_type, _transpose,
                                     taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,
                                     brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup]
   df.to_csv('event_csrmv_cpu.csv', index=False)
@@ -275,10 +277,11 @@ def test_event_ell_gpu(s, p, values_type, events_type):
       for _p in p:
           for _values_type in values_type:
              for _events_type in events_type:
-              taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
-                  brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_gpu(_s, _p, _values_type, _events_type)
+              for _transpose in transpose:
+                taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,\
+                    brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup = test_event_ell_gpu(_s, _p, _values_type, _events_type, _transpose)
               # append to dataframe
-              df.loc[df.shape[0]] = [_s, _p, 'gpu', _values_type, _events_type,
+                df.loc[df.shape[0]] = [_s, _p, 'gpu', _values_type, _events_type, _transpose,
                                     taichi_aot_time_1, taichi_aot_time_2, taichi_aot_time_3, taichi_aot_time_4, taichi_aot_time_5,
                                     brainpy_time_1, brainpy_time_2, brainpy_time_3, brainpy_time_4, brainpy_time_5, speedup]
   df.to_csv('event_csrmv_gpu.csv', index=False)