diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 84aa028e3..d29b07ebc 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -11,6 +11,12 @@ on: branches: - '**' # matches every branch + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + + #on: # push: # branches: [ master ] @@ -27,6 +33,10 @@ jobs: python-version: [ "3.9", "3.10", "3.11"] steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # ratchet: styfle/cancel-workflow-action@0.12.1 + with: + access_token: ${{ github.token }} - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -35,10 +45,34 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi pip uninstall brainpy -y python setup.py install + - name: Test with pytest + run: | + cd brainpy + pytest _src/ + + test_linux_with_taichi_numba: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest taichi numba + if [ -f requirements-dev-raw.txt ]; then pip install -r requirements-dev-raw.txt; fi + pip uninstall brainpy -y + python setup.py install - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -51,40 +85,6 @@ jobs: pytest _src/ -# test_linux_py37: -# runs-on: ubuntu-latest -# strategy: -# fail-fast: false -# matrix: -# python-version: ["3.7"] -# -# steps: -# - uses: actions/checkout@v4 -# - name: Set up Python ${{ matrix.python-version }} -# uses: actions/setup-python@v5 -# with: -# python-version: ${{ matrix.python-version }} -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# python -m pip install flake8 pytest -# if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi -# pip install jax==0.3.25 -# pip install jaxlib==0.3.25 -# pip uninstall brainpy -y -# python setup.py install -# - name: Lint with flake8 -# run: | -# # stop the build if there are Python syntax errors or undefined names -# flake8 brainpy/ --count --select=E9,F63,F7,F82 --show-source --statistics -# # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide -# flake8 brainpy/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics -# - name: Test with pytest -# run: | -# cd examples -# pytest ../brainpy/ -# - test_macos: runs-on: macos-latest strategy: @@ -93,6 +93,10 @@ jobs: python-version: ["3.9", "3.10", "3.11"] steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # ratchet: styfle/cancel-workflow-action@0.12.1 + with: + access_token: ${{ github.token }} - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -101,10 +105,65 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi pip uninstall brainpy -y python setup.py install + - name: Test with pytest + run: | + cd brainpy + pytest -n auto --tb=short _src/ + + + test_windows: + strategy: + fail-fast: false + matrix: + os: [ win-2019-16core ] + arch: [ AMD64 ] + python-version: ["3.9", "3.10", "3.11"] + runs-on: ${{ matrix.os }} + + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # ratchet: styfle/cancel-workflow-action@0.12.1 + with: + access_token: ${{ github.token }} + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements-dev.txt + pip uninstall brainpy -y + python setup.py install + - name: Test with pytest + run: | + cd brainpy + pytest _src/ + + test_macos_with_taichi_numba: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest taichi numba + if [ -f requirements-dev-raw.txt ]; then pip install -r requirements-dev-raw.txt; fi + pip uninstall brainpy -y + python setup.py install - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -116,104 +175,3 @@ jobs: cd brainpy pytest _src/ -# test_macos_py37: -# runs-on: macos-latest -# strategy: -# fail-fast: false -# matrix: -# python-version: [ "3.7" ] -# -# steps: -# - uses: actions/checkout@v4 -# - name: Set up Python ${{ matrix.python-version }} -# uses: actions/setup-python@v5 -# with: -# python-version: ${{ matrix.python-version }} -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# python -m pip install flake8 pytest -# if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi -# pip install jax==0.3.25 -# pip install jaxlib==0.3.25 -# pip uninstall brainpy -y -# python setup.py install -# - name: Lint with flake8 -# run: | -# # stop the build if there are Python syntax errors or undefined names -# flake8 brainpy/ --count --select=E9,F63,F7,F82 --show-source --statistics -# # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide -# flake8 brainpy/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics -# - name: Test with pytest -# run: | -# cd examples -# pytest ../brainpy/ -# - - -# test_windows: -# runs-on: windows-latest -# strategy: -# fail-fast: false -# matrix: -# python-version: ["3.9", "3.10", "3.11"] -# -# steps: -# - uses: actions/checkout@v4 -# - name: Set up Python ${{ matrix.python-version }} -# uses: actions/setup-python@v5 -# with: -# python-version: ${{ matrix.python-version }} -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# python -m pip install flake8 pytest -# python -m pip install -r requirements-dev.txt -# pip uninstall brainpy -y -# python setup.py install -# - name: Lint with flake8 -# run: | -# # stop the build if there are Python syntax errors or undefined names -# flake8 brainpy/ --count --select=E9,F63,F7,F82 --show-source --statistics -# # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide -# flake8 brainpy/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics -# - name: Test with pytest -# run: | -# cd brainpy -# pytest _src/ - - -# test_windows_py37: -# runs-on: windows-latest -# strategy: -# fail-fast: false -# matrix: -# python-version: ["3.7"] -# -# steps: -# - uses: actions/checkout@v4 -# - name: Set up Python ${{ matrix.python-version }} -# uses: actions/setup-python@v5 -# with: -# python-version: ${{ matrix.python-version }} -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# python -m pip install flake8 pytest -# python -m pip install numpy>=1.21.0 -# python -m pip install "jaxlib==0.3.25" -f https://whls.blob.core.windows.net/unstable/index.html --use-deprecated legacy-resolver -# python -m pip install https://github.com/google/jax/archive/refs/tags/jax-v0.3.25.tar.gz -# python -m pip install -r requirements-dev.txt -# python -m pip install tqdm brainpylib -# pip uninstall brainpy -y -# python setup.py install -# - name: Lint with flake8 -# run: | -# # stop the build if there are Python syntax errors or undefined names -# flake8 brainpy/ --count --select=E9,F63,F7,F82 --show-source --statistics -# # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide -# flake8 brainpy/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics -# - name: Test with pytest -# run: | -# cd examples -# pytest ../brainpy/ diff --git a/README.md b/README.md index 6d2ee4bf4..a7fe0b721 100644 --- a/README.md +++ b/README.md @@ -25,29 +25,7 @@ BrainPy is a flexible, efficient, and extensible framework for computational neu ## Installation -BrainPy is based on Python (>=3.8) and can be installed on Linux (Ubuntu 16.04 or later), macOS (10.12 or later), and Windows platforms. Install the latest version of BrainPy: - -```bash -$ pip install brainpy -U -``` - -In addition, many customized operators in BrainPy are implemented in ``brainpylib``. -Install the latest version of `brainpylib` by: - -```bash -# CPU installation for Linux, macOS and Windows -$ pip install --upgrade brainpylib -``` - -```bash -# CUDA 12 installation for Linux only -$ pip install --upgrade brainpylib-cu12x -``` - -```bash -# CUDA 11 installation for Linux only -$ pip install --upgrade brainpylib-cu11x -``` +BrainPy is based on Python (>=3.8) and can be installed on Linux (Ubuntu 16.04 or later), macOS (10.12 or later), and Windows platforms. 
For detailed installation instructions, please refer to the documentation: [Quickstart/Installation](https://brainpy.readthedocs.io/en/latest/quickstart/installation.html) diff --git a/brainpy/_src/connect/random_conn.py b/brainpy/_src/connect/random_conn.py index 1f5b1db6d..0e4ee769c 100644 --- a/brainpy/_src/connect/random_conn.py +++ b/brainpy/_src/connect/random_conn.py @@ -1,1372 +1,1245 @@ -# -*- coding: utf-8 -*- -from functools import partial -from typing import Optional - -from jax import vmap, jit, numpy as jnp -import numpy as np -from numba import njit - -import brainpy.math as bm -from brainpy.errors import ConnectorError -from brainpy.tools import numba_seed, numba_jit, numba_range, format_seed -from brainpy._src.tools.package import SUPPORT_NUMBA -from .base import * - -__all__ = [ - 'FixedProb', - 'FixedPreNum', - 'FixedPostNum', - 'FixedTotalNum', - 'GaussianProb', - 'ProbDist', - - 'SmallWorld', - 'ScaleFreeBA', - 'ScaleFreeBADual', - 'PowerLaw', -] - - -class FixedProb(TwoEndConnector): - """Connect the post-synaptic neurons with fixed probability. - - Parameters - ---------- - prob: float - The conn probability. - pre_ratio: float - The ratio of pre-synaptic neurons to connect. - include_self : bool - Whether create (i, i) conn? - allow_multi_conn: bool - Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? - - .. versionadded:: 2.2.3.2 - - seed : optional, int - Seed the random generator. - """ - - def __init__(self, - prob, - pre_ratio=1., - include_self=True, - allow_multi_conn=False, - seed=None, - **kwargs): - super(FixedProb, self).__init__(**kwargs) - assert 0. <= prob <= 1. - assert 0. <= pre_ratio <= 1. - self.prob = prob - self.pre_ratio = pre_ratio - self.include_self = include_self - self.seed = format_seed(seed) - self.allow_multi_conn = allow_multi_conn - self._jaxrand = bm.random.default_rng(self.seed) - self._nprand = np.random.RandomState(self.seed) - - def __repr__(self): - return (f'{self.__class__.__name__}(prob={self.prob}, pre_ratio={self.pre_ratio}, ' - f'include_self={self.include_self}, allow_multi_conn={self.allow_multi_conn}, ' - f'seed={self.seed})') - - def _iii(self): - if (not self.include_self) and (self.pre_num != self.post_num): - raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
' - f'But `include_self` is set to True.') - - if self.pre_ratio < 1.: - pre_num_to_select = int(self.pre_num * self.pre_ratio) - pre_ids = self._jaxrand.choice(self.pre_num, size=(pre_num_to_select,), replace=False) - else: - pre_num_to_select = self.pre_num - pre_ids = jnp.arange(self.pre_num) - - post_num_total = self.post_num - post_num_to_select = int(self.post_num * self.prob) - - if self.allow_multi_conn: - selected_post_ids = self._jaxrand.randint(0, post_num_total, (pre_num_to_select, post_num_to_select)) - - else: - if SUPPORT_NUMBA: - rng = np.random - numba_seed(self._nprand.randint(0, int(1e8))) - else: - rng = self._nprand - - @numba_jit # (parallel=True, nogil=True) - def single_conn(): - posts = np.zeros((pre_num_to_select, post_num_to_select), dtype=IDX_DTYPE) - for i in numba_range(pre_num_to_select): - posts[i] = rng.choice(post_num_total, post_num_to_select, replace=False) - return posts - - selected_post_ids = jnp.asarray(single_conn()) - return pre_num_to_select, post_num_to_select, bm.as_jax(selected_post_ids), bm.as_jax(pre_ids) - - def build_coo(self): - _, post_num_to_select, selected_post_ids, pre_ids = self._iii() - selected_post_ids = selected_post_ids.flatten() - selected_pre_ids = jnp.repeat(pre_ids, post_num_to_select) - if not self.include_self: - true_ids = selected_pre_ids != selected_post_ids - selected_pre_ids = selected_pre_ids[true_ids] - selected_post_ids = selected_post_ids[true_ids] - return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) - - def build_csr(self): - pre_num_to_select, post_num_to_select, selected_post_ids, pre_ids = self._iii() - pre_nums = jnp.ones(pre_num_to_select) * post_num_to_select - if not self.include_self: - true_ids = selected_post_ids == jnp.reshape(pre_ids, (-1, 1)) - pre_nums -= jnp.sum(true_ids, axis=1) - selected_post_ids = selected_post_ids.flatten()[jnp.logical_not(true_ids).flatten()] - else: - selected_post_ids = selected_post_ids.flatten() - selected_pre_inptr = jnp.cumsum(jnp.concatenate([jnp.zeros(1), pre_nums])) - return selected_post_ids.astype(get_idx_type()), selected_pre_inptr.astype(get_idx_type()) - - def build_mat(self): - if self.pre_ratio < 1.: - pre_state = self._jaxrand.uniform(size=(self.pre_num, 1)) < self.pre_ratio - mat = (self._jaxrand.uniform(size=(self.pre_num, self.post_num)) < self.prob) * pre_state - else: - mat = (self._jaxrand.uniform(size=(self.pre_num, self.post_num)) < self.prob) - mat = bm.asarray(mat) - if not self.include_self: - bm.fill_diagonal(mat, False) - return mat.astype(MAT_DTYPE) - - -class FixedTotalNum(TwoEndConnector): - """Connect the synaptic neurons with fixed total number. - - Parameters - ---------- - num : float,int - The conn total number. - allow_multi_conn : bool, optional - Whether allow one pre-synaptic neuron connects to multiple post-synaptic neurons. - seed: int, optional - The random number seed. - """ - - def __init__(self, - num, - allow_multi_conn=False, - seed=None, **kwargs): - super().__init__(**kwargs) - if isinstance(num, int): - assert num >= 0, '"num" must be a non-negative integer.' - elif isinstance(num, float): - assert 0. <= num <= 1., '"num" must be in [0., 1.).' 
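
To make the dense `build_mat` path of `FixedProb` above concrete, here is a minimal NumPy sketch of the same rule. The function name and the standalone-NumPy framing are illustrative only, not part of this diff:

```python
import numpy as np

def fixed_prob_mat(pre_num, post_num, prob, include_self=True, seed=0):
    # Bernoulli(prob) mask over the pre x post matrix, as in FixedProb.build_mat.
    rng = np.random.RandomState(seed)
    mat = rng.uniform(size=(pre_num, post_num)) < prob
    if not include_self:
        np.fill_diagonal(mat, False)  # drop the (i, i) connections
    return mat

conn = fixed_prob_mat(200, 200, prob=0.1, include_self=False)
print(conn.mean())  # empirical density, close to prob
```
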
- else: - raise ConnectorError(f'Unknown type: {type(num)}') - self.num = num - self.seed = format_seed(seed) - self.allow_multi_conn = allow_multi_conn - self.rng = bm.random.RandomState(self.seed) - - def build_coo(self): - mat_element_num = self.pre_num * self.post_num - if self.num > mat_element_num: - raise ConnectorError(f'"num" must be smaller than "all2all num", ' - f'but got {self.num} > {mat_element_num}') - if self.allow_multi_conn: - selected_pre_ids = self.rng.randint(0, self.pre_num, (self.num,)) - selected_post_ids = self.rng.randint(0, self.post_num, (self.num,)) - else: - index = self.rng.choice(mat_element_num, size=(self.num,), replace=False) - selected_pre_ids = index // self.post_num - selected_post_ids = index % self.post_num - return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) - - def __repr__(self): - return f'{self.__class__.__name__}(num={self.num}, seed={self.seed})' - - -class FixedNum(TwoEndConnector): - def __init__(self, - num, - include_self=True, - allow_multi_conn=False, - seed=None, - **kwargs): - super(FixedNum, self).__init__(**kwargs) - if isinstance(num, int): - assert num >= 0, '"num" must be a non-negative integer.' - elif isinstance(num, float): - assert 0. <= num <= 1., '"num" must be in [0., 1.).' - else: - raise ConnectorError(f'Unknown type: {type(num)}') - self.num = num - self.seed = format_seed(seed) - self.include_self = include_self - self.allow_multi_conn = allow_multi_conn - self.rng = bm.random.RandomState(self.seed) if allow_multi_conn else np.random.RandomState(self.seed) - - def __repr__(self): - return f'{self.__class__.__name__}(num={self.num}, include_self={self.include_self}, seed={self.seed})' - - -class FixedPreNum(FixedNum): - """Connect a fixed number pf pre-synaptic neurons for each post-synaptic neuron. - - Parameters - ---------- - num : float, int - The conn probability (if "num" is float) or the fixed number of - connectivity (if "num" is int). - include_self : bool - Whether create (i, i) conn ? - seed : None, int - Seed the random generator. - allow_multi_conn: bool - Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? - - .. versionadded:: 2.2.3.2 - - """ - - def build_coo(self): - if isinstance(self.num, int) and self.num > self.pre_num: - raise ConnectorError(f'"num" must be smaller than "pre_num", ' - f'but got {self.num} > {self.pre_num}') - if (not self.include_self) and (self.pre_num != self.post_num): - raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
' - f'But `include_self` is set to True.') - pre_num_to_select = int(self.pre_num * self.num) if isinstance(self.num, float) else self.num - pre_num_total = self.pre_num - post_num_total = self.post_num - - if self.allow_multi_conn: - selected_pre_ids = self.rng.randint(0, pre_num_total, (post_num_total, pre_num_to_select,)) - - else: - if SUPPORT_NUMBA: - rng = np.random - numba_seed(self.rng.randint(0, int(1e8))) - else: - rng = self.rng - - @numba_jit # (parallel=True, nogil=True) - def single_conn(): - posts = np.zeros((post_num_total, pre_num_to_select), dtype=IDX_DTYPE) - for i in numba_range(post_num_total): - posts[i] = rng.choice(pre_num_total, pre_num_to_select, replace=False) - return posts - - selected_pre_ids = jnp.asarray(single_conn()) - - post_nums = jnp.ones((post_num_total,), dtype=get_idx_type()) * pre_num_to_select - if not self.include_self: - true_ids = selected_pre_ids == jnp.reshape(jnp.arange(pre_num_total), (-1, 1)) - post_nums -= jnp.sum(true_ids, axis=1) - selected_pre_ids = selected_pre_ids.flatten()[jnp.logical_not(true_ids).flatten()] - else: - selected_pre_ids = selected_pre_ids.flatten() - selected_post_ids = jnp.repeat(jnp.arange(post_num_total), post_nums) - return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) - - -class FixedPostNum(FixedNum): - """Connect the fixed number of post-synaptic neurons for each pre-synaptic neuron. - - Parameters - ---------- - num : float, int - The conn probability (if "num" is float) or the fixed number of - connectivity (if "num" is int). - include_self : bool - Whether create (i, i) conn ? - seed : None, int - Seed the random generator. - allow_multi_conn: bool - Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? - - .. versionadded:: 2.2.3.2 - - """ - - def _ii(self): - if isinstance(self.num, int) and self.num > self.post_num: - raise ConnectorError(f'"num" must be smaller than "post_num", ' - f'but got {self.num} > {self.post_num}') - if (not self.include_self) and (self.pre_num != self.post_num): - raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
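
The numba-jitted `single_conn` helpers in `FixedPreNum`/`FixedPostNum` above draw, for every target neuron, a fixed number of distinct partners. A small NumPy sketch of that per-row sampling (illustrative names, no numba acceleration):

```python
import numpy as np

def sample_fixed_num(n_rows, pool_size, k, seed=0):
    # One row per neuron: k distinct partner ids drawn from range(pool_size),
    # mirroring rng.choice(pool_size, k, replace=False) inside single_conn().
    rng = np.random.RandomState(seed)
    return np.stack([rng.choice(pool_size, k, replace=False) for _ in range(n_rows)])

print(sample_fixed_num(n_rows=4, pool_size=10, k=3))  # 4 rows of 3 unique ids
```
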
' - f'But `include_self` is set to True.') - post_num_to_select = int(self.post_num * self.num) if isinstance(self.num, float) else self.num - pre_num_to_select = self.pre_num - pre_ids = jnp.arange(self.pre_num) - post_num_total = self.post_num - - if self.allow_multi_conn: - selected_post_ids = self.rng.randint(0, post_num_total, (pre_num_to_select, post_num_to_select,)) - - else: - if SUPPORT_NUMBA: - rng = np.random - numba_seed(self.rng.randint(0, int(1e8))) - else: - rng = self.rng - - @numba_jit # (parallel=True, nogil=True) - def single_conn(): - posts = np.zeros((pre_num_to_select, post_num_to_select), dtype=IDX_DTYPE) - for i in numba_range(pre_num_to_select): - posts[i] = rng.choice(post_num_total, post_num_to_select, replace=False) - return posts - - selected_post_ids = jnp.asarray(single_conn()) - return pre_num_to_select, post_num_to_select, bm.as_jax(selected_post_ids), bm.as_jax(pre_ids) - - def build_coo(self): - _, post_num_to_select, selected_post_ids, pre_ids = self._ii() - selected_post_ids = selected_post_ids.flatten() - selected_pre_ids = jnp.repeat(pre_ids, post_num_to_select) - if not self.include_self: - true_ids = selected_pre_ids != selected_post_ids - selected_pre_ids = selected_pre_ids[true_ids] - selected_post_ids = selected_post_ids[true_ids] - return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) - - def build_csr(self): - pre_num_to_select, post_num_to_select, selected_post_ids, pre_ids = self._ii() - pre_nums = jnp.ones(pre_num_to_select) * post_num_to_select - if not self.include_self: - true_ids = selected_post_ids == jnp.reshape(pre_ids, (-1, 1)) - pre_nums -= jnp.sum(true_ids, axis=1) - selected_post_ids = selected_post_ids.flatten()[jnp.logical_not(true_ids).flatten()] - else: - selected_post_ids = selected_post_ids.flatten() - selected_pre_inptr = jnp.cumsum(jnp.concatenate([jnp.zeros(1), pre_nums])) - return selected_post_ids.astype(get_idx_type()), selected_pre_inptr.astype(get_idx_type()) - -@jit -@partial(vmap, in_axes=(0, None, None)) -def gaussian_prob_dist_cal1(i_value, post_values, sigma): - dists = jnp.abs(i_value - post_values) - exp_dists = jnp.exp(-(jnp.sqrt(jnp.sum(dists ** 2, axis=0)) / sigma) ** 2 / 2) - return bm.asarray(exp_dists) - -@jit -@partial(vmap, in_axes=(0, None, None, None)) -def gaussian_prob_dist_cal2(i_value, post_values, value_sizes, sigma): - dists = jnp.abs(i_value - post_values) - dists = jnp.where(dists > (value_sizes / 2), value_sizes - dists, dists) - exp_dists = jnp.exp(-(jnp.sqrt(jnp.sum(dists ** 2, axis=0)) / sigma) ** 2 / 2) - return bm.asarray(exp_dists) - - -class GaussianProb(OneEndConnector): - r"""Builds a Gaussian connectivity pattern within a population of neurons, - where the connection probability decay according to the gaussian function. - - Specifically, for any pair of neurons :math:`(i, j)`, - - .. math:: - - p(i, j)=\exp(-\frac{\sum_{k=1}^n |v_k^i - v_k^j|^2 }{2\sigma^2}) - - where :math:`v_k^i` is the :math:`i`-th neuron's encoded value at dimension :math:`k`. - - Parameters - ---------- - sigma : float - Width of the Gaussian function. - encoding_values : optional, list, tuple, int, float - The value ranges to encode for neurons at each axis. - - - If `values` is not provided, the neuron only encodes each positional - information, i.e., :math:`(i, j, k, ...)`, where :math:`i, j, k` is - the index in the high-dimensional space. - - If `values` is a single tuple/list of int/float, neurons at each dimension - will encode the same range of values. 
For example, ``values=(0, np.pi)``, - neurons at each dimension will encode a continuous value space ``[0, np.pi]``. - - If `values` is a tuple/list of list/tuple, it means the value space will be - different for each dimension. For example, ``values=((-np.pi, np.pi), (10, 20), (0, 2 * np.pi))``. - - periodic_boundary : bool - Whether the neuron encode the value space with the periodic boundary. - normalize : bool - Whether normalize the connection probability . - include_self : bool - Whether create the connection at the same position. - seed : int - The random seed. - """ - - def __init__( - self, - sigma: float, - encoding_values: Optional[np.ndarray] = None, - normalize: bool = True, - include_self: bool = True, - periodic_boundary: bool = False, - seed: int = None, - **kwargs - ): - super(GaussianProb, self).__init__(**kwargs) - self.sigma = sigma - self.encoding_values = encoding_values - self.normalize = normalize - self.include_self = include_self - self.periodic_boundary = periodic_boundary - self.seed = format_seed(seed) - self.rng = np.random.RandomState(self.seed) - - def __repr__(self): - return (f'{self.__class__.__name__}(sigma={self.sigma}, ' - f'normalize={self.normalize}, ' - f'periodic_boundary={self.periodic_boundary}, ' - f'include_self={self.include_self}, ' - f'seed={self.seed})') - - def build_mat(self, isOptimized=True): - self.rng = np.random.RandomState(self.seed) - # value range to encode - if self.encoding_values is None: - value_ranges = tuple([(0, s) for s in self.pre_size]) - elif isinstance(self.encoding_values, (tuple, list)): - if len(self.encoding_values) == 0: - raise ConnectorError(f'encoding_values has a length of 0.') - elif isinstance(self.encoding_values[0], (int, float)): - assert len(self.encoding_values) == 2 - assert self.encoding_values[0] < self.encoding_values[1] - value_ranges = tuple([self.encoding_values for _ in self.pre_size]) - elif isinstance(self.encoding_values[0], (tuple, list)): - if len(self.encoding_values) != len(self.pre_size): - raise ConnectorError(f'The network size has {len(self.pre_size)} dimensions, while ' - f'the encoded values provided only has {len(self.encoding_values)}-D. 
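
As a quick check of the kernel from the `GaussianProb` docstring above, p(i, j) = exp(-sum_k |v_k^i - v_k^j|^2 / (2 * sigma^2)), here is a two-line NumPy version (illustrative, without the periodic-boundary folding):

```python
import numpy as np

def gaussian_prob(v_i, v_j, sigma):
    # p(i, j) = exp(-sum_k |v_k^i - v_k^j|**2 / (2 * sigma**2))
    d2 = np.sum((np.asarray(v_i) - np.asarray(v_j)) ** 2)
    return np.exp(-d2 / (2 * sigma ** 2))

print(gaussian_prob([0., 0.], [1., 1.], sigma=2.))  # exp(-0.25) ~= 0.7788
```
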
' - f'Error in {str(self)}.') - for v in self.encoding_values: - assert isinstance(v[0], (int, float)) - assert len(v) == 2 - value_ranges = tuple(self.encoding_values) - else: - raise ConnectorError(f'Unsupported encoding values: {self.encoding_values}') - else: - raise ConnectorError(f'Unsupported encoding values: {self.encoding_values}') - - # values - values = [np.linspace(vs[0], vs[1], n + 1)[:n] for vs, n in zip(value_ranges, self.pre_size)] - # post_values = np.stack([v.flatten() for v in np.meshgrid(*values, indexing='ij')]) - post_values = np.stack([v.flatten() for v in np.meshgrid(*values)]) - value_sizes = np.array([v[1] - v[0] for v in value_ranges]) - if value_sizes.ndim < post_values.ndim: - value_sizes = np.expand_dims(value_sizes, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) - - # probability of connections - if isOptimized: - i_value_list = np.zeros(shape=(self.pre_num, len(self.pre_size), 1)) - for i in range(self.pre_num): - list_index = i - # values for node i - i_coordinate = tuple() - for s in self.pre_size[:-1]: - i, pos = divmod(i, s) - i_coordinate += (pos,) - i_coordinate += (i,) - i_value = np.array([values[i][c] for i, c in enumerate(i_coordinate)]) - if i_value.ndim < post_values.ndim: - i_value = np.expand_dims(i_value, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) - i_value_list[list_index] = i_value - - if self.periodic_boundary: - prob_mat = gaussian_prob_dist_cal2(i_value_list, post_values, value_sizes, self.sigma) - else: - prob_mat = gaussian_prob_dist_cal1(i_value_list, post_values, self.sigma) - else: - prob_mat = [] - for i in range(self.pre_num): - # values for node i - i_coordinate = tuple() - for s in self.pre_size[:-1]: - i, pos = divmod(i, s) - i_coordinate += (pos,) - i_coordinate += (i,) - i_value = np.array([values[i][c] for i, c in enumerate(i_coordinate)]) - if i_value.ndim < post_values.ndim: - i_value = np.expand_dims(i_value, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) - # distances - dists = np.abs(i_value - post_values) - if self.periodic_boundary: - dists = np.where(dists > value_sizes / 2, value_sizes - dists, dists) - exp_dists = np.exp(-(np.linalg.norm(dists, axis=0) / self.sigma) ** 2 / 2) - prob_mat.append(exp_dists) - prob_mat = np.stack(prob_mat) - - if self.normalize: - prob_mat /= prob_mat.max() - - # connectivity - conn_mat = np.asarray(prob_mat) >= self.rng.random(prob_mat.shape) - if not self.include_self: - np.fill_diagonal(conn_mat, False) - return conn_mat - - -class SmallWorld(TwoEndConnector): - """Build a Watts–Strogatz small-world graph. - - Parameters - ---------- - num_neighbor : int - Each node is joined with its `k` nearest neighbors in a ring - topology. - prob : float - The probability of rewiring each edge - directed : bool - Whether the graph is a directed graph. - include_self : bool - Whether include the node self. - - Notes - ----- - First create a ring over :math:`num\_node` nodes [1]_. Then each node in the ring is - joined to its :math:`num\_neighbor` nearest neighbors (or :math:`num\_neighbor - 1` neighbors - if :math:`num\_neighbor` is odd). Then shortcuts are created by replacing some edges as - follows: for each edge :math:`(u, v)` in the underlying ":math:`num\_node`-ring with - :math:`num\_neighbor` nearest neighbors" with probability :math:`prob` replace it with a new - edge :math:`(u, w)` with uniformly random choice of existing node :math:`w`. - - References - ---------- - .. [1] Duncan J. Watts and Steven H. 
Strogatz, - Collective dynamics of small-world networks, - Nature, 393, pp. 440--442, 1998. - """ - - def __init__( - self, - num_neighbor, - prob, - directed=False, - include_self=False, - seed=None, - **kwargs - ): - super(SmallWorld, self).__init__(**kwargs) - self.prob = prob - self.directed = directed - self.num_neighbor = num_neighbor - self.include_self = include_self - - self.seed = format_seed(seed) - self.rng = np.random.RandomState(seed=self.seed) - rng = np.random if SUPPORT_NUMBA else self.rng - - def _smallworld_rewire(i, all_j): - if rng.random(1) < prob: - non_connected = np.where(np.logical_not(all_j))[0] - if len(non_connected) <= 1: - return -1 - # Enforce no self-loops or multiple edges - w = rng.choice(non_connected) - while (not include_self) and w == i: - # non_connected.remove(w) - w = rng.choice(non_connected) - return w - else: - return -1 - - self._connect = numba_jit(_smallworld_rewire) - - def __repr__(self): - return (f'{self.__class__.__name__}(prob={self.prob}, ' - f'directed={self.directed}, ' - f'num_neighbor={self.num_neighbor}, ' - f'include_self={self.include_self}, ' - f'seed={self.seed})') - - def build_conn(self): - assert self.pre_size == self.post_size - - # seed - self.seed = self.rng.randint(1, int(1e7)) - numba_seed(self.seed) - - if isinstance(self.pre_size, int) or (isinstance(self.pre_size, (tuple, list)) and len(self.pre_size) == 1): - num_node = self.pre_num - - if self.num_neighbor > num_node: - raise ConnectorError("num_neighbor > num_node, choose smaller num_neighbor or larger num_node") - # If k == n, the graph is complete not Watts-Strogatz - if self.num_neighbor == num_node: - conn = np.ones((num_node, num_node), dtype=MAT_DTYPE) - else: - conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) - nodes = np.array(list(range(num_node))) # nodes are labeled 0 to n-1 - # connect each node to k/2 neighbors - for j in range(1, self.num_neighbor // 2 + 1): - targets = np.concatenate([nodes[j:], nodes[0:j]]) # first j nodes are now last in list - conn[nodes, targets] = True - conn[targets, nodes] = True - - # rewire edges from each node - # loop over all nodes in order (label) and neighbors in order (distance) - # no self loops or multiple edges allowed - for j in range(1, self.num_neighbor // 2 + 1): # outer loop is neighbors - targets = np.concatenate([nodes[j:], nodes[0:j]]) # first j nodes are now last in list - if self.directed: - # inner loop in node order - for u, v in zip(nodes, targets): - w = self._connect(prob=self.prob, i=u, all_j=conn[u]) - if w != -1: - conn[u, v] = False - conn[u, w] = True - w = self._connect(prob=self.prob, i=u, all_j=conn[:, u]) - if w != -1: - conn[v, u] = False - conn[w, u] = True - else: - # inner loop in node order - for u, v in zip(nodes, targets): - w = self._connect(i=u, all_j=conn[u]) - if w != -1: - conn[u, v] = False - conn[v, u] = False - conn[u, w] = True - conn[w, u] = True - # conn = np.asarray(conn, dtype=MAT_DTYPE) - else: - raise ConnectorError('Currently only support 1D ring connection.') - - return 'mat', conn - - -# def _random_subset(seq, m, rng): -# """Return m unique elements from seq. -# -# This differs from random.sample which can return repeated -# elements if seq holds repeated elements. -# -# Note: rng is a random.Random or numpy.random.RandomState instance. 
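
For readers unfamiliar with the rewiring scheme that `SmallWorld.build_conn` implements, here is a compact undirected sketch under the same rules (ring of the k nearest neighbours, each lattice edge rewired with probability p). This is a simplified stand-in, not the numba-backed implementation above:

```python
import numpy as np

def watts_strogatz(n, k, p, seed=0):
    rng = np.random.RandomState(seed)
    conn = np.zeros((n, n), dtype=bool)
    nodes = np.arange(n)
    for j in range(1, k // 2 + 1):       # ring lattice: k nearest neighbours
        targets = np.roll(nodes, -j)
        conn[nodes, targets] = conn[targets, nodes] = True
    for j in range(1, k // 2 + 1):       # rewire each lattice edge with prob p
        for u, v in zip(nodes, np.roll(nodes, -j)):
            if rng.random() < p:
                w = rng.randint(n)
                if w != u and not conn[u, w]:  # no self-loops or duplicate edges
                    conn[u, v] = conn[v, u] = False
                    conn[u, w] = conn[w, u] = True
    return conn

print(watts_strogatz(20, 4, 0.1).sum(axis=1))  # degrees stay close to k
```
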
-# """ -# targets = set() -# while len(targets) < m: -# x = rng.choice(seq) -# targets.add(x) -# return targets - - -class ScaleFreeBA(TwoEndConnector): - """Build a random graph according to the Barabási–Albert preferential - attachment model. - - A graph of :math:`num\_node` nodes is grown by attaching new nodes each with - :math:`m` edges that are preferentially attached to existing nodes - with high degree. - - Parameters - ---------- - m : int - Number of edges to attach from a new node to existing nodes - seed : integer, random_state, or None (default) - Indicator of random number generation state. - - Raises - ------ - ConnectorError - If `m` does not satisfy ``1 <= m < n``. - - References - ---------- - .. [1] A. L. Barabási and R. Albert "Emergence of scaling in - random networks", Science 286, pp 509-512, 1999. - """ - - def __init__(self, m, directed=False, seed=None, **kwargs): - super(ScaleFreeBA, self).__init__(**kwargs) - self.m = m - self.directed = directed - self.seed = format_seed(seed) - self.rng = np.random.RandomState(self.seed) - rng = np.random if SUPPORT_NUMBA else self.rng - - def _random_subset(seq, m): - targets = set() - while len(targets) < m: - x = rng.choice(seq) - targets.add(x) - return targets - - self._connect = numba_jit(_random_subset) - - def __repr__(self): - return (f'{self.__class__.__name__}(m={self.m}, ' - f'directed={self.directed}, ' - f'seed={self.seed})') - - def build_mat(self, isOptimized=True): - assert self.pre_num == self.post_num - - # seed - self.rng = np.random.RandomState(self.seed) - numba_seed(self.seed) - - num_node = self.pre_num - if self.m < 1 or self.m >= num_node: - raise ConnectorError(f"Barabási–Albert network must have m >= 1 and " - f"m < n, while m = {self.m} and n = {num_node}") - - # Add m initial nodes (m0 in barabasi-speak) - conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) - # Target nodes for new edges - targets = list(range(self.m)) - # List of existing nodes, with nodes repeated once for each adjacent edge - - if not isOptimized: - repeated_nodes = [] - # Start adding the other n-m nodes. The first node is m. - source = self.m - while source < num_node: - # Add edges to m nodes from the source. - origins = [source] * self.m - conn[origins, targets] = True - if not self.directed: - conn[targets, origins] = True - # Add one node to the list for each new edge just created. - repeated_nodes.extend(targets) - # And the new node "source" has m edges to add to the list. - repeated_nodes.extend([source] * self.m) - # Now choose m unique nodes from the existing nodes - # Pick uniformly from repeated_nodes (preferential attachment) - targets = list(self._connect(np.asarray(repeated_nodes), self.m)) - source += 1 - return conn - - # List of existing nodes, with nodes repeated once for each adjacent edge - # Preallocate repeated_nodes as a numpy array - repeated_nodes = np.empty(2 * num_node * self.m, dtype=int) - size_repeated_nodes = 0 - # Start adding the other n-m nodes. The first node is m. - source = self.m - while source < num_node: - # Add edges to m nodes from the source. - origins = [source] * self.m - conn[origins, targets] = True - if not self.directed: - conn[targets, origins] = True - # Add one node to the list for each new edge just created. - repeated_nodes[size_repeated_nodes:size_repeated_nodes + self.m] = targets - size_repeated_nodes += self.m - # And the new node "source" has m edges to add to the list. 
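
The growth loop of `ScaleFreeBA.build_mat` above maintains `repeated_nodes`, in which each node appears once per adjacent edge, so sampling uniformly from it is degree-proportional (preferential) attachment. A minimal list-based sketch of the same loop (illustrative, without the preallocated array of the optimized branch):

```python
import numpy as np

def barabasi_albert(n, m, seed=0):
    rng = np.random.RandomState(seed)
    conn = np.zeros((n, n), dtype=bool)
    targets, repeated = list(range(m)), []
    for source in range(m, n):
        conn[source, targets] = conn[targets, source] = True
        repeated.extend(targets)        # one entry per new edge endpoint
        repeated.extend([source] * m)   # the new node also gains m entries
        picks = set()
        while len(picks) < m:           # m unique nodes, as in _random_subset
            picks.add(int(rng.choice(repeated)))
        targets = list(picks)
    return conn

deg = barabasi_albert(300, 2).sum(axis=1)
print(deg.mean(), deg.max())  # mean near 2*m, with a heavy-tailed maximum
```
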
- repeated_nodes[size_repeated_nodes:size_repeated_nodes + self.m] = source - size_repeated_nodes += self.m - # Now choose m unique nodes from the existing nodes - # Pick uniformly from repeated_nodes (preferential attachment) - targets = list(self._connect(repeated_nodes[:size_repeated_nodes], self.m)) - source += 1 - - return conn - - -class ScaleFreeBADual(TwoEndConnector): - r"""Build a random graph according to the dual Barabási–Albert preferential - attachment model. - - A graph of :math::`num\_node` nodes is grown by attaching new nodes each with either $m_1$ - edges (with probability :math:`p`) or :math:`m_2` edges (with probability :math:`1-p`) that - are preferentially attached to existing nodes with high degree. - - Parameters - ---------- - m1 : int - Number of edges to attach from a new node to existing nodes with probability :math:`p` - m2 : int - Number of edges to attach from a new node to existing nodes with probability :math:`1-p` - p : float - The probability of attaching :math:`m\_1` edges (as opposed to :math:`m\_2` edges) - seed : integer, random_state, or None (default) - Indicator of random number generation state. - - Raises - ------ - ConnectorError - If `m1` and `m2` do not satisfy ``1 <= m1,m2 < n`` or `p` does not satisfy ``0 <= p <= 1``. - - References - ---------- - .. [1] N. Moshiri "The dual-Barabasi-Albert model", arXiv:1810.10538. - """ - - def __init__(self, m1, m2, p, directed=False, seed=None, **kwargs): - super(ScaleFreeBADual, self).__init__(**kwargs) - self.m1 = m1 - self.m2 = m2 - self.p = p - self.directed = directed - self.seed = format_seed(seed) - self.rng = np.random.RandomState(self.seed) - rng = np.random if SUPPORT_NUMBA else self.rng - - def _random_subset(seq, m): - targets = set() - while len(targets) < m: - x = rng.choice(seq) - targets.add(x) - return targets - - self._connect = numba_jit(_random_subset) - - def __repr__(self): - return (f'{self.__class__.__name__}(m1={self.m1}, m2={self.m2}, ' - f'p={self.p}, directed={self.directed}, seed={self.seed})') - - def build_mat(self, isOptimized=True): - assert self.pre_num == self.post_num - # seed - self.rng = np.random.RandomState(self.seed) - numba_seed(self.seed) - - num_node = self.pre_num - if self.m1 < 1 or self.m1 >= num_node: - raise ConnectorError(f"Dual Barabási–Albert network must have m1 >= 1 and m1 < num_node, " - f"while m1 = {self.m1} and num_node = {num_node}.") - if self.m2 < 1 or self.m2 >= num_node: - raise ConnectorError(f"Dual Barabási–Albert network must have m2 >= 1 and m2 < num_node, " - f"while m2 = {self.m2} and num_node = {num_node}.") - if self.p < 0 or self.p > 1: - raise ConnectorError(f"Dual Barabási–Albert network must have 0 <= p <= 1, while p = {self.p}") - - # Add max(m1,m2) initial nodes (m0 in barabasi-speak) - conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) - - if not isOptimized: - # List of existing nodes, with nodes repeated once for each adjacent edge - repeated_nodes = [] - # Start adding the remaining nodes. - source = max(self.m1, self.m2) - # Pick which m to use first time (m1 or m2) - m = self.m1 if self.rng.random() < self.p else self.m2 - # Target nodes for new edges - targets = list(range(m)) - while source < num_node: - # Add edges to m nodes from the source. - origins = [source] * m - conn[origins, targets] = True - if not self.directed: - conn[targets, origins] = True - # Add one node to the list for each new edge just created. - repeated_nodes.extend(targets) - # And the new node "source" has m edges to add to the list. 
- repeated_nodes.extend([source] * m) - # Pick which m to use next time (m1 or m2) - m = self.m1 if self.rng.random() < self.p else self.m2 - # Now choose m unique nodes from the existing nodes - # Pick uniformly from repeated_nodes (preferential attachment) - targets = list(self._connect(np.asarray(repeated_nodes), m)) - source += 1 - return conn - - # List of existing nodes, with nodes repeated once for each adjacent edge - # Preallocate repeated_nodes as a numpy array - repeated_nodes = np.empty(2 * num_node * max(self.m1, self.m2), dtype=int) - size_repeated_nodes = 0 - # Start adding the remaining nodes. - source = max(self.m1, self.m2) - # Pick which m to use first time (m1 or m2) - m = self.m1 if self.rng.random() < self.p else self.m2 - # Target nodes for new edges - targets = list(range(m)) - while source < num_node: - # Add edges to m nodes from the source. - origins = [source] * m - conn[origins, targets] = True - if not self.directed: - conn[targets, origins] = True - # Add one node to the list for each new edge just created. - repeated_nodes[size_repeated_nodes:size_repeated_nodes + m] = targets - size_repeated_nodes += m - # And the new node "source" has m edges to add to the list. - repeated_nodes[size_repeated_nodes:size_repeated_nodes + m] = source - size_repeated_nodes += m - # Pick which m to use next time (m1 or m2) - m = self.m1 if self.rng.random() < self.p else self.m2 - # Now choose m unique nodes from the existing nodes - # Pick uniformly from repeated_nodes (preferential attachment) - targets = list(self._connect(repeated_nodes[:size_repeated_nodes], m)) - source += 1 - - return conn - - -class PowerLaw(TwoEndConnector): - """Holme and Kim algorithm for growing graphs with powerlaw - degree distribution and approximate average clustering. - - Parameters - ---------- - m : int - the number of random edges to add for each new node - p : float, - Probability of adding a triangle after adding a random edge - seed : integer, random_state, or None (default) - Indicator of random number generation state. - - Notes - ----- - The average clustering has a hard time getting above a certain - cutoff that depends on :math:`m`. This cutoff is often quite low. The - transitivity (fraction of triangles to possible triangles) seems to - decrease with network size. - - It is essentially the Barabási–Albert (BA) growth model with an - extra step that each random edge is followed by a chance of - making an edge to one of its neighbors too (and thus a triangle). - - This algorithm improves on BA in the sense that it enables a - higher average clustering to be attained if desired. - - It seems possible to have a disconnected graph with this algorithm - since the initial :math:`m` nodes may not be all linked to a new node - on the first iteration like the BA model. - - Raises - ------ - ConnectorError - If :math:`m` does not satisfy :math:`1 <= m <= n` or :math:`p` does not - satisfy :math:`0 <= p <= 1`. - - References - ---------- - .. [1] P. Holme and B. J. Kim, - "Growing scale-free networks with tunable clustering", - Phys. Rev. E, 65, 026107, 2002. 
- """ - - def __init__(self, m: int, p: float, directed=False, seed=None, **kwargs): - super(PowerLaw, self).__init__(**kwargs) - self.m = m - self.p = p - if self.p > 1 or self.p < 0: - raise ConnectorError(f"p must be in [0,1], while p={self.p}") - self.directed = directed - self.seed = format_seed(seed) - self.rng = np.random.RandomState(self.seed) - rng = np.random if SUPPORT_NUMBA else self.rng - - def _random_subset(seq, m): - targets = set() - while len(targets) < m: - x = rng.choice(seq) - targets.add(x) - return targets - - self._connect = numba_jit(_random_subset) - - def __repr__(self): - return (f'{self.__class__.__name__}(m={self.m}, p={self.p}, directed={self.directed}, seed={self.seed})') - - def build_mat(self, isOptimized=True): - assert self.pre_num == self.post_num - # seed - self.rng = np.random.RandomState(self.seed) - numba_seed(self.seed) - num_node = self.pre_num - if self.m < 1 or num_node < self.m: - raise ConnectorError(f"Must have m>1 and m 1 else p.flatten() for p in pre_ids]) - size = np.prod(pre_size) - - for i in range(size): - pre_pos = np.asarray([p[i] for p in pre_ids]) - pres, posts = f(pre_pos, pre_size=pre_size, post_size=post_size, n_dim=n_dim) - connected_pres.extend(pres) - connected_posts.extend(posts) - return np.asarray(connected_pres), np.asarray(connected_posts) +# -*- coding: utf-8 -*- + +from functools import partial +from typing import Optional + +from jax import vmap, jit, numpy as jnp +import numpy as np + +import brainpy.math as bm +from brainpy.errors import ConnectorError +from brainpy.tools import numba_seed, numba_jit, numba_range, format_seed +from brainpy._src.tools.package import SUPPORT_NUMBA +from .base import * + + +__all__ = [ + 'FixedProb', + 'FixedPreNum', + 'FixedPostNum', + 'FixedTotalNum', + 'GaussianProb', + 'ProbDist', + + 'SmallWorld', + 'ScaleFreeBA', + 'ScaleFreeBADual', + 'PowerLaw', +] + + +class FixedProb(TwoEndConnector): + """Connect the post-synaptic neurons with fixed probability. + + Parameters + ---------- + prob: float + The conn probability. + pre_ratio: float + The ratio of pre-synaptic neurons to connect. + include_self : bool + Whether create (i, i) conn? + allow_multi_conn: bool + Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? + + .. versionadded:: 2.2.3.2 + + seed : optional, int + Seed the random generator. + """ + + def __init__(self, + prob, + pre_ratio=1., + include_self=True, + allow_multi_conn=False, + seed=None, + **kwargs): + super(FixedProb, self).__init__(**kwargs) + assert 0. <= prob <= 1. + assert 0. <= pre_ratio <= 1. + self.prob = prob + self.pre_ratio = pre_ratio + self.include_self = include_self + self.seed = format_seed(seed) + self.allow_multi_conn = allow_multi_conn + self._jaxrand = bm.random.default_rng(self.seed) + self._nprand = np.random.RandomState(self.seed) + + def __repr__(self): + return (f'{self.__class__.__name__}(prob={self.prob}, pre_ratio={self.pre_ratio}, ' + f'include_self={self.include_self}, allow_multi_conn={self.allow_multi_conn}, ' + f'seed={self.seed})') + + def _iii(self): + if (not self.include_self) and (self.pre_num != self.post_num): + raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
' + f'But `include_self` is set to True.') + + if self.pre_ratio < 1.: + pre_num_to_select = int(self.pre_num * self.pre_ratio) + pre_ids = self._jaxrand.choice(self.pre_num, size=(pre_num_to_select,), replace=False) + else: + pre_num_to_select = self.pre_num + pre_ids = jnp.arange(self.pre_num) + + post_num_total = self.post_num + post_num_to_select = int(self.post_num * self.prob) + + if self.allow_multi_conn: + selected_post_ids = self._jaxrand.randint(0, post_num_total, (pre_num_to_select, post_num_to_select)) + + else: + if SUPPORT_NUMBA: + rng = np.random + numba_seed(self._nprand.randint(0, int(1e8))) + else: + rng = self._nprand + + @numba_jit # (parallel=True, nogil=True) + def single_conn(): + posts = np.zeros((pre_num_to_select, post_num_to_select), dtype=IDX_DTYPE) + for i in numba_range(pre_num_to_select): + posts[i] = rng.choice(post_num_total, post_num_to_select, replace=False) + return posts + + selected_post_ids = jnp.asarray(single_conn()) + return pre_num_to_select, post_num_to_select, bm.as_jax(selected_post_ids), bm.as_jax(pre_ids) + + def build_coo(self): + _, post_num_to_select, selected_post_ids, pre_ids = self._iii() + selected_post_ids = selected_post_ids.flatten() + selected_pre_ids = jnp.repeat(pre_ids, post_num_to_select) + if not self.include_self: + true_ids = selected_pre_ids != selected_post_ids + selected_pre_ids = selected_pre_ids[true_ids] + selected_post_ids = selected_post_ids[true_ids] + return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) + + def build_csr(self): + pre_num_to_select, post_num_to_select, selected_post_ids, pre_ids = self._iii() + pre_nums = jnp.ones(pre_num_to_select) * post_num_to_select + if not self.include_self: + true_ids = selected_post_ids == jnp.reshape(pre_ids, (-1, 1)) + pre_nums -= jnp.sum(true_ids, axis=1) + selected_post_ids = selected_post_ids.flatten()[jnp.logical_not(true_ids).flatten()] + else: + selected_post_ids = selected_post_ids.flatten() + selected_pre_inptr = jnp.cumsum(jnp.concatenate([jnp.zeros(1), pre_nums])) + return selected_post_ids.astype(get_idx_type()), selected_pre_inptr.astype(get_idx_type()) + + def build_mat(self): + if self.pre_ratio < 1.: + pre_state = self._jaxrand.uniform(size=(self.pre_num, 1)) < self.pre_ratio + mat = (self._jaxrand.uniform(size=(self.pre_num, self.post_num)) < self.prob) * pre_state + else: + mat = (self._jaxrand.uniform(size=(self.pre_num, self.post_num)) < self.prob) + mat = bm.asarray(mat) + if not self.include_self: + bm.fill_diagonal(mat, False) + return mat.astype(MAT_DTYPE) + + +class FixedTotalNum(TwoEndConnector): + """Connect the synaptic neurons with fixed total number. + + Parameters + ---------- + num : float,int + The conn total number. + allow_multi_conn : bool, optional + Whether allow one pre-synaptic neuron connects to multiple post-synaptic neurons. + seed: int, optional + The random number seed. + """ + + def __init__(self, + num, + allow_multi_conn=False, + seed=None, **kwargs): + super().__init__(**kwargs) + if isinstance(num, int): + assert num >= 0, '"num" must be a non-negative integer.' + elif isinstance(num, float): + assert 0. <= num <= 1., '"num" must be in [0., 1.).' 
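
The `build_csr` methods above turn per-row connection counts into an index pointer with a cumulative sum. A tiny self-contained sketch of that CSR layout (data values are illustrative):

```python
import numpy as np

# Three pre neurons with 2, 0, and 3 outgoing connections:
post_ids = np.array([4, 7, 1, 2, 9])               # column indices, row by row
counts = np.array([2, 0, 3])
indptr = np.concatenate([[0], np.cumsum(counts)])  # -> [0, 2, 2, 5]
row1 = post_ids[indptr[1]:indptr[2]]               # slice of row 1: empty
print(indptr, row1)
```
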
+ else: + raise ConnectorError(f'Unknown type: {type(num)}') + self.num = num + self.seed = format_seed(seed) + self.allow_multi_conn = allow_multi_conn + self.rng = bm.random.RandomState(self.seed) + + def build_coo(self): + mat_element_num = self.pre_num * self.post_num + if self.num > mat_element_num: + raise ConnectorError(f'"num" must be smaller than "all2all num", ' + f'but got {self.num} > {mat_element_num}') + if self.allow_multi_conn: + selected_pre_ids = self.rng.randint(0, self.pre_num, (self.num,)) + selected_post_ids = self.rng.randint(0, self.post_num, (self.num,)) + else: + index = self.rng.choice(mat_element_num, size=(self.num,), replace=False) + selected_pre_ids = index // self.post_num + selected_post_ids = index % self.post_num + return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) + + def __repr__(self): + return f'{self.__class__.__name__}(num={self.num}, seed={self.seed})' + + +class FixedNum(TwoEndConnector): + def __init__(self, + num, + include_self=True, + allow_multi_conn=False, + seed=None, + **kwargs): + super(FixedNum, self).__init__(**kwargs) + if isinstance(num, int): + assert num >= 0, '"num" must be a non-negative integer.' + elif isinstance(num, float): + assert 0. <= num <= 1., '"num" must be in [0., 1.).' + else: + raise ConnectorError(f'Unknown type: {type(num)}') + self.num = num + self.seed = format_seed(seed) + self.include_self = include_self + self.allow_multi_conn = allow_multi_conn + self.rng = bm.random.RandomState(self.seed) if allow_multi_conn else np.random.RandomState(self.seed) + + def __repr__(self): + return f'{self.__class__.__name__}(num={self.num}, include_self={self.include_self}, seed={self.seed})' + + +class FixedPreNum(FixedNum): + """Connect a fixed number pf pre-synaptic neurons for each post-synaptic neuron. + + Parameters + ---------- + num : float, int + The conn probability (if "num" is float) or the fixed number of + connectivity (if "num" is int). + include_self : bool + Whether create (i, i) conn ? + seed : None, int + Seed the random generator. + allow_multi_conn: bool + Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? + + .. versionadded:: 2.2.3.2 + + """ + + def build_coo(self): + if isinstance(self.num, int) and self.num > self.pre_num: + raise ConnectorError(f'"num" must be smaller than "pre_num", ' + f'but got {self.num} > {self.pre_num}') + if (not self.include_self) and (self.pre_num != self.post_num): + raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
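
When `allow_multi_conn=False`, `FixedTotalNum.build_coo` above samples flat indices of the pre x post matrix without replacement and unravels them with `//` and `%`. A standalone NumPy sketch of that trick (names illustrative):

```python
import numpy as np

def fixed_total_pairs(pre_num, post_num, num, seed=0):
    rng = np.random.RandomState(seed)
    index = rng.choice(pre_num * post_num, size=num, replace=False)
    return index // post_num, index % post_num     # row ids, column ids

pre_ids, post_ids = fixed_total_pairs(5, 8, num=10)
print(len(set(zip(pre_ids.tolist(), post_ids.tolist()))))  # 10 distinct pairs
```
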
' + f'But `include_self` is set to True.') + pre_num_to_select = int(self.pre_num * self.num) if isinstance(self.num, float) else self.num + pre_num_total = self.pre_num + post_num_total = self.post_num + + if self.allow_multi_conn: + selected_pre_ids = self.rng.randint(0, pre_num_total, (post_num_total, pre_num_to_select,)) + + else: + if SUPPORT_NUMBA: + rng = np.random + numba_seed(self.rng.randint(0, int(1e8))) + else: + rng = self.rng + + @numba_jit # (parallel=True, nogil=True) + def single_conn(): + posts = np.zeros((post_num_total, pre_num_to_select), dtype=IDX_DTYPE) + for i in numba_range(post_num_total): + posts[i] = rng.choice(pre_num_total, pre_num_to_select, replace=False) + return posts + + selected_pre_ids = jnp.asarray(single_conn()) + + post_nums = jnp.ones((post_num_total,), dtype=get_idx_type()) * pre_num_to_select + if not self.include_self: + true_ids = selected_pre_ids == jnp.reshape(jnp.arange(pre_num_total), (-1, 1)) + post_nums -= jnp.sum(true_ids, axis=1) + selected_pre_ids = selected_pre_ids.flatten()[jnp.logical_not(true_ids).flatten()] + else: + selected_pre_ids = selected_pre_ids.flatten() + selected_post_ids = jnp.repeat(jnp.arange(post_num_total), post_nums) + return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) + + +class FixedPostNum(FixedNum): + """Connect the fixed number of post-synaptic neurons for each pre-synaptic neuron. + + Parameters + ---------- + num : float, int + The conn probability (if "num" is float) or the fixed number of + connectivity (if "num" is int). + include_self : bool + Whether create (i, i) conn ? + seed : None, int + Seed the random generator. + allow_multi_conn: bool + Allow one pre-synaptic neuron connects to multiple post-synaptic neurons? + + .. versionadded:: 2.2.3.2 + + """ + + def _ii(self): + if isinstance(self.num, int) and self.num > self.post_num: + raise ConnectorError(f'"num" must be smaller than "post_num", ' + f'but got {self.num} > {self.post_num}') + if (not self.include_self) and (self.pre_num != self.post_num): + raise ConnectorError(f'We found pre_num != post_num ({self.pre_num} != {self.post_num}). 
' + f'But `include_self` is set to True.') + post_num_to_select = int(self.post_num * self.num) if isinstance(self.num, float) else self.num + pre_num_to_select = self.pre_num + pre_ids = jnp.arange(self.pre_num) + post_num_total = self.post_num + + if self.allow_multi_conn: + selected_post_ids = self.rng.randint(0, post_num_total, (pre_num_to_select, post_num_to_select,)) + + else: + if SUPPORT_NUMBA: + rng = np.random + numba_seed(self.rng.randint(0, int(1e8))) + else: + rng = self.rng + + @numba_jit # (parallel=True, nogil=True) + def single_conn(): + posts = np.zeros((pre_num_to_select, post_num_to_select), dtype=IDX_DTYPE) + for i in numba_range(pre_num_to_select): + posts[i] = rng.choice(post_num_total, post_num_to_select, replace=False) + return posts + + selected_post_ids = jnp.asarray(single_conn()) + return pre_num_to_select, post_num_to_select, bm.as_jax(selected_post_ids), bm.as_jax(pre_ids) + + def build_coo(self): + _, post_num_to_select, selected_post_ids, pre_ids = self._ii() + selected_post_ids = selected_post_ids.flatten() + selected_pre_ids = jnp.repeat(pre_ids, post_num_to_select) + if not self.include_self: + true_ids = selected_pre_ids != selected_post_ids + selected_pre_ids = selected_pre_ids[true_ids] + selected_post_ids = selected_post_ids[true_ids] + return selected_pre_ids.astype(get_idx_type()), selected_post_ids.astype(get_idx_type()) + + def build_csr(self): + pre_num_to_select, post_num_to_select, selected_post_ids, pre_ids = self._ii() + pre_nums = jnp.ones(pre_num_to_select) * post_num_to_select + if not self.include_self: + true_ids = selected_post_ids == jnp.reshape(pre_ids, (-1, 1)) + pre_nums -= jnp.sum(true_ids, axis=1) + selected_post_ids = selected_post_ids.flatten()[jnp.logical_not(true_ids).flatten()] + else: + selected_post_ids = selected_post_ids.flatten() + selected_pre_inptr = jnp.cumsum(jnp.concatenate([jnp.zeros(1), pre_nums])) + return selected_post_ids.astype(get_idx_type()), selected_pre_inptr.astype(get_idx_type()) + + +@jit +@partial(vmap, in_axes=(0, None, None)) +def gaussian_prob_dist_cal1(i_value, post_values, sigma): + dists = jnp.abs(i_value - post_values) + exp_dists = jnp.exp(-(jnp.sqrt(jnp.sum(dists ** 2, axis=0)) / sigma) ** 2 / 2) + return bm.asarray(exp_dists) + + +@jit +@partial(vmap, in_axes=(0, None, None, None)) +def gaussian_prob_dist_cal2(i_value, post_values, value_sizes, sigma): + dists = jnp.abs(i_value - post_values) + dists = jnp.where(dists > (value_sizes / 2), value_sizes - dists, dists) + exp_dists = jnp.exp(-(jnp.sqrt(jnp.sum(dists ** 2, axis=0)) / sigma) ** 2 / 2) + return bm.asarray(exp_dists) + + +class GaussianProb(OneEndConnector): + r"""Builds a Gaussian connectivity pattern within a population of neurons, + where the connection probability decay according to the gaussian function. + + Specifically, for any pair of neurons :math:`(i, j)`, + + .. math:: + + p(i, j)=\exp(-\frac{\sum_{k=1}^n |v_k^i - v_k^j|^2 }{2\sigma^2}) + + where :math:`v_k^i` is the :math:`i`-th neuron's encoded value at dimension :math:`k`. + + Parameters + ---------- + sigma : float + Width of the Gaussian function. + encoding_values : optional, list, tuple, int, float + The value ranges to encode for neurons at each axis. + + - If `values` is not provided, the neuron only encodes each positional + information, i.e., :math:`(i, j, k, ...)`, where :math:`i, j, k` is + the index in the high-dimensional space. + - If `values` is a single tuple/list of int/float, neurons at each dimension + will encode the same range of values. 
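
The jit- and vmap-decorated `gaussian_prob_dist_cal2` above folds distances onto a ring before applying the Gaussian. A NumPy sketch of just that periodic-boundary folding (illustrative, outside JAX):

```python
import numpy as np

def periodic_distance(d, size):
    # Same folding as jnp.where(dists > size / 2, size - dists, dists) above.
    d = np.abs(d)
    return np.where(d > size / 2, size - d, d)

print(periodic_distance(np.array([0.1, 0.9]), size=1.0))  # -> [0.1, 0.1]
```
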
For example, ``values=(0, np.pi)``, + neurons at each dimension will encode a continuous value space ``[0, np.pi]``. + - If `values` is a tuple/list of list/tuple, it means the value space will be + different for each dimension. For example, ``values=((-np.pi, np.pi), (10, 20), (0, 2 * np.pi))``. + + periodic_boundary : bool + Whether the neuron encode the value space with the periodic boundary. + normalize : bool + Whether normalize the connection probability . + include_self : bool + Whether create the connection at the same position. + seed : int + The random seed. + """ + + def __init__( + self, + sigma: float, + encoding_values: Optional[np.ndarray] = None, + normalize: bool = True, + include_self: bool = True, + periodic_boundary: bool = False, + seed: int = None, + **kwargs + ): + super(GaussianProb, self).__init__(**kwargs) + self.sigma = sigma + self.encoding_values = encoding_values + self.normalize = normalize + self.include_self = include_self + self.periodic_boundary = periodic_boundary + self.seed = format_seed(seed) + self.rng = np.random.RandomState(self.seed) + + def __repr__(self): + return (f'{self.__class__.__name__}(sigma={self.sigma}, ' + f'normalize={self.normalize}, ' + f'periodic_boundary={self.periodic_boundary}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + + def build_mat(self, isOptimized=True): + self.rng = np.random.RandomState(self.seed) + # value range to encode + if self.encoding_values is None: + value_ranges = tuple([(0, s) for s in self.pre_size]) + elif isinstance(self.encoding_values, (tuple, list)): + if len(self.encoding_values) == 0: + raise ConnectorError(f'encoding_values has a length of 0.') + elif isinstance(self.encoding_values[0], (int, float)): + assert len(self.encoding_values) == 2 + assert self.encoding_values[0] < self.encoding_values[1] + value_ranges = tuple([self.encoding_values for _ in self.pre_size]) + elif isinstance(self.encoding_values[0], (tuple, list)): + if len(self.encoding_values) != len(self.pre_size): + raise ConnectorError(f'The network size has {len(self.pre_size)} dimensions, while ' + f'the encoded values provided only has {len(self.encoding_values)}-D. 
' + f'Error in {str(self)}.') + for v in self.encoding_values: + assert isinstance(v[0], (int, float)) + assert len(v) == 2 + value_ranges = tuple(self.encoding_values) + else: + raise ConnectorError(f'Unsupported encoding values: {self.encoding_values}') + else: + raise ConnectorError(f'Unsupported encoding values: {self.encoding_values}') + + # values + values = [np.linspace(vs[0], vs[1], n + 1)[:n] for vs, n in zip(value_ranges, self.pre_size)] + # post_values = np.stack([v.flatten() for v in np.meshgrid(*values, indexing='ij')]) + post_values = np.stack([v.flatten() for v in np.meshgrid(*values)]) + value_sizes = np.array([v[1] - v[0] for v in value_ranges]) + if value_sizes.ndim < post_values.ndim: + value_sizes = np.expand_dims(value_sizes, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) + + # probability of connections + if isOptimized: + i_value_list = np.zeros(shape=(self.pre_num, len(self.pre_size), 1)) + for i in range(self.pre_num): + list_index = i + # values for node i + i_coordinate = tuple() + for s in self.pre_size[:-1]: + i, pos = divmod(i, s) + i_coordinate += (pos,) + i_coordinate += (i,) + i_value = np.array([values[i][c] for i, c in enumerate(i_coordinate)]) + if i_value.ndim < post_values.ndim: + i_value = np.expand_dims(i_value, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) + i_value_list[list_index] = i_value + + if self.periodic_boundary: + prob_mat = gaussian_prob_dist_cal2(i_value_list, post_values, value_sizes, self.sigma) + else: + prob_mat = gaussian_prob_dist_cal1(i_value_list, post_values, self.sigma) + else: + prob_mat = [] + for i in range(self.pre_num): + # values for node i + i_coordinate = tuple() + for s in self.pre_size[:-1]: + i, pos = divmod(i, s) + i_coordinate += (pos,) + i_coordinate += (i,) + i_value = np.array([values[i][c] for i, c in enumerate(i_coordinate)]) + if i_value.ndim < post_values.ndim: + i_value = np.expand_dims(i_value, axis=tuple([i + 1 for i in range(post_values.ndim - 1)])) + # distances + dists = np.abs(i_value - post_values) + if self.periodic_boundary: + dists = np.where(dists > value_sizes / 2, value_sizes - dists, dists) + exp_dists = np.exp(-(np.linalg.norm(dists, axis=0) / self.sigma) ** 2 / 2) + prob_mat.append(exp_dists) + prob_mat = np.stack(prob_mat) + + if self.normalize: + prob_mat /= prob_mat.max() + + # connectivity + conn_mat = np.asarray(prob_mat) >= self.rng.random(prob_mat.shape) + if not self.include_self: + np.fill_diagonal(conn_mat, False) + return conn_mat + + +class SmallWorld(TwoEndConnector): + """Build a Watts–Strogatz small-world graph. + + Parameters + ---------- + num_neighbor : int + Each node is joined with its `k` nearest neighbors in a ring + topology. + prob : float + The probability of rewiring each edge + directed : bool + Whether the graph is a directed graph. + include_self : bool + Whether include the node self. + + Notes + ----- + First create a ring over :math:`num\_node` nodes [1]_. Then each node in the ring is + joined to its :math:`num\_neighbor` nearest neighbors (or :math:`num\_neighbor - 1` neighbors + if :math:`num\_neighbor` is odd). Then shortcuts are created by replacing some edges as + follows: for each edge :math:`(u, v)` in the underlying ":math:`num\_node`-ring with + :math:`num\_neighbor` nearest neighbors" with probability :math:`prob` replace it with a new + edge :math:`(u, w)` with uniformly random choice of existing node :math:`w`. + + References + ---------- + .. [1] Duncan J. Watts and Steven H. 
Strogatz, + Collective dynamics of small-world networks, + Nature, 393, pp. 440--442, 1998. + """ + + def __init__( + self, + num_neighbor, + prob, + directed=False, + include_self=False, + seed=None, + **kwargs + ): + super(SmallWorld, self).__init__(**kwargs) + self.prob = prob + self.directed = directed + self.num_neighbor = num_neighbor + self.include_self = include_self + + self.seed = format_seed(seed) + self.rng = np.random.RandomState(seed=self.seed) + rng = np.random if SUPPORT_NUMBA else self.rng + + def _smallworld_rewire(i, all_j): + if rng.random(1) < prob: + non_connected = np.where(np.logical_not(all_j))[0] + if len(non_connected) <= 1: + return -1 + # Enforce no self-loops or multiple edges + w = rng.choice(non_connected) + while (not include_self) and w == i: + # non_connected.remove(w) + w = rng.choice(non_connected) + return w + else: + return -1 + + self._connect = numba_jit(_smallworld_rewire) + + def __repr__(self): + return (f'{self.__class__.__name__}(prob={self.prob}, ' + f'directed={self.directed}, ' + f'num_neighbor={self.num_neighbor}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + + def build_conn(self): + assert self.pre_size == self.post_size + + # seed + self.seed = self.rng.randint(1, int(1e7)) + numba_seed(self.seed) + + if isinstance(self.pre_size, int) or (isinstance(self.pre_size, (tuple, list)) and len(self.pre_size) == 1): + num_node = self.pre_num + + if self.num_neighbor > num_node: + raise ConnectorError("num_neighbor > num_node, choose smaller num_neighbor or larger num_node") + # If k == n, the graph is complete not Watts-Strogatz + if self.num_neighbor == num_node: + conn = np.ones((num_node, num_node), dtype=MAT_DTYPE) + else: + conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) + nodes = np.array(list(range(num_node))) # nodes are labeled 0 to n-1 + # connect each node to k/2 neighbors + for j in range(1, self.num_neighbor // 2 + 1): + targets = np.concatenate([nodes[j:], nodes[0:j]]) # first j nodes are now last in list + conn[nodes, targets] = True + conn[targets, nodes] = True + + # rewire edges from each node + # loop over all nodes in order (label) and neighbors in order (distance) + # no self loops or multiple edges allowed + for j in range(1, self.num_neighbor // 2 + 1): # outer loop is neighbors + targets = np.concatenate([nodes[j:], nodes[0:j]]) # first j nodes are now last in list + if self.directed: + # inner loop in node order + for u, v in zip(nodes, targets): + w = self._connect(prob=self.prob, i=u, all_j=conn[u]) + if w != -1: + conn[u, v] = False + conn[u, w] = True + w = self._connect(prob=self.prob, i=u, all_j=conn[:, u]) + if w != -1: + conn[v, u] = False + conn[w, u] = True + else: + # inner loop in node order + for u, v in zip(nodes, targets): + w = self._connect(i=u, all_j=conn[u]) + if w != -1: + conn[u, v] = False + conn[v, u] = False + conn[u, w] = True + conn[w, u] = True + # conn = np.asarray(conn, dtype=MAT_DTYPE) + else: + raise ConnectorError('Currently only support 1D ring connection.') + + return 'mat', conn + + +# def _random_subset(seq, m, rng): +# """Return m unique elements from seq. +# +# This differs from random.sample which can return repeated +# elements if seq holds repeated elements. +# +# Note: rng is a random.Random or numpy.random.RandomState instance. 
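# --- Editor's aside: a minimal usage sketch for the SmallWorld builder above.
# Hedged: it assumes the public alias `bp.conn.SmallWorld` and the generic
# `require('conn_mat')` connector API; the `pre`/`post` keyword arguments follow
# the same convention the tests later in this diff use for other connectors.
import brainpy as bp

sw = bp.conn.SmallWorld(num_neighbor=4, prob=0.3, pre=100, post=100, seed=0)
mat = sw.require('conn_mat')   # dense boolean adjacency matrix
# each node keeps roughly num_neighbor ring edges, a fraction `prob` of which
# have been rewired to uniformly chosen targets
print(mat.sum() / 100)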
+# """ +# targets = set() +# while len(targets) < m: +# x = rng.choice(seq) +# targets.add(x) +# return targets + + +class ScaleFreeBA(TwoEndConnector): + """Build a random graph according to the Barabási–Albert preferential + attachment model. + + A graph of :math:`num\_node` nodes is grown by attaching new nodes each with + :math:`m` edges that are preferentially attached to existing nodes + with high degree. + + Parameters + ---------- + m : int + Number of edges to attach from a new node to existing nodes + seed : integer, random_state, or None (default) + Indicator of random number generation state. + + Raises + ------ + ConnectorError + If `m` does not satisfy ``1 <= m < n``. + + References + ---------- + .. [1] A. L. Barabási and R. Albert "Emergence of scaling in + random networks", Science 286, pp 509-512, 1999. + """ + + def __init__(self, m, directed=False, seed=None, **kwargs): + super(ScaleFreeBA, self).__init__(**kwargs) + self.m = m + self.directed = directed + self.seed = format_seed(seed) + self.rng = np.random.RandomState(self.seed) + rng = np.random if SUPPORT_NUMBA else self.rng + + def _random_subset(seq, m): + targets = set() + while len(targets) < m: + x = rng.choice(seq) + targets.add(x) + return targets + + self._connect = numba_jit(_random_subset) + + def __repr__(self): + return (f'{self.__class__.__name__}(m={self.m}, ' + f'directed={self.directed}, ' + f'seed={self.seed})') + + def build_mat(self, isOptimized=True): + assert self.pre_num == self.post_num + + # seed + self.rng = np.random.RandomState(self.seed) + numba_seed(self.seed) + + num_node = self.pre_num + if self.m < 1 or self.m >= num_node: + raise ConnectorError(f"Barabási–Albert network must have m >= 1 and " + f"m < n, while m = {self.m} and n = {num_node}") + + # Add m initial nodes (m0 in barabasi-speak) + conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) + # Target nodes for new edges + targets = list(range(self.m)) + # List of existing nodes, with nodes repeated once for each adjacent edge + + if not isOptimized: + repeated_nodes = [] + # Start adding the other n-m nodes. The first node is m. + source = self.m + while source < num_node: + # Add edges to m nodes from the source. + origins = [source] * self.m + conn[origins, targets] = True + if not self.directed: + conn[targets, origins] = True + # Add one node to the list for each new edge just created. + repeated_nodes.extend(targets) + # And the new node "source" has m edges to add to the list. + repeated_nodes.extend([source] * self.m) + # Now choose m unique nodes from the existing nodes + # Pick uniformly from repeated_nodes (preferential attachment) + targets = list(self._connect(np.asarray(repeated_nodes), self.m)) + source += 1 + return conn + + # List of existing nodes, with nodes repeated once for each adjacent edge + # Preallocate repeated_nodes as a numpy array + repeated_nodes = np.empty(2 * num_node * self.m, dtype=int) + size_repeated_nodes = 0 + # Start adding the other n-m nodes. The first node is m. + source = self.m + while source < num_node: + # Add edges to m nodes from the source. + origins = [source] * self.m + conn[origins, targets] = True + if not self.directed: + conn[targets, origins] = True + # Add one node to the list for each new edge just created. + repeated_nodes[size_repeated_nodes:size_repeated_nodes + self.m] = targets + size_repeated_nodes += self.m + # And the new node "source" has m edges to add to the list. 
+ repeated_nodes[size_repeated_nodes:size_repeated_nodes + self.m] = source + size_repeated_nodes += self.m + # Now choose m unique nodes from the existing nodes + # Pick uniformly from repeated_nodes (preferential attachment) + targets = list(self._connect(repeated_nodes[:size_repeated_nodes], self.m)) + source += 1 + + return conn + + +class ScaleFreeBADual(TwoEndConnector): + r"""Build a random graph according to the dual Barabási–Albert preferential + attachment model. + + A graph of :math::`num\_node` nodes is grown by attaching new nodes each with either $m_1$ + edges (with probability :math:`p`) or :math:`m_2` edges (with probability :math:`1-p`) that + are preferentially attached to existing nodes with high degree. + + Parameters + ---------- + m1 : int + Number of edges to attach from a new node to existing nodes with probability :math:`p` + m2 : int + Number of edges to attach from a new node to existing nodes with probability :math:`1-p` + p : float + The probability of attaching :math:`m\_1` edges (as opposed to :math:`m\_2` edges) + seed : integer, random_state, or None (default) + Indicator of random number generation state. + + Raises + ------ + ConnectorError + If `m1` and `m2` do not satisfy ``1 <= m1,m2 < n`` or `p` does not satisfy ``0 <= p <= 1``. + + References + ---------- + .. [1] N. Moshiri "The dual-Barabasi-Albert model", arXiv:1810.10538. + """ + + def __init__(self, m1, m2, p, directed=False, seed=None, **kwargs): + super(ScaleFreeBADual, self).__init__(**kwargs) + self.m1 = m1 + self.m2 = m2 + self.p = p + self.directed = directed + self.seed = format_seed(seed) + self.rng = np.random.RandomState(self.seed) + rng = np.random if SUPPORT_NUMBA else self.rng + + def _random_subset(seq, m): + targets = set() + while len(targets) < m: + x = rng.choice(seq) + targets.add(x) + return targets + + self._connect = numba_jit(_random_subset) + + def __repr__(self): + return (f'{self.__class__.__name__}(m1={self.m1}, m2={self.m2}, ' + f'p={self.p}, directed={self.directed}, seed={self.seed})') + + def build_mat(self, isOptimized=True): + assert self.pre_num == self.post_num + # seed + self.rng = np.random.RandomState(self.seed) + numba_seed(self.seed) + + num_node = self.pre_num + if self.m1 < 1 or self.m1 >= num_node: + raise ConnectorError(f"Dual Barabási–Albert network must have m1 >= 1 and m1 < num_node, " + f"while m1 = {self.m1} and num_node = {num_node}.") + if self.m2 < 1 or self.m2 >= num_node: + raise ConnectorError(f"Dual Barabási–Albert network must have m2 >= 1 and m2 < num_node, " + f"while m2 = {self.m2} and num_node = {num_node}.") + if self.p < 0 or self.p > 1: + raise ConnectorError(f"Dual Barabási–Albert network must have 0 <= p <= 1, while p = {self.p}") + + # Add max(m1,m2) initial nodes (m0 in barabasi-speak) + conn = np.zeros((num_node, num_node), dtype=MAT_DTYPE) + + if not isOptimized: + # List of existing nodes, with nodes repeated once for each adjacent edge + repeated_nodes = [] + # Start adding the remaining nodes. + source = max(self.m1, self.m2) + # Pick which m to use first time (m1 or m2) + m = self.m1 if self.rng.random() < self.p else self.m2 + # Target nodes for new edges + targets = list(range(m)) + while source < num_node: + # Add edges to m nodes from the source. + origins = [source] * m + conn[origins, targets] = True + if not self.directed: + conn[targets, origins] = True + # Add one node to the list for each new edge just created. + repeated_nodes.extend(targets) + # And the new node "source" has m edges to add to the list. 
+ repeated_nodes.extend([source] * m) + # Pick which m to use next time (m1 or m2) + m = self.m1 if self.rng.random() < self.p else self.m2 + # Now choose m unique nodes from the existing nodes + # Pick uniformly from repeated_nodes (preferential attachment) + targets = list(self._connect(np.asarray(repeated_nodes), m)) + source += 1 + return conn + + # List of existing nodes, with nodes repeated once for each adjacent edge + # Preallocate repeated_nodes as a numpy array + repeated_nodes = np.empty(2 * num_node * max(self.m1, self.m2), dtype=int) + size_repeated_nodes = 0 + # Start adding the remaining nodes. + source = max(self.m1, self.m2) + # Pick which m to use first time (m1 or m2) + m = self.m1 if self.rng.random() < self.p else self.m2 + # Target nodes for new edges + targets = list(range(m)) + while source < num_node: + # Add edges to m nodes from the source. + origins = [source] * m + conn[origins, targets] = True + if not self.directed: + conn[targets, origins] = True + # Add one node to the list for each new edge just created. + repeated_nodes[size_repeated_nodes:size_repeated_nodes + m] = targets + size_repeated_nodes += m + # And the new node "source" has m edges to add to the list. + repeated_nodes[size_repeated_nodes:size_repeated_nodes + m] = source + size_repeated_nodes += m + # Pick which m to use next time (m1 or m2) + m = self.m1 if self.rng.random() < self.p else self.m2 + # Now choose m unique nodes from the existing nodes + # Pick uniformly from repeated_nodes (preferential attachment) + targets = list(self._connect(repeated_nodes[:size_repeated_nodes], m)) + source += 1 + + return conn + + +class PowerLaw(TwoEndConnector): + """Holme and Kim algorithm for growing graphs with powerlaw + degree distribution and approximate average clustering. + + Parameters + ---------- + m : int + the number of random edges to add for each new node + p : float, + Probability of adding a triangle after adding a random edge + seed : integer, random_state, or None (default) + Indicator of random number generation state. + + Notes + ----- + The average clustering has a hard time getting above a certain + cutoff that depends on :math:`m`. This cutoff is often quite low. The + transitivity (fraction of triangles to possible triangles) seems to + decrease with network size. + + It is essentially the Barabási–Albert (BA) growth model with an + extra step that each random edge is followed by a chance of + making an edge to one of its neighbors too (and thus a triangle). + + This algorithm improves on BA in the sense that it enables a + higher average clustering to be attained if desired. + + It seems possible to have a disconnected graph with this algorithm + since the initial :math:`m` nodes may not be all linked to a new node + on the first iteration like the BA model. + + Raises + ------ + ConnectorError + If :math:`m` does not satisfy :math:`1 <= m <= n` or :math:`p` does not + satisfy :math:`0 <= p <= 1`. + + References + ---------- + .. [1] P. Holme and B. J. Kim, + "Growing scale-free networks with tunable clustering", + Phys. Rev. E, 65, 026107, 2002. 
+ """ + + def __init__(self, m: int, p: float, directed=False, seed=None, **kwargs): + super(PowerLaw, self).__init__(**kwargs) + self.m = m + self.p = p + if self.p > 1 or self.p < 0: + raise ConnectorError(f"p must be in [0,1], while p={self.p}") + self.directed = directed + self.seed = format_seed(seed) + self.rng = np.random.RandomState(self.seed) + rng = np.random if SUPPORT_NUMBA else self.rng + + def _random_subset(seq, m): + targets = set() + while len(targets) < m: + x = rng.choice(seq) + targets.add(x) + return targets + + self._connect = numba_jit(_random_subset) + + def __repr__(self): + return (f'{self.__class__.__name__}(m={self.m}, p={self.p}, directed={self.directed}, seed={self.seed})') + + def build_mat(self, isOptimized=True): + assert self.pre_num == self.post_num + # seed + self.rng = np.random.RandomState(self.seed) + numba_seed(self.seed) + num_node = self.pre_num + if self.m < 1 or num_node < self.m: + raise ConnectorError(f"Must have m>1 and m 1 else p.flatten() for p in pre_ids]) + size = np.prod(pre_size) + + for i in range(size): + pre_pos = np.asarray([p[i] for p in pre_ids]) + pres, posts = f(pre_pos, pre_size=pre_size, post_size=post_size, n_dim=n_dim) + connected_pres.extend(pres) + connected_posts.extend(posts) + return np.asarray(connected_pres), np.asarray(connected_posts) diff --git a/brainpy/_src/dependency_check.py b/brainpy/_src/dependency_check.py index 3bba20a79..b8bd6e99a 100644 --- a/brainpy/_src/dependency_check.py +++ b/brainpy/_src/dependency_check.py @@ -1,87 +1,135 @@ -import os -import sys -from jax.lib import xla_client - -__all__ = [ - 'import_taichi', - 'import_brainpylib_cpu_ops', - 'import_brainpylib_gpu_ops', -] - -_minimal_brainpylib_version = '0.2.6' -_minimal_taichi_version = (1, 7, 0) - -taichi = None -brainpylib_cpu_ops = None -brainpylib_gpu_ops = None - -taichi_install_info = (f'We need taichi=={_minimal_taichi_version}. ' - f'Currently you can install taichi=={_minimal_taichi_version} through:\n\n' - '> pip install taichi==1.7.0') -os.environ["TI_LOG_LEVEL"] = "error" - - -def import_taichi(): - global taichi - if taichi is None: - with open(os.devnull, 'w') as devnull: - old_stdout = sys.stdout - sys.stdout = devnull - try: - import taichi as taichi # noqa - except ModuleNotFoundError: - raise ModuleNotFoundError(taichi_install_info) - finally: - sys.stdout = old_stdout - - if taichi.__version__ != _minimal_taichi_version: - raise RuntimeError(taichi_install_info) - return taichi - - -def is_brainpylib_gpu_installed(): - return False if brainpylib_gpu_ops is None else True - - -def import_brainpylib_cpu_ops(): - global brainpylib_cpu_ops - if brainpylib_cpu_ops is None: - try: - from brainpylib import cpu_ops as brainpylib_cpu_ops - - for _name, _value in brainpylib_cpu_ops.registrations().items(): - xla_client.register_custom_call_target(_name, _value, platform="cpu") - - import brainpylib - if brainpylib.__version__ < _minimal_brainpylib_version: - raise SystemError(f'This version of brainpy needs brainpylib >= {_minimal_brainpylib_version}.') - if hasattr(brainpylib, 'check_brainpy_version'): - brainpylib.check_brainpy_version() - - except ImportError: - raise ImportError('Please install brainpylib. 
\n' - 'See https://brainpy.readthedocs.io for installation instructions.') - - return brainpylib_cpu_ops - - -def import_brainpylib_gpu_ops(): - global brainpylib_gpu_ops - if brainpylib_gpu_ops is None: - try: - from brainpylib import gpu_ops as brainpylib_gpu_ops - - for _name, _value in brainpylib_gpu_ops.registrations().items(): - xla_client.register_custom_call_target(_name, _value, platform="gpu") - - import brainpylib - if brainpylib.__version__ < _minimal_brainpylib_version: - raise SystemError(f'This version of brainpy needs brainpylib >= {_minimal_brainpylib_version}.') - if hasattr(brainpylib, 'check_brainpy_version'): - brainpylib.check_brainpy_version() - - except ImportError: - raise ImportError('Please install GPU version of brainpylib. \n' - 'See https://brainpy.readthedocs.io for installation instructions.') - - return brainpylib_gpu_ops +import os +import sys + +from jax.lib import xla_client + +__all__ = [ + 'import_taichi', + 'raise_taichi_not_found', + 'import_numba', + 'raise_numba_not_found', + 'import_brainpylib_cpu_ops', + 'import_brainpylib_gpu_ops', +] + +_minimal_brainpylib_version = '0.2.6' +_minimal_taichi_version = (1, 7, 0) + +numba = None +taichi = None +brainpylib_cpu_ops = None +brainpylib_gpu_ops = None + +taichi_install_info = (f'We need taichi=={_minimal_taichi_version}. ' + f'Currently you can install taichi=={_minimal_taichi_version} through:\n\n' + '> pip install taichi==1.7.0') +numba_install_info = ('We need numba. Please install numba by pip . \n' + '> pip install numba') +os.environ["TI_LOG_LEVEL"] = "error" + + +def import_taichi(error_if_not_found=True): + """Internal API to import taichi. + + If taichi is not found, it will raise a ModuleNotFoundError if error_if_not_found is True, + otherwise it will return None. + """ + global taichi + if taichi is None: + with open(os.devnull, 'w') as devnull: + old_stdout = sys.stdout + sys.stdout = devnull + try: + import taichi as taichi # noqa + except ModuleNotFoundError: + if error_if_not_found: + raise raise_taichi_not_found() + finally: + sys.stdout = old_stdout + + if taichi is None: + return None + if taichi.__version__ != _minimal_taichi_version: + raise RuntimeError(taichi_install_info) + return taichi + + +def raise_taichi_not_found(*args, **kwargs): + raise ModuleNotFoundError(taichi_install_info) + + +def import_numba(error_if_not_found=True): + """ + Internal API to import numba. + + If numba is not found, it will raise a ModuleNotFoundError if error_if_not_found is True, + otherwise it will return None. + """ + global numba + if numba is None: + try: + import numba as numba + except ModuleNotFoundError: + if error_if_not_found: + raise_numba_not_found() + else: + return None + return numba + + +def raise_numba_not_found(): + raise ModuleNotFoundError(numba_install_info) + + +def is_brainpylib_gpu_installed(): + return False if brainpylib_gpu_ops is None else True + + +def import_brainpylib_cpu_ops(): + """ + Internal API to import brainpylib cpu_ops. 
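# --- Editor's aside: how call sites are expected to use the optional-import
# API introduced above. This mirrors what linear.py and the tests later in
# this diff do: pass error_if_not_found=False and branch on None instead of
# catching ImportError at every call site.
from brainpy._src.dependency_check import import_taichi

ti = import_taichi(error_if_not_found=False)
if ti is None:
    # taichi is absent: taichi-backed kernels stay undefined, and the public
    # wrappers raise PackageMissingError only when they are actually called
    print('taichi not installed; taichi-dependent operators are disabled')
else:
    print('taichi available:', ti.__version__)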
+ """ + global brainpylib_cpu_ops + if brainpylib_cpu_ops is None: + try: + from brainpylib import cpu_ops as brainpylib_cpu_ops + + for _name, _value in brainpylib_cpu_ops.registrations().items(): + xla_client.register_custom_call_target(_name, _value, platform="cpu") + + import brainpylib + if brainpylib.__version__ < _minimal_brainpylib_version: + raise SystemError(f'This version of brainpy needs brainpylib >= {_minimal_brainpylib_version}.') + if hasattr(brainpylib, 'check_brainpy_version'): + brainpylib.check_brainpy_version() + + except ImportError: + raise ImportError('Please install brainpylib. \n' + 'See https://brainpy.readthedocs.io for installation instructions.') + + return brainpylib_cpu_ops + + +def import_brainpylib_gpu_ops(): + """ + Internal API to import brainpylib gpu_ops. + """ + global brainpylib_gpu_ops + if brainpylib_gpu_ops is None: + try: + from brainpylib import gpu_ops as brainpylib_gpu_ops + + for _name, _value in brainpylib_gpu_ops.registrations().items(): + xla_client.register_custom_call_target(_name, _value, platform="gpu") + + import brainpylib + if brainpylib.__version__ < _minimal_brainpylib_version: + raise SystemError(f'This version of brainpy needs brainpylib >= {_minimal_brainpylib_version}.') + if hasattr(brainpylib, 'check_brainpy_version'): + brainpylib.check_brainpy_version() + + except ImportError: + raise ImportError('Please install GPU version of brainpylib. \n' + 'See https://brainpy.readthedocs.io for installation instructions.') + + return brainpylib_gpu_ops diff --git a/brainpy/_src/dnn/linear.py b/brainpy/_src/dnn/linear.py index 539214d3b..c524fb0bf 100644 --- a/brainpy/_src/dnn/linear.py +++ b/brainpy/_src/dnn/linear.py @@ -6,22 +6,21 @@ import jax import jax.numpy as jnp -import numba import numpy as np from brainpy import math as bm from brainpy._src import connect, initialize as init from brainpy._src.context import share +from brainpy._src.dependency_check import import_taichi from brainpy._src.dnn.base import Layer from brainpy._src.mixin import SupportOnline, SupportOffline, SupportSTDP -from brainpy._src.dependency_check import import_taichi from brainpy.check import is_initializer from brainpy.connect import csr2csc -from brainpy.errors import MathError +from brainpy.errors import MathError, PackageMissingError from brainpy.initialize import XavierNormal, ZeroInit, Initializer, parameter from brainpy.types import ArrayType, Sharding -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ 'Dense', 'Linear', @@ -239,140 +238,106 @@ def update(self, x): return x -# @numba.njit(nogil=True, fastmath=True, parallel=False) -# def _cpu_dense_on_pre(weight, spike, trace, w_min, w_max, out_w): -# out_w[:] = weight -# for i in numba.prange(spike.shape[0]): -# if spike[i]: -# out_w[i] = np.clip(out_w[i] + trace, w_min, w_max) - -@ti.kernel -def _cpu_dense_on_pre(weight: ti.types.ndarray(ndim=2), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=2)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i, j in ti.ndrange(out_w.shape[0], out_w.shape[1]): - out_w[i, j] = weight[i, j] - for i in range(spike.shape[0]): - if spike[i]: - for j in range(out_w.shape[1]): - new_value = out_w[i, j] + trace0 +if ti is not None: + + # @numba.njit(nogil=True, fastmath=True, parallel=False) + # def _cpu_dense_on_post(weight, spike, trace, w_min, w_max, out_w): + # out_w[:] = weight + # for i in 
numba.prange(spike.shape[0]): + # if spike[i]: + # out_w[:, i] = np.clip(out_w[:, i] + trace, w_min, w_max) + + @ti.kernel + def _dense_on_post( + old_w: ti.types.ndarray(ndim=2), + post_spike: ti.types.ndarray(ndim=1), + pre_trace: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + out_w: ti.types.ndarray(ndim=2) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] + num_pre, num_post = out_w.shape + + for i, j in ti.ndrange(num_pre, num_post): + if post_spike[j]: + new_value = out_w[i, j] + pre_trace[i] if new_value < w_min0: out_w[i, j] = w_min0 elif new_value > w_max0: out_w[i, j] = w_max0 else: - out_w[i, j] = new_value - - -@ti.kernel -def _gpu_dense_on_pre(weight: ti.types.ndarray(ndim=1), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=1)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i, j in ti.ndrange(out_w.shape[0], out_w.shape[1]): - out_w[i, j] = weight[i, j] - for i in range(spike.shape[0]): - if spike[i]: - for j in range(out_w.shape[1]): - new_value = out_w[i, j] + trace0 + out_w[i, j] = new_value + else: + out_w[i, j] = old_w[i, j] + + + dense_on_post_prim = bm.XLACustomOp(cpu_kernel=_dense_on_post, gpu_kernel=_dense_on_post) + + + # @numba.njit(nogil=True, fastmath=True, parallel=False) + # def _cpu_dense_on_pre(weight, spike, trace, w_min, w_max, out_w): + # out_w[:] = weight + # for i in numba.prange(spike.shape[0]): + # if spike[i]: + # out_w[i] = np.clip(out_w[i] + trace, w_min, w_max) + + @ti.kernel + def _dense_on_pre( + old_w: ti.types.ndarray(ndim=2), + pre_spike: ti.types.ndarray(ndim=1), + post_trace: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + out_w: ti.types.ndarray(ndim=2) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] + num_pre, num_post = out_w.shape + + for i, j in ti.ndrange(num_pre, num_post): + if pre_spike[i]: + new_value = out_w[i, j] + post_trace[j] if new_value < w_min0: out_w[i, j] = w_min0 elif new_value > w_max0: out_w[i, j] = w_max0 else: out_w[i, j] = new_value - + else: + out_w[i, j] = old_w[i, j] -dense_on_pre_prim = bm.XLACustomOp(cpu_kernel=_cpu_dense_on_pre, - gpu_kernel=_gpu_dense_on_pre) + + dense_on_pre_prim = bm.XLACustomOp(cpu_kernel=_dense_on_pre, gpu_kernel=_dense_on_pre) + +else: + dense_on_pre_prim = None + dense_on_post_prim = None def dense_on_pre(weight, spike, trace, w_min, w_max): + if dense_on_pre_prim is None: + raise PackageMissingError.by_purpose('taichi', 'custom operators') + if w_min is None: w_min = -np.inf if w_max is None: w_max = np.inf - trace = jnp.atleast_1d(trace) w_min = jnp.atleast_1d(w_min) w_max = jnp.atleast_1d(w_max) return dense_on_pre_prim(weight, spike, trace, w_min, w_max, outs=[jax.ShapeDtypeStruct(weight.shape, weight.dtype)])[0] -# @numba.njit(nogil=True, fastmath=True, parallel=False) -# def _cpu_dense_on_post(weight, spike, trace, w_min, w_max, out_w): -# out_w[:] = weight -# for i in numba.prange(spike.shape[0]): -# if spike[i]: -# out_w[:, i] = np.clip(out_w[:, i] + trace, w_min, w_max) - -@ti.kernel -def _cpu_dense_on_post(weight: ti.types.ndarray(ndim=2), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=2)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i, j in ti.ndrange(out_w.shape[0], out_w.shape[1]): - out_w[i, j] = 
weight[i, j] - for i in range(spike.shape[0]): - if spike[i]: - for j in range(out_w.shape[0]): - new_value = out_w[j, i] + trace0 - if new_value < w_min0: - out_w[j, i] = w_min0 - elif new_value > w_max0: - out_w[j, i] = w_max0 - else: - out_w[j, i] = new_value - -@ti.kernel -def _gpu_dense_on_post(weight: ti.types.ndarray(ndim=2), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=2)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i, j in ti.ndrange(out_w.shape[0], out_w.shape[1]): - out_w[i, j] = weight[i, j] - for i in range(spike.shape[0]): - if spike[i]: - for j in range(out_w.shape[0]): - new_value = out_w[j, i] + trace0 - if new_value < w_min0: - out_w[j, i] = w_min0 - elif new_value > w_max0: - out_w[j, i] = w_max0 - else: - out_w[j, i] = new_value - -dense_on_post_prim = bm.XLACustomOp(cpu_kernel=_cpu_dense_on_post, - gpu_kernel=_gpu_dense_on_post) - - def dense_on_post(weight, spike, trace, w_min, w_max): + if dense_on_post_prim is None: + raise PackageMissingError.by_purpose('taichi', 'custom operators') + if w_min is None: w_min = -np.inf if w_max is None: w_max = np.inf - trace = jnp.atleast_1d(trace) w_min = jnp.atleast_1d(w_min) w_max = jnp.atleast_1d(w_max) return dense_on_post_prim(weight, spike, trace, w_min, w_max, @@ -630,7 +595,7 @@ def stdp_update( raise ValueError(f'The shape of weight should be the same as the shape of sparse weight {self.weight.shape}.') if not isinstance(self.weight, bm.Variable): self.tracing_variable('weight', self.weight, self.weight.shape) - if on_pre is not None: # update on presynaptic spike + if on_pre is not None: # update on presynaptic spike spike = on_pre['spike'] trace = on_pre['trace'] self.weight.value = csr_on_pre_update(self.weight.value, self.indices, self.indptr, spike, trace, w_min, w_max) @@ -682,8 +647,7 @@ def __init__( def update(self, x): if x.ndim == 1: return bm.sparse.csrmv(self.weight, self.indices, self.indptr, x, - shape=(self.conn.pre_num, self.conn.post_num), - method=self.method, transpose=self.transpose) + shape=(self.conn.pre_num, self.conn.post_num), transpose=self.transpose) elif x.ndim > 1: shapes = x.shape[:-1] x = bm.flatten(x, end_dim=-2) @@ -694,8 +658,8 @@ def update(self, x): def _batch_csrmv(self, x): return bm.sparse.csrmv(self.weight, self.indices, self.indptr, x, - shape=(self.conn.pre_num, self.conn.post_num), - method=self.method, transpose=self.transpose) + shape=(self.conn.pre_num, self.conn.post_num), transpose=self.transpose) + class EventCSRLinear(_CSRLayer): r"""Synaptic matrix multiplication with event CSR sparse computation. 
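# --- Editor's aside: a plain-NumPy reference model of what the taichi kernel
# `_dense_on_pre` defined earlier in this file's diff computes (hedged sketch
# for readers, not the kernel itself): rows whose presynaptic neuron spiked are
# nudged by the postsynaptic trace and clipped to [w_min, w_max]; every other
# entry is copied through unchanged.
import numpy as np

def dense_on_pre_ref(w, pre_spike, post_trace, w_min, w_max):
    out = w.copy()
    # broadcast the (num_post,) trace across all spiking rows, then clip
    out[pre_spike] = np.clip(out[pre_spike] + post_trace, w_min, w_max)
    return out

w = np.zeros((3, 4))
spk = np.array([True, False, True])
trace = np.full(4, 0.2)
print(dense_on_pre_ref(w, spk, trace, -1.0, 1.0))   # rows 0 and 2 become 0.2
# (_dense_on_post is the column-wise analogue, driven by post_spike and pre_trace)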
@@ -746,99 +710,170 @@ def _batch_csrmv(self, x): shape=(self.conn.pre_num, self.conn.post_num), transpose=self.transpose) -# @numba.njit(nogil=True, fastmath=True, parallel=False) -# def _cpu_csr_on_pre_update(w, indices, indptr, spike, trace, w_min, w_max, out_w): -# out_w[:] = w -# w_min = w_min[()] -# w_max = w_max[()] -# for i in numba.prange(spike.shape[0]): # pre id -# if spike[i]: -# for k in range(indptr[i], indptr[i + 1]): # synapse id -# j = indices[k] # post id -# # out_w[k] = np.clip(out_w[k] + trace[j], w_min, w_max) -# out_w[k] = np.minimum(np.maximum(out_w[k] + trace[j], w_min), w_max) - - -@ti.kernel -def _cpu_csr_on_pre_update(w: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=1)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i in range(out_w.shape[0]): - out_w[i] = w[i] - for i in range(spike.shape[0]): - if spike[i]: - for k in range(indptr[i], indptr[i + 1]): - j = indices[k] - out_w[k] = min(max(out_w[k] + trace[j], w_min0), w_max0) -@ti.kernel -def _gpu_csr_on_pre_update(w: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - spike: ti.types.ndarray(ndim=1), - trace: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - out_w: ti.types.ndarray(ndim=1)): - trace0 = trace[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - for i in range(out_w.shape[0]): - out_w[i] = w[i] - for i in range(spike.shape[0]): - if spike[i]: - for k in range(indptr[i], indptr[i + 1]): - j = indices[k] - out_w[k] = min(max(out_w[k] + trace[j], w_min0), w_max0) - - -csr_on_pre_update_prim = bm.XLACustomOp(cpu_kernel=_cpu_csr_on_pre_update, - gpu_kernel=_gpu_csr_on_pre_update) + +if ti is not None: + @ti.kernel + def _csr_on_pre_update( + old_w: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + indices: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + indptr: ti.types.ndarray(ndim=1), # vector with shape of (num_pre + 1) + spike: ti.types.ndarray(ndim=1), # vector with shape of (num_pre,) + trace: ti.types.ndarray(ndim=1), # vector with shape of (num_post,) + w_min: ti.types.ndarray(ndim=1), # scalar + w_max: ti.types.ndarray(ndim=1), # scalar + out_w: ti.types.ndarray(ndim=1) # vector with shape of (num_syn) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] + num_pre = spike.shape[0] + for i_pre in range(num_pre): + if spike[i_pre]: + for i_syn in range(indptr[i_pre], indptr[i_pre + 1]): + out_w[i_syn] = min(max(old_w[i_syn] + trace[indices[i_syn]], w_min0), w_max0) + else: + for i_syn in range(indptr[i_pre], indptr[i_pre + 1]): + out_w[i_syn] = old_w[i_syn] + + + csr_on_pre_update_prim = bm.XLACustomOp(cpu_kernel=_csr_on_pre_update, gpu_kernel=_csr_on_pre_update) + + + @ti.kernel + def _coo_on_pre_update( + old_w: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + pre_ids: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + post_ids: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + pre_spike: ti.types.ndarray(ndim=1), # vector with shape of (num_pre,) + post_trace: ti.types.ndarray(ndim=1), # vector with shape of (num_post,) + w_min: ti.types.ndarray(ndim=1), # scalar + w_max: ti.types.ndarray(ndim=1), # scalar + out_w: ti.types.ndarray(ndim=1) # vector with shape of (num_syn) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] 
+ num_syn = old_w.shape[0] + for i_syn in range(num_syn): + if pre_spike[pre_ids[i_syn]]: # pre spike + out_w[i_syn] = min(max(old_w[i_syn] + post_trace[post_ids[i_syn]], w_min0), w_max0) + else: + out_w[i_syn] = old_w[i_syn] + + + coo_on_pre_update_prim = bm.XLACustomOp(cpu_kernel=_coo_on_pre_update, gpu_kernel=_coo_on_pre_update) + + + @ti.kernel + def _coo_on_post_update( + old_w: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + pre_ids: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + post_ids: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + post_spike: ti.types.ndarray(ndim=1), # vector with shape of (num_pre,) + pre_trace: ti.types.ndarray(ndim=1), # vector with shape of (num_post,) + w_min: ti.types.ndarray(ndim=1), # scalar + w_max: ti.types.ndarray(ndim=1), # scalar + out_w: ti.types.ndarray(ndim=1) # vector with shape of (num_syn) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] + num_syn = old_w.shape[0] + for i_syn in range(num_syn): + if post_spike[post_ids[i_syn]]: # pre spike + out_w[i_syn] = min(max(old_w[i_syn] + pre_trace[pre_ids[i_syn]], w_min0), w_max0) + else: + out_w[i_syn] = old_w[i_syn] + + + coo_on_post_update_prim = bm.XLACustomOp(cpu_kernel=_coo_on_post_update, gpu_kernel=_coo_on_post_update) + + + # @numba.njit(nogil=True, fastmath=True, parallel=False) + # def _cpu_csc_on_pre_update(w, post_ids, indptr, w_ids, spike, trace, w_min, w_max, out_w): + # out_w[:] = w + # w_min = w_min[()] + # w_max = w_max[()] + # for i in numba.prange(spike.shape[0]): # post id + # if spike[i]: + # for k in range(indptr[i], indptr[i + 1]): + # j = post_ids[k] # pre id + # l = w_ids[k] # syn id + # out_w[l] = np.minimum(np.maximum(out_w[l] + trace[j], w_min), w_max) + + @ti.kernel + def _csc_on_post_update( + old_w: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + indices: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + indptr: ti.types.ndarray(ndim=1), # vector with shape of (num_post + 1) + w_ids: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + post_spike: ti.types.ndarray(ndim=1), # vector with shape of (num_post,) + pre_trace: ti.types.ndarray(ndim=1), # vector with shape of (num_pre,) + w_min: ti.types.ndarray(ndim=1), # scalar + w_max: ti.types.ndarray(ndim=1), # scalar + out_w: ti.types.ndarray(ndim=1), # vector with shape of (num_syn) + ): + w_min0 = w_min[0] + w_max0 = w_max[0] + num_post = post_spike.shape[0] + for i_post in range(num_post): + if post_spike[i_post]: + for k in range(indptr[i_post], indptr[i_post + 1]): + i_syn = w_ids[k] # syn id + out_w[i_syn] = min(max(old_w[i_syn] + pre_trace[indices[k]], w_min0), w_max0) + else: + for k in range(indptr[i_post], indptr[i_post + 1]): + i_syn = w_ids[k] # syn id + out_w[i_syn] = old_w[i_syn] + + + csc_on_post_update_prim = bm.XLACustomOp(cpu_kernel=_csc_on_post_update, gpu_kernel=_csc_on_post_update) + + +else: + csr_on_pre_update_prim = None + coo_on_pre_update_prim = None + csc_on_post_update_prim = None def csr_on_pre_update(w, indices, indptr, spike, trace, w_min=None, w_max=None): + if csr_on_pre_update_prim is None: + raise PackageMissingError.by_purpose('taichi', 'customized operators') + if w_min is None: w_min = -np.inf if w_max is None: w_max = np.inf - trace = jnp.atleast_1d(trace) w_min = jnp.atleast_1d(w_min) w_max = jnp.atleast_1d(w_max) return csr_on_pre_update_prim(w, indices, indptr, spike, trace, w_min, w_max, outs=[jax.ShapeDtypeStruct(w.shape, w.dtype)])[0] -@numba.njit(nogil=True, fastmath=True, parallel=False) -def 
_cpu_csc_on_pre_update(w, post_ids, indptr, w_ids, spike, trace, w_min, w_max, out_w): - out_w[:] = w - w_min = w_min[()] - w_max = w_max[()] - for i in numba.prange(spike.shape[0]): # post id - if spike[i]: - for k in range(indptr[i], indptr[i + 1]): - j = post_ids[k] # pre id - l = w_ids[k] # syn id - out_w[l] = np.minimum(np.maximum(out_w[l] + trace[j], w_min), w_max) +def coo_on_pre_update(w, pre_ids, post_ids, spike, trace, w_min=None, w_max=None): + if coo_on_pre_update_prim is None: + raise PackageMissingError.by_purpose('taichi', 'customized operators') -csc_on_pre_update_prim = bm.XLACustomOp(_cpu_csc_on_pre_update) - - -def csc_on_post_update(w, post_ids, indptr, w_ids, spike, trace, w_min=None, w_max=None): if w_min is None: w_min = -np.inf if w_max is None: w_max = np.inf - return csc_on_pre_update_prim(w, post_ids, indptr, w_ids, spike, trace, w_min, w_max, + w_min = jnp.atleast_1d(w_min) + w_max = jnp.atleast_1d(w_max) + return coo_on_pre_update_prim(w, pre_ids, post_ids, spike, trace, w_min, w_max, outs=[jax.ShapeDtypeStruct(w.shape, w.dtype)])[0] +def csc_on_post_update(w, post_ids, indptr, w_ids, post_spike, pre_trace, w_min=None, w_max=None): + if csc_on_post_update_prim is None: + raise PackageMissingError.by_purpose('taichi', 'customized operators') + + if w_min is None: + w_min = -np.inf + if w_max is None: + w_max = np.inf + w_min = jnp.atleast_1d(w_min) + w_max = jnp.atleast_1d(w_max) + return csc_on_post_update_prim(w, post_ids, indptr, w_ids, post_spike, pre_trace, w_min, w_max, + outs=[jax.ShapeDtypeStruct(w.shape, w.dtype)])[0] + class CSCLinear(Layer): r"""Synaptic matrix multiplication with CSC sparse computation. diff --git a/brainpy/_src/dnn/tests/test_linear.py b/brainpy/_src/dnn/tests/test_linear.py index 7fc89526c..422f161f1 100644 --- a/brainpy/_src/dnn/tests/test_linear.py +++ b/brainpy/_src/dnn/tests/test_linear.py @@ -1,217 +1,223 @@ -import brainpy as bp -from absl.testing import parameterized -from absl.testing import absltest -import brainpy.math as bm - - -class TestLinear(parameterized.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - bm.random.seed() - - @parameterized.product( - size=[(10,), - (20, 10), - (5, 8, 10)], - num_out=[20, 10, 5] - ) - def test_Dense1(self, size, num_out): - bm.random.seed() - f = bp.dnn.Linear(10, num_out) - x = bm.random.random(size) - y = f(x) - self.assertTrue(y.shape == size[:-1] + (num_out,)) - bm.clear_buffer_memory() - - @parameterized.product( - size=[(10,), - (20, 10), - (5, 8, 10)], - ) - def test_Identity(self, size): - bm.random.seed() - f = bp.dnn.Identity() - x = bm.random.random(size) - y = f(x) - self.assertTrue(y.shape == size) - bm.clear_buffer_memory() - - def test_AllToAll1(self): - bm.random.seed() - with bm.environment(mode=bm.BatchingMode()): - f = bp.dnn.AllToAll(10, 20, weight=.1, include_self=True) - x = bm.random.random((8, 10)) - y = f(x) - expected = bm.sum(x, axis=1, keepdims=True) * 0.1 - self.assertTrue(bm.allclose(y, expected)) - - with bm.environment(mode=bm.NonBatchingMode()): - f = bp.dnn.AllToAll(10, 20, weight=.1, include_self=True) - x = bm.random.random((10,)) - y = f(x) - expected = bm.sum(x, keepdims=True) * 0.1 - self.assertTrue(bm.allclose(y, expected)) - bm.clear_buffer_memory() - - def test_OneToOne(self): - bm.random.seed() - with bm.environment(mode=bm.BatchingMode()): - f = bp.dnn.OneToOne(10, weight=.1) - x = bm.random.random((8, 10)) - y = f(x) - expected = x * 0.1 - self.assertTrue(bm.allclose(y, expected)) - - with 
bm.environment(mode=bm.NonBatchingMode()): - f = bp.dnn.OneToOne(10, weight=.1) - x = bm.random.random((10,)) - y = f(x) - expected = x * 0.1 - self.assertTrue(bm.allclose(y, expected)) - bm.clear_buffer_memory() - - @parameterized.product( - conn=[ - # bp.conn.FixedProb(0.1, pre=100, post=100), - bp.conn.GridFour(pre=100, post=100), - bp.conn.GaussianProb(0.1, pre=100, post=100), - ] - ) - def test_MaskedLinear(self, conn): - bm.random.seed() - bm.random.DEFAULT.seed(123) - f = bp.dnn.MaskedLinear(conn, weight=bp.init.XavierNormal(seed=123)) - x = bm.random.random((16, 100)) - y = f(x) - self.assertTrue(y.shape == (16, 100)) - bm.clear_buffer_memory() - - @parameterized.product( - conn=[ - bp.conn.FixedProb(0.1, pre=100, post=100), - bp.conn.GridFour(pre=100, post=100), - bp.conn.GaussianProb(0.1, pre=100, post=100), - ] - ) - def test_CSRLinear(self, conn): - bm.random.seed() - f = bp.dnn.CSRLinear(conn, weight=bp.init.Normal()) - x = bm.random.random((16, 100)) - y = f(x) - self.assertTrue(y.shape == (16, 100)) - - x = bm.random.random((100,)) - y = f(x) - self.assertTrue(y.shape == (100,)) - bm.clear_buffer_memory() - - - @parameterized.product( - conn=[ - bp.conn.FixedProb(0.1, pre=100, post=100), - bp.conn.GridFour(pre=100, post=100), - bp.conn.GaussianProb(0.1, pre=100, post=100), - ] - ) - def test_EventCSRLinear(self,conn): - bm.random.seed() - f=bp.layers.EventCSRLinear(conn,weight=bp.init.Normal()) - x = bm.random.random((16, 100)) - y = f(x) - self.assertTrue(y.shape == (16, 100)) - x = bm.random.random((100,)) - y = f(x) - self.assertTrue(y.shape == (100,)) - bm.clear_buffer_memory() - - - @parameterized.product( - prob=[0.01, 0.05, 0.5], - weight=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def test_JitFPHomoLinear(self, prob, weight, shape): - bm.random.seed() - f = bp.dnn.JitFPHomoLinear(100, 200, prob, weight, seed=123) - x = bm.random.random(shape + (100,)) - y = f(x) - self.assertTrue(y.shape == shape + (200,)) - bm.clear_buffer_memory() - - @parameterized.product( - prob=[0.01, 0.05, 0.5], - w_low=[-0.01, -0.01], - w_high=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def test_JitFPUniformLinear(self, prob, w_low, w_high, shape): - bm.random.seed() - f = bp.dnn.JitFPUniformLinear(100, 200, prob, w_low, w_high, seed=123) - x = bm.random.random(shape + (100,)) - y = f(x) - self.assertTrue(y.shape == shape + (200,)) - bm.clear_buffer_memory() - - @parameterized.product( - prob=[0.01, 0.1, 0.5], - w_mu=[-0.01, -0.01], - w_sigma=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def test_JitFPNormalLinear(self, prob, w_mu, w_sigma, shape): - bm.random.seed() - f = bp.dnn.JitFPNormalLinear(100, 200, prob, w_mu, w_sigma, seed=123) - x = bm.random.random(shape + (100,)) - y = f(x) - self.assertTrue(y.shape == shape + (200,)) - bm.clear_buffer_memory() - - @parameterized.product( - prob=[0.01, 0.05, 0.5], - weight=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def test_EventJitFPHomoLinear(self, prob, weight, shape): - bm.random.seed() - f = bp.dnn.EventJitFPHomoLinear(100, 200, prob, weight, seed=123) - y = f(bm.random.random(shape + (100,)) < 0.1) - self.assertTrue(y.shape == shape + (200,)) - - y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) - self.assertTrue(y2.shape == shape + (200,)) - bm.clear_buffer_memory() - - @parameterized.product( - prob=[0.01, 0.05, 0.5], - w_low=[-0.01, -0.01], - w_high=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def 
test_EventJitFPUniformLinear(self, prob, w_low, w_high, shape): - bm.random.seed() - f = bp.dnn.EventJitFPUniformLinear(100, 200, prob, w_low, w_high, seed=123) - y = f(bm.random.random(shape + (100,)) < 0.1) - self.assertTrue(y.shape == shape + (200,)) - - y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) - self.assertTrue(y2.shape == shape + (200,)) - bm.clear_buffer_memory() - - @parameterized.product( - prob=[0.01, 0.1, 0.5], - w_mu=[-0.01, -0.01], - w_sigma=[0.01, 0.01], - shape=[(), (10,), (10, 20), (10, 20, 25)] - ) - def test_EventJitFPNormalLinear(self, prob, w_mu, w_sigma, shape): - bm.random.seed() - f = bp.dnn.EventJitFPNormalLinear(100, 200, prob, w_mu, w_sigma, seed=123) - y = f(bm.random.random(shape + (100,)) < 0.1) - self.assertTrue(y.shape == shape + (200,)) - - y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) - self.assertTrue(y2.shape == shape + (200,)) - bm.clear_buffer_memory() - -if __name__ == '__main__': - absltest.main() +import pytest +from absl.testing import absltest +from absl.testing import parameterized + +import brainpy as bp +import brainpy.math as bm + +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + + +class TestLinear(parameterized.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + bm.random.seed() + + @parameterized.product( + size=[(10,), + (20, 10), + (5, 8, 10)], + num_out=[20, 10, 5] + ) + def test_Dense1(self, size, num_out): + bm.random.seed() + f = bp.dnn.Linear(10, num_out) + x = bm.random.random(size) + y = f(x) + self.assertTrue(y.shape == size[:-1] + (num_out,)) + bm.clear_buffer_memory() + + @parameterized.product( + size=[(10,), + (20, 10), + (5, 8, 10)], + ) + def test_Identity(self, size): + bm.random.seed() + f = bp.dnn.Identity() + x = bm.random.random(size) + y = f(x) + self.assertTrue(y.shape == size) + bm.clear_buffer_memory() + + def test_AllToAll1(self): + bm.random.seed() + with bm.environment(mode=bm.BatchingMode()): + f = bp.dnn.AllToAll(10, 20, weight=.1, include_self=True) + x = bm.random.random((8, 10)) + y = f(x) + expected = bm.sum(x, axis=1, keepdims=True) * 0.1 + self.assertTrue(bm.allclose(y, expected)) + + with bm.environment(mode=bm.NonBatchingMode()): + f = bp.dnn.AllToAll(10, 20, weight=.1, include_self=True) + x = bm.random.random((10,)) + y = f(x) + expected = bm.sum(x, keepdims=True) * 0.1 + self.assertTrue(bm.allclose(y, expected)) + bm.clear_buffer_memory() + + def test_OneToOne(self): + bm.random.seed() + with bm.environment(mode=bm.BatchingMode()): + f = bp.dnn.OneToOne(10, weight=.1) + x = bm.random.random((8, 10)) + y = f(x) + expected = x * 0.1 + self.assertTrue(bm.allclose(y, expected)) + + with bm.environment(mode=bm.NonBatchingMode()): + f = bp.dnn.OneToOne(10, weight=.1) + x = bm.random.random((10,)) + y = f(x) + expected = x * 0.1 + self.assertTrue(bm.allclose(y, expected)) + bm.clear_buffer_memory() + + @parameterized.product( + conn=[ + # bp.conn.FixedProb(0.1, pre=100, post=100), + bp.conn.GridFour(pre=100, post=100), + bp.conn.GaussianProb(0.1, pre=100, post=100), + ] + ) + def test_MaskedLinear(self, conn): + bm.random.seed() + bm.random.DEFAULT.seed(123) + f = bp.dnn.MaskedLinear(conn, weight=bp.init.XavierNormal(seed=123)) + x = bm.random.random((16, 100)) + y = f(x) + self.assertTrue(y.shape == (16, 100)) + bm.clear_buffer_memory() + + @parameterized.product( + conn=[ + bp.conn.FixedProb(0.1, 
pre=100, post=100), + bp.conn.GridFour(pre=100, post=100), + bp.conn.GaussianProb(0.1, pre=100, post=100), + ] + ) + def test_CSRLinear(self, conn): + bm.random.seed() + f = bp.dnn.CSRLinear(conn, weight=bp.init.Normal()) + x = bm.random.random((16, 100)) + y = f(x) + self.assertTrue(y.shape == (16, 100)) + + x = bm.random.random((100,)) + y = f(x) + self.assertTrue(y.shape == (100,)) + bm.clear_buffer_memory() + + @parameterized.product( + conn=[ + bp.conn.FixedProb(0.1, pre=100, post=100), + bp.conn.GridFour(pre=100, post=100), + bp.conn.GaussianProb(0.1, pre=100, post=100), + ] + ) + def test_EventCSRLinear(self, conn): + bm.random.seed() + f = bp.layers.EventCSRLinear(conn, weight=bp.init.Normal()) + x = bm.random.random((16, 100)) + y = f(x) + self.assertTrue(y.shape == (16, 100)) + x = bm.random.random((100,)) + y = f(x) + self.assertTrue(y.shape == (100,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.05, 0.5], + weight=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_JitFPHomoLinear(self, prob, weight, shape): + bm.random.seed() + f = bp.dnn.JitFPHomoLinear(100, 200, prob, weight, seed=123) + x = bm.random.random(shape + (100,)) + y = f(x) + self.assertTrue(y.shape == shape + (200,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.05, 0.5], + w_low=[-0.01, -0.01], + w_high=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_JitFPUniformLinear(self, prob, w_low, w_high, shape): + bm.random.seed() + f = bp.dnn.JitFPUniformLinear(100, 200, prob, w_low, w_high, seed=123) + x = bm.random.random(shape + (100,)) + y = f(x) + self.assertTrue(y.shape == shape + (200,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.1, 0.5], + w_mu=[-0.01, -0.01], + w_sigma=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_JitFPNormalLinear(self, prob, w_mu, w_sigma, shape): + bm.random.seed() + f = bp.dnn.JitFPNormalLinear(100, 200, prob, w_mu, w_sigma, seed=123) + x = bm.random.random(shape + (100,)) + y = f(x) + self.assertTrue(y.shape == shape + (200,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.05, 0.5], + weight=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_EventJitFPHomoLinear(self, prob, weight, shape): + bm.random.seed() + f = bp.dnn.EventJitFPHomoLinear(100, 200, prob, weight, seed=123) + y = f(bm.random.random(shape + (100,)) < 0.1) + self.assertTrue(y.shape == shape + (200,)) + + y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) + self.assertTrue(y2.shape == shape + (200,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.05, 0.5], + w_low=[-0.01, -0.01], + w_high=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_EventJitFPUniformLinear(self, prob, w_low, w_high, shape): + bm.random.seed() + f = bp.dnn.EventJitFPUniformLinear(100, 200, prob, w_low, w_high, seed=123) + y = f(bm.random.random(shape + (100,)) < 0.1) + self.assertTrue(y.shape == shape + (200,)) + + y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) + self.assertTrue(y2.shape == shape + (200,)) + bm.clear_buffer_memory() + + @parameterized.product( + prob=[0.01, 0.1, 0.5], + w_mu=[-0.01, -0.01], + w_sigma=[0.01, 0.01], + shape=[(), (10,), (10, 20), (10, 20, 25)] + ) + def test_EventJitFPNormalLinear(self, prob, w_mu, w_sigma, shape): + bm.random.seed() + f = bp.dnn.EventJitFPNormalLinear(100, 200, prob, w_mu, w_sigma, seed=123) + y = f(bm.random.random(shape 
+ (100,)) < 0.1) + self.assertTrue(y.shape == shape + (200,)) + + y2 = f(bm.as_jax(bm.random.random(shape + (100,)) < 0.1, dtype=float)) + self.assertTrue(y2.shape == shape + (200,)) + bm.clear_buffer_memory() + + +if __name__ == '__main__': + absltest.main() diff --git a/brainpy/_src/dnn/tests/test_mode.py b/brainpy/_src/dnn/tests/test_mode.py index 0d754976f..f0c67da12 100644 --- a/brainpy/_src/dnn/tests/test_mode.py +++ b/brainpy/_src/dnn/tests/test_mode.py @@ -1,800 +1,807 @@ -import brainpy.math as bm -from absl.testing import parameterized -from absl.testing import absltest -import brainpy as bp - - -class Test_Conv(parameterized.TestCase): - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_Conv1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 3) - layer = bp.dnn.Conv1d(in_channels=3, - out_channels=4, - kernel_size=5, - mode=mode) - output = layer(input) - bm.clear_buffer_memory() - - def test_Conv1d_NonBatching(self): - bm.random.seed() - input = bm.random.randn(50, 3) - layer = bp.dnn.Conv1d(in_channels=3, - out_channels=4, - kernel_size=5, - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_Conv2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 50, 3) - layer = bp.dnn.Conv2d(in_channels=3, - out_channels=4, - kernel_size=(5, 5), - mode=mode) - output = layer(input) - bm.clear_buffer_memory() - - def test_Conv2_NonBatching(self): - bm.random.seed() - input = bm.random.randn(10, 10, 3) - layer = bp.dnn.Conv2d(in_channels=3, - out_channels=4, - kernel_size=(5, 5), - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - bm.clear_buffer_memory() - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_Conv3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 50, 50, 3) - layer = bp.dnn.Conv3d(in_channels=3, - out_channels=4, - kernel_size=(5, 5, 5), - mode=mode) - output = layer(input) - bm.clear_buffer_memory() - - def test_Conv3_NonBatching(self): - bm.random.seed() - input = bm.random.randn(10, 10, 10, 3) - layer = bp.dnn.Conv3d(in_channels=3, - out_channels=4, - kernel_size=(5, 5, 5), - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_ConvTranspose1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 3) - layer = bp.dnn.ConvTranspose1d(in_channels=3, - out_channels=4, - kernel_size=5, - mode=mode - ) - output = layer(input) - bm.clear_buffer_memory() - - def test_ConvTranspose1d_NonBatching(self): - bm.random.seed() - input = bm.random.randn(10, 3) - layer = bp.dnn.ConvTranspose1d(in_channels=3, - out_channels=4, - kernel_size=5, - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_ConvTranspose2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 50, 3) - layer = bp.dnn.ConvTranspose2d(in_channels=3, - out_channels=4, - kernel_size=(5, 5), - mode=mode - ) - output = layer(input) - bm.clear_buffer_memory() - - def 
test_ConvTranspose2d_NonBatching(self): - bm.random.seed() - input = bm.random.randn(10, 10, 3) - layer = bp.dnn.ConvTranspose2d(in_channels=3, - out_channels=4, - kernel_size=(5, 5), - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), ] - ) - def test_ConvTranspose3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 50, 50, 50, 3) - layer = bp.dnn.ConvTranspose3d(in_channels=3, - out_channels=4, - kernel_size=(5, 5, 5), - mode=mode - ) - output = layer(input) - bm.clear_buffer_memory() - - def test_ConvTranspose3d_NonBatching(self): - bm.random.seed() - input = bm.random.randn(10, 10, 10, 3) - layer = bp.dnn.ConvTranspose3d(in_channels=3, - out_channels=4, - kernel_size=(5, 5, 5), - mode=bm.NonBatchingMode()) - output = layer(input) - bm.clear_buffer_memory() - - -class TestPool(parameterized.TestCase): - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MaxPool(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.MaxPool(kernel_size=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MinPool(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.MaxPool(kernel_size=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AvgPool(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.AvgPool(kernel_size=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AvgPool1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 4) - layer = bp.dnn.AvgPool1d(kernel_size=3, - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AvgPool2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.AvgPool2d(kernel_size=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AvgPool3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 5, 4) - layer = bp.dnn.AvgPool3d(kernel_size=(3, 3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - 
@parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MaxPool1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 4) - layer = bp.dnn.MaxPool1d(kernel_size=3, - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MaxPool2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.MaxPool2d(kernel_size=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MaxPool3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 5, 4) - layer = bp.dnn.MaxPool3d(kernel_size=(3, 3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveAvgPool1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 4) - layer = bp.dnn.AdaptiveAvgPool1d(target_shape=3, - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveAvgPool2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.AdaptiveAvgPool2d(target_shape=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveAvgPool3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 5, 4) - layer = bp.dnn.AdaptiveAvgPool3d(target_shape=(3, 3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveMaxPool1d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 4) - layer = bp.dnn.AdaptiveMaxPool1d(target_shape=3, - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveMaxPool2d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 4) - layer = bp.dnn.AdaptiveMaxPool2d(target_shape=(3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - 
bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AdaptiveMaxPool3d(self, mode): - bm.random.seed() - input = bm.random.randn(10, 5, 5, 5, 4) - layer = bp.dnn.AdaptiveMaxPool3d(target_shape=(3, 3, 3), - channel_axis=-1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - -class Test_Dropout(parameterized.TestCase): - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_Dropout(self, mode): - bp.share.save(fit=False) - bm.random.seed() - input = bm.random.randn(10, 5, 5, 5, 4) - layer = bp.dnn.Dropout(prob=0.2, - mode=mode) - output = layer(input) - - -class Test_function(parameterized.TestCase): - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_Flatten(self, mode): - bm.random.seed() - layer = bp.dnn.Flatten(mode=mode) - input = bm.random.randn(10, 5, 5, 5, 4) - output = layer(input) - - -class Test_linear(parameterized.TestCase): - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_linear(self, mode): - bm.random.seed() - input = bm.random.randn(10, 9, 8, 7) - layer = bp.dnn.Linear(num_in=7, - num_out=6, - mode=mode) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_AllToAll(self, mode): - bm.random.seed() - input = bm.random.randn(10, 10) - layer = bp.dnn.AllToAll(num_pre=10, - num_post=20, - weight=0.1, - mode=mode) - if mode in [bm.NonBatchingMode()]: - for i in input: - output = layer(i) - else: - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_OneToOne(self, mode): - bm.random.seed() - input = bm.random.randn(10, 10) - layer = bp.dnn.OneToOne(num=10, - weight=0.1, - mode=mode) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_MaskedLinear(self, mode): - bm.random.seed() - input = bm.random.randn(100, 100) - layer = bp.dnn.MaskedLinear(conn=bp.conn.FixedProb(0.1, pre=100, post=100), - weight=0.1, - mode=mode) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_CSRLinear(self, mode): - bm.random.seed() - input = bm.random.randn(100, 100) - layer = bp.dnn.CSRLinear(conn=bp.conn.FixedProb(0.1, pre=100, post=100), - weight=0.1, - mode=mode) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_EventCSRLinear(self, mode): - bm.random.seed() - input = bm.random.randn(100, 100) - layer = bp.dnn.EventCSRLinear(conn=bp.conn.FixedProb(0.1, pre=100, post=100), - weight=0.1, - mode=mode) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_JitFPHomoLinear(self, mode): - 
bm.random.seed() - layer = bp.dnn.JitFPHomoLinear(num_in=100, - num_out=200, - prob=0.1, - weight=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_JitFPUniformLinear(self, mode): - bm.random.seed() - layer = bp.dnn.JitFPUniformLinear(num_in=100, - num_out=200, - prob=0.1, - w_low=-0.01, - w_high=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_JitFPNormalLinear(self, mode): - bm.random.seed() - layer = bp.dnn.JitFPNormalLinear(num_in=100, - num_out=200, - prob=0.1, - w_mu=-0.01, - w_sigma=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_EventJitFPHomoLinear(self, mode): - bm.random.seed() - layer = bp.dnn.EventJitFPHomoLinear(num_in=100, - num_out=200, - prob=0.1, - weight=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_EventJitFPNormalLinear(self, mode): - bm.random.seed() - layer = bp.dnn.EventJitFPNormalLinear(num_in=100, - num_out=200, - prob=0.1, - w_mu=-0.01, - w_sigma=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()] - ) - def test_EventJitFPUniformLinear(self, mode): - bm.random.seed() - layer = bp.dnn.EventJitFPUniformLinear(num_in=100, - num_out=200, - prob=0.1, - w_low=-0.01, - w_high=0.01, - seed=100, - mode=mode) - input = bm.random.randn(10, 100) - output = layer(input) - - -class Test_Normalization(parameterized.TestCase): - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10)], - fit=[True, False] - ) - def test_BatchNorm1d(self, fit, mode): - bm.random.seed() - bp.share.save(fit=fit) - layer = bp.dnn.BatchNorm1d(num_features=100, - mode=mode, - affine=False) - input = bm.random.randn(10, 5, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10)], - fit=[True, False] - ) - def test_BatchNorm2d(self, fit, mode): - bm.random.seed() - bp.share.save(fit=fit) - layer = bp.dnn.BatchNorm2d(num_features=100, - mode=mode, - affine=False) - input = bm.random.randn(10, 5, 6, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10)], - fit=[True, False] - ) - def test_BatchNorm3d(self, fit, mode): - bm.random.seed() - bp.share.save(fit=fit) - layer = bp.dnn.BatchNorm3d(num_features=100, - mode=mode, - affine=False) - input = bm.random.randn(10, 5, 6, 7, 100) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()], - ) - def test_LayerNorm(self, 
mode): - bm.random.seed() - layer = bp.dnn.LayerNorm(normalized_shape=3, - mode=mode, - elementwise_affine=False - ) - input = bm.random.randn(10, 5, 3) - outout = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()], - ) - def test_GroupNorm(self, mode): - bm.random.seed() - layer = bp.dnn.GroupNorm(num_groups=2, - num_channels=6, - affine=False, - mode=mode - ) - input = bm.random.randn(20, 10, 10, 6) - output = layer(input) - - @parameterized.product( - mode=[bm.TrainingMode(), - bm.TrainingMode(10), - bm.BatchingMode(), - bm.BatchingMode(10), - bm.NonBatchingMode()], - ) - def test_InstanceNorm(self, mode): - bm.random.seed() - layer = bp.dnn.InstanceNorm(num_channels=6, - affine=False, - mode=mode - ) - input = bm.random.randn(20, 10, 10, 6) - output = layer(input) - - -if __name__ == '__main__': - absltest.main() +import pytest +from absl.testing import absltest +from absl.testing import parameterized + +import brainpy as bp +import brainpy.math as bm + +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + + +class Test_Conv(parameterized.TestCase): + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_Conv1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 3) + layer = bp.dnn.Conv1d(in_channels=3, + out_channels=4, + kernel_size=5, + mode=mode) + output = layer(input) + bm.clear_buffer_memory() + + def test_Conv1d_NonBatching(self): + bm.random.seed() + input = bm.random.randn(50, 3) + layer = bp.dnn.Conv1d(in_channels=3, + out_channels=4, + kernel_size=5, + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_Conv2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 50, 3) + layer = bp.dnn.Conv2d(in_channels=3, + out_channels=4, + kernel_size=(5, 5), + mode=mode) + output = layer(input) + bm.clear_buffer_memory() + + def test_Conv2_NonBatching(self): + bm.random.seed() + input = bm.random.randn(10, 10, 3) + layer = bp.dnn.Conv2d(in_channels=3, + out_channels=4, + kernel_size=(5, 5), + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + bm.clear_buffer_memory() + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_Conv3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 50, 50, 3) + layer = bp.dnn.Conv3d(in_channels=3, + out_channels=4, + kernel_size=(5, 5, 5), + mode=mode) + output = layer(input) + bm.clear_buffer_memory() + + def test_Conv3_NonBatching(self): + bm.random.seed() + input = bm.random.randn(10, 10, 10, 3) + layer = bp.dnn.Conv3d(in_channels=3, + out_channels=4, + kernel_size=(5, 5, 5), + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_ConvTranspose1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 3) + layer = bp.dnn.ConvTranspose1d(in_channels=3, + out_channels=4, + kernel_size=5, + mode=mode + ) + output = layer(input) + bm.clear_buffer_memory() + + def 
test_ConvTranspose1d_NonBatching(self): + bm.random.seed() + input = bm.random.randn(10, 3) + layer = bp.dnn.ConvTranspose1d(in_channels=3, + out_channels=4, + kernel_size=5, + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_ConvTranspose2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 50, 3) + layer = bp.dnn.ConvTranspose2d(in_channels=3, + out_channels=4, + kernel_size=(5, 5), + mode=mode + ) + output = layer(input) + bm.clear_buffer_memory() + + def test_ConvTranspose2d_NonBatching(self): + bm.random.seed() + input = bm.random.randn(10, 10, 3) + layer = bp.dnn.ConvTranspose2d(in_channels=3, + out_channels=4, + kernel_size=(5, 5), + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), ] + ) + def test_ConvTranspose3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 50, 50, 50, 3) + layer = bp.dnn.ConvTranspose3d(in_channels=3, + out_channels=4, + kernel_size=(5, 5, 5), + mode=mode + ) + output = layer(input) + bm.clear_buffer_memory() + + def test_ConvTranspose3d_NonBatching(self): + bm.random.seed() + input = bm.random.randn(10, 10, 10, 3) + layer = bp.dnn.ConvTranspose3d(in_channels=3, + out_channels=4, + kernel_size=(5, 5, 5), + mode=bm.NonBatchingMode()) + output = layer(input) + bm.clear_buffer_memory() + + +class TestPool(parameterized.TestCase): + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MaxPool(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.MaxPool(kernel_size=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MinPool(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.MinPool(kernel_size=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AvgPool(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.AvgPool(kernel_size=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AvgPool1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 4) + layer = bp.dnn.AvgPool1d(kernel_size=3, + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AvgPool2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 
4) + layer = bp.dnn.AvgPool2d(kernel_size=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AvgPool3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 5, 4) + layer = bp.dnn.AvgPool3d(kernel_size=(3, 3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MaxPool1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 4) + layer = bp.dnn.MaxPool1d(kernel_size=3, + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MaxPool2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.MaxPool2d(kernel_size=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MaxPool3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 5, 4) + layer = bp.dnn.MaxPool3d(kernel_size=(3, 3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveAvgPool1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 4) + layer = bp.dnn.AdaptiveAvgPool1d(target_shape=3, + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveAvgPool2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.AdaptiveAvgPool2d(target_shape=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveAvgPool3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 5, 4) + layer = bp.dnn.AdaptiveAvgPool3d(target_shape=(3, 3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveMaxPool1d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 4) + layer = bp.dnn.AdaptiveMaxPool1d(target_shape=3, + channel_axis=-1, 
+ mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveMaxPool2d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 4) + layer = bp.dnn.AdaptiveMaxPool2d(target_shape=(3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AdaptiveMaxPool3d(self, mode): + bm.random.seed() + input = bm.random.randn(10, 5, 5, 5, 4) + layer = bp.dnn.AdaptiveMaxPool3d(target_shape=(3, 3, 3), + channel_axis=-1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + +class Test_Dropout(parameterized.TestCase): + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_Dropout(self, mode): + bp.share.save(fit=False) + bm.random.seed() + input = bm.random.randn(10, 5, 5, 5, 4) + layer = bp.dnn.Dropout(prob=0.2, + mode=mode) + output = layer(input) + + +class Test_function(parameterized.TestCase): + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_Flatten(self, mode): + bm.random.seed() + layer = bp.dnn.Flatten(mode=mode) + input = bm.random.randn(10, 5, 5, 5, 4) + output = layer(input) + + +class Test_linear(parameterized.TestCase): + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_linear(self, mode): + bm.random.seed() + input = bm.random.randn(10, 9, 8, 7) + layer = bp.dnn.Linear(num_in=7, + num_out=6, + mode=mode) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_AllToAll(self, mode): + bm.random.seed() + input = bm.random.randn(10, 10) + layer = bp.dnn.AllToAll(num_pre=10, + num_post=20, + weight=0.1, + mode=mode) + if mode in [bm.NonBatchingMode()]: + for i in input: + output = layer(i) + else: + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_OneToOne(self, mode): + bm.random.seed() + input = bm.random.randn(10, 10) + layer = bp.dnn.OneToOne(num=10, + weight=0.1, + mode=mode) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_MaskedLinear(self, mode): + bm.random.seed() + input = bm.random.randn(100, 100) + layer = bp.dnn.MaskedLinear(conn=bp.conn.FixedProb(0.1, pre=100, post=100), + weight=0.1, + mode=mode) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_CSRLinear(self, mode): + bm.random.seed() + input = bm.random.randn(100, 100) + layer = bp.dnn.CSRLinear(conn=bp.conn.FixedProb(0.1, pre=100, 
post=100), + weight=0.1, + mode=mode) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_EventCSRLinear(self, mode): + bm.random.seed() + input = bm.random.randn(100, 100) + layer = bp.dnn.EventCSRLinear(conn=bp.conn.FixedProb(0.1, pre=100, post=100), + weight=0.1, + mode=mode) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_JitFPHomoLinear(self, mode): + bm.random.seed() + layer = bp.dnn.JitFPHomoLinear(num_in=100, + num_out=200, + prob=0.1, + weight=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_JitFPUniformLinear(self, mode): + bm.random.seed() + layer = bp.dnn.JitFPUniformLinear(num_in=100, + num_out=200, + prob=0.1, + w_low=-0.01, + w_high=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_JitFPNormalLinear(self, mode): + bm.random.seed() + layer = bp.dnn.JitFPNormalLinear(num_in=100, + num_out=200, + prob=0.1, + w_mu=-0.01, + w_sigma=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_EventJitFPHomoLinear(self, mode): + bm.random.seed() + layer = bp.dnn.EventJitFPHomoLinear(num_in=100, + num_out=200, + prob=0.1, + weight=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_EventJitFPNormalLinear(self, mode): + bm.random.seed() + layer = bp.dnn.EventJitFPNormalLinear(num_in=100, + num_out=200, + prob=0.1, + w_mu=-0.01, + w_sigma=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()] + ) + def test_EventJitFPUniformLinear(self, mode): + bm.random.seed() + layer = bp.dnn.EventJitFPUniformLinear(num_in=100, + num_out=200, + prob=0.1, + w_low=-0.01, + w_high=0.01, + seed=100, + mode=mode) + input = bm.random.randn(10, 100) + output = layer(input) + + +class Test_Normalization(parameterized.TestCase): + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10)], + fit=[True, False] + ) + def test_BatchNorm1d(self, fit, mode): + bm.random.seed() + bp.share.save(fit=fit) + layer = bp.dnn.BatchNorm1d(num_features=100, + mode=mode, + affine=False) + input = bm.random.randn(10, 5, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10)], + fit=[True, False] + ) + def test_BatchNorm2d(self, fit, mode): + bm.random.seed() + bp.share.save(fit=fit) + layer = bp.dnn.BatchNorm2d(num_features=100, + mode=mode, + 
affine=False) + input = bm.random.randn(10, 5, 6, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10)], + fit=[True, False] + ) + def test_BatchNorm3d(self, fit, mode): + bm.random.seed() + bp.share.save(fit=fit) + layer = bp.dnn.BatchNorm3d(num_features=100, + mode=mode, + affine=False) + input = bm.random.randn(10, 5, 6, 7, 100) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()], + ) + def test_LayerNorm(self, mode): + bm.random.seed() + layer = bp.dnn.LayerNorm(normalized_shape=3, + mode=mode, + elementwise_affine=False + ) + input = bm.random.randn(10, 5, 3) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()], + ) + def test_GroupNorm(self, mode): + bm.random.seed() + layer = bp.dnn.GroupNorm(num_groups=2, + num_channels=6, + affine=False, + mode=mode + ) + input = bm.random.randn(20, 10, 10, 6) + output = layer(input) + + @parameterized.product( + mode=[bm.TrainingMode(), + bm.TrainingMode(10), + bm.BatchingMode(), + bm.BatchingMode(10), + bm.NonBatchingMode()], + ) + def test_InstanceNorm(self, mode): + bm.random.seed() + layer = bp.dnn.InstanceNorm(num_channels=6, + affine=False, + mode=mode + ) + input = bm.random.randn(20, 10, 10, 6) + output = layer(input) + + +if __name__ == '__main__': + absltest.main() diff --git a/brainpy/_src/dyn/projections/tests/test_STDP.py b/brainpy/_src/dyn/projections/tests/test_STDP.py index b8884f327..18d9d9dc9 100644 --- a/brainpy/_src/dyn/projections/tests/test_STDP.py +++ b/brainpy/_src/dyn/projections/tests/test_STDP.py @@ -1,120 +1,127 @@ -# -*- coding: utf-8 -*- - - -import numpy as np -from absl.testing import parameterized - -import brainpy as bp -import brainpy.math as bm - - -class Test_STDP(parameterized.TestCase): - - @parameterized.product( - comm_method=['dense', 'csr', 'masked_linear', 'all2all', 'one2one'], - delay=[None, 0., 2.], - syn_model=['exp', 'dual_exp', 'ampa'], - out_model=['cuba', 'coba', 'mg'] - ) - def test_STDP(self, comm_method, delay, syn_model, out_model): - bm.random.seed() - - class STDPNet(bp.DynamicalSystem): - def __init__(self, num_pre, num_post): - super().__init__() - self.pre = bp.dyn.LifRef(num_pre) - self.post = bp.dyn.LifRef(num_post) - - if comm_method == 'all2all': - comm = bp.dnn.AllToAll(self.pre.num, self.post.num, weight=bp.init.Uniform(.1, 0.1)) - elif comm_method == 'csr': - if syn_model == 'exp': - comm = bp.dnn.EventCSRLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), - weight=bp.init.Uniform(0., 0.1)) - else: - comm = bp.dnn.CSRLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), - weight=bp.init.Uniform(0., 0.1)) - elif comm_method == 'masked_linear': - comm = bp.dnn.MaskedLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), - weight=bp.init.Uniform(0., 0.1)) - elif comm_method == 'dense': - comm = bp.dnn.Dense(self.pre.num, self.post.num, W_initializer=bp.init.Uniform(.1, 0.1)) - elif comm_method == 'one2one': - comm = bp.dnn.OneToOne(self.pre.num, weight=bp.init.Uniform(.1, 0.1)) - else: - raise ValueError - - if syn_model == 'exp': - syn = bp.dyn.Expon.desc(self.post.varshape, tau=5.) 
- elif syn_model == 'dual_exp': - syn = bp.dyn.DualExpon.desc(self.post.varshape) - elif syn_model == 'dual_exp_v2': - syn = bp.dyn.DualExponV2.desc(self.post.varshape) - elif syn_model == 'ampa': - syn = bp.dyn.AMPA.desc(self.post.varshape) - else: - raise ValueError - - if out_model == 'cuba': - out = bp.dyn.CUBA.desc() - elif out_model == 'coba': - out = bp.dyn.COBA.desc(E=0.) - elif out_model == 'mg': - out = bp.dyn.MgBlock.desc(E=0.) - else: - raise ValueError - - self.syn = bp.dyn.STDP_Song2000( - pre=self.pre, - delay=delay, - comm=comm, - syn=syn, - out=out, - post=self.post, - tau_s=16.8, - tau_t=33.7, - A1=0.96, - A2=0.53, - W_min=0., - W_max=1. - ) - - def update(self, I_pre, I_post): - self.syn() - self.pre(I_pre) - self.post(I_post) - conductance = self.syn.refs['syn'].g - Apre = self.syn.refs['pre_trace'].g - Apost = self.syn.refs['post_trace'].g - current = self.post.sum_current_inputs(self.post.V) - if comm_method == 'dense': - w = self.syn.comm.W.flatten() - else: - w = self.syn.comm.weight.flatten() - return self.pre.spike, self.post.spike, conductance, Apre, Apost, current, w - - duration = 300. - I_pre = bp.inputs.section_input([0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0], - [5, 15, 15, 15, 15, 15, 100, 15, 15, 15, 15, 15, duration - 255]) - I_post = bp.inputs.section_input([0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0], - [10, 15, 15, 15, 15, 15, 90, 15, 15, 15, 15, 15, duration - 250]) - - net = STDPNet(1, 1) - - def run(i, I_pre, I_post): - pre_spike, post_spike, g, Apre, Apost, current, W = net.step_run(i, I_pre, I_post) - return pre_spike, post_spike, g, Apre, Apost, current, W - - indices = np.arange(int(duration / bm.dt)) - pre_spike, post_spike, g, Apre, Apost, current, W = bm.for_loop(run, [indices, I_pre, I_post]) - - # import matplotlib.pyplot as plt - # fig, gs = bp.visualize.get_figure(4, 1, 3, 10) - # bp.visualize.line_plot(indices, g, ax=fig.add_subplot(gs[0, 0])) - # bp.visualize.line_plot(indices, Apre, ax=fig.add_subplot(gs[1, 0])) - # bp.visualize.line_plot(indices, Apost, ax=fig.add_subplot(gs[2, 0])) - # bp.visualize.line_plot(indices, W, ax=fig.add_subplot(gs[3, 0])) - # plt.show() - - bm.clear_buffer_memory() - +# -*- coding: utf-8 -*- + +import numpy as np +import pytest +from absl.testing import parameterized + +import brainpy as bp +import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + +bm.set_platform('cpu') + + +class Test_STDP(parameterized.TestCase): + + @parameterized.product( + comm_method=['csr', 'dense', 'masked_linear', 'all2all', 'one2one'], + delay=[None, 0., 2.], + syn_model=['exp', 'dual_exp', 'ampa'], + out_model=['cuba', 'coba', 'mg'] + ) + def test_STDP(self, comm_method, delay, syn_model, out_model): + bm.random.seed() + + class STDPNet(bp.DynamicalSystem): + def __init__(self, num_pre, num_post): + super().__init__() + self.pre = bp.dyn.LifRef(num_pre) + self.post = bp.dyn.LifRef(num_post) + + if comm_method == 'all2all': + comm = bp.dnn.AllToAll(self.pre.num, self.post.num, weight=bp.init.Uniform(.1, 0.1)) + elif comm_method == 'csr': + if syn_model == 'exp': + comm = bp.dnn.EventCSRLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), + weight=bp.init.Uniform(0., 0.1)) + else: + comm = bp.dnn.CSRLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), + weight=bp.init.Uniform(0., 0.1)) + elif comm_method == 'masked_linear': + comm = 
bp.dnn.MaskedLinear(bp.conn.FixedProb(1, pre=self.pre.num, post=self.post.num), + weight=bp.init.Uniform(0., 0.1)) + elif comm_method == 'dense': + comm = bp.dnn.Dense(self.pre.num, self.post.num, W_initializer=bp.init.Uniform(.1, 0.1)) + elif comm_method == 'one2one': + comm = bp.dnn.OneToOne(self.pre.num, weight=bp.init.Uniform(.1, 0.1)) + else: + raise ValueError + + if syn_model == 'exp': + syn = bp.dyn.Expon.desc(self.post.varshape, tau=5.) + elif syn_model == 'dual_exp': + syn = bp.dyn.DualExpon.desc(self.post.varshape) + elif syn_model == 'dual_exp_v2': + syn = bp.dyn.DualExponV2.desc(self.post.varshape) + elif syn_model == 'ampa': + syn = bp.dyn.AMPA.desc(self.post.varshape) + else: + raise ValueError + + if out_model == 'cuba': + out = bp.dyn.CUBA.desc() + elif out_model == 'coba': + out = bp.dyn.COBA.desc(E=0.) + elif out_model == 'mg': + out = bp.dyn.MgBlock.desc(E=0.) + else: + raise ValueError + + self.syn = bp.dyn.STDP_Song2000( + pre=self.pre, + delay=delay, + comm=comm, + syn=syn, + out=out, + post=self.post, + tau_s=16.8, + tau_t=33.7, + A1=0.96, + A2=0.53, + W_min=0., + W_max=1. + ) + + def update(self, I_pre, I_post): + self.syn() + self.pre(I_pre) + self.post(I_post) + conductance = self.syn.refs['syn'].g + Apre = self.syn.refs['pre_trace'].g + Apost = self.syn.refs['post_trace'].g + current = self.post.sum_current_inputs(self.post.V) + if comm_method == 'dense': + w = self.syn.comm.W.flatten() + else: + w = self.syn.comm.weight.flatten() + return self.pre.spike, self.post.spike, conductance, Apre, Apost, current, w + + duration = 300. + I_pre = bp.inputs.section_input([0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0], + [5, 15, 15, 15, 15, 15, 100, 15, 15, 15, 15, 15, + duration - 255]) + I_post = bp.inputs.section_input([0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0, 30, 0], + [10, 15, 15, 15, 15, 15, 90, 15, 15, 15, 15, 15, + duration - 250]) + + net = STDPNet(1, 1) + + def run(i, I_pre, I_post): + pre_spike, post_spike, g, Apre, Apost, current, W = net.step_run(i, I_pre, I_post) + return pre_spike, post_spike, g, Apre, Apost, current, W + + indices = np.arange(int(duration / bm.dt)) + pre_spike, post_spike, g, Apre, Apost, current, W = bm.for_loop(run, [indices, I_pre, I_post]) + + # import matplotlib.pyplot as plt + # fig, gs = bp.visualize.get_figure(4, 1, 3, 10) + # bp.visualize.line_plot(indices, g, ax=fig.add_subplot(gs[0, 0])) + # bp.visualize.line_plot(indices, Apre, ax=fig.add_subplot(gs[1, 0])) + # bp.visualize.line_plot(indices, Apost, ax=fig.add_subplot(gs[2, 0])) + # bp.visualize.line_plot(indices, W, ax=fig.add_subplot(gs[3, 0])) + # plt.show() + + bm.clear_buffer_memory() diff --git a/brainpy/_src/dyn/projections/tests/test_aligns.py b/brainpy/_src/dyn/projections/tests/test_aligns.py index 90500a26f..eec2c9459 100644 --- a/brainpy/_src/dyn/projections/tests/test_aligns.py +++ b/brainpy/_src/dyn/projections/tests/test_aligns.py @@ -1,439 +1,444 @@ -import matplotlib.pyplot as plt -import numpy as np - -import brainpy as bp -import brainpy.math as bm - -neu_pars = dict(V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - - -def test_ProjAlignPreMg1(): - class EICOBA_PreAlign(bp.DynamicalSystem): - def __init__(self, scale=1., inp=20., delay=None): - super().__init__() - - self.inp = inp - self.E = bp.dyn.LifRefLTC(int(3200 * scale), **neu_pars) - self.I = bp.dyn.LifRefLTC(int(800 * scale), **neu_pars) - - prob = 80 / (4000 * scale) - - self.E2I = bp.dyn.FullProjAlignPreSDMg( - pre=self.E, - 
syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), - delay=delay, - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.I.num), 0.6), - out=bp.dyn.COBA(E=0.), - post=self.I, - ) - self.E2E = bp.dyn.FullProjAlignPreSDMg( - pre=self.E, - syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), - delay=delay, - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.E.num), 0.6), - out=bp.dyn.COBA(E=0.), - post=self.E, - ) - self.I2E = bp.dyn.FullProjAlignPreSDMg( - pre=self.I, - syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), - delay=delay, - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.E.num), 6.7), - out=bp.dyn.COBA(E=-80.), - post=self.E, - ) - self.I2I = bp.dyn.FullProjAlignPreSDMg( - pre=self.I, - syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), - delay=delay, - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.I.num), 6.7), - out=bp.dyn.COBA(E=-80.), - post=self.I, - ) - - def update(self): - self.E2I() - self.I2I() - self.I2E() - self.E2E() - self.E(self.inp) - self.I(self.inp) - return self.E.spike.value - - net = EICOBA_PreAlign(0.5) - indices = np.arange(400) - spks = bm.for_loop(net.step_run, indices) - bp.visualize.raster_plot(indices * bm.dt, spks, show=True) - - net = EICOBA_PreAlign(0.5, delay=1.) - indices = np.arange(400) - spks = bm.for_loop(net.step_run, indices) - bp.visualize.raster_plot(indices * bm.dt, spks, show=True) - - plt.close() - bm.clear_buffer_memory() - - -def test_ProjAlignPostMg2(): - class EICOBA_PostAlign(bp.DynamicalSystem): - def __init__(self, scale, inp=20., ltc=True, delay=None): - super().__init__() - self.inp = inp - - if ltc: - self.E = bp.dyn.LifRefLTC(int(3200 * scale), **neu_pars) - self.I = bp.dyn.LifRefLTC(int(800 * scale), **neu_pars) - else: - self.E = bp.dyn.LifRef(int(3200 * scale), **neu_pars) - self.I = bp.dyn.LifRef(int(800 * scale), **neu_pars) - - prob = 80 / (4000 * scale) - - self.E2E = bp.dyn.FullProjAlignPostMg( - pre=self.E, - delay=delay, - comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.E.num), 0.6), - syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), - out=bp.dyn.COBA.desc(E=0.), - post=self.E, - ) - self.E2I = bp.dyn.FullProjAlignPostMg( - pre=self.E, - delay=delay, - comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.I.num), 0.6), - syn=bp.dyn.Expon.desc(self.I.varshape, tau=5.), - out=bp.dyn.COBA.desc(E=0.), - post=self.I, - ) - self.I2E = bp.dyn.FullProjAlignPostMg( - pre=self.I, - delay=delay, - comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.E.num), 6.7), - syn=bp.dyn.Expon.desc(self.E.varshape, tau=10.), - out=bp.dyn.COBA.desc(E=-80.), - post=self.E, - ) - self.I2I = bp.dyn.FullProjAlignPostMg( - pre=self.I, - delay=delay, - comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.I.num), 6.7), - syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), - out=bp.dyn.COBA.desc(E=-80.), - post=self.I, - ) - - def update(self): - self.E2I() - self.I2I() - self.I2E() - self.E2E() - self.E(self.inp) - self.I(self.inp) - return self.E.spike.value - - net = EICOBA_PostAlign(0.5) - indices = np.arange(400) - spks = bm.for_loop(net.step_run, indices) - bp.visualize.raster_plot(indices * bm.dt, spks, show=True) - - net = EICOBA_PostAlign(0.5, delay=1.) 
- indices = np.arange(400) - spks = bm.for_loop(net.step_run, indices) - bp.visualize.raster_plot(indices * bm.dt, spks, show=True) - - net = EICOBA_PostAlign(0.5, ltc=False) - indices = np.arange(400) - spks = bm.for_loop(net.step_run, indices) - bp.visualize.raster_plot(indices * bm.dt, spks, show=True) - - plt.close() - bm.clear_buffer_memory() - - -def test_ProjAlignPost1(): - class EINet(bp.DynSysGroup): - def __init__(self, scale=1.): - super().__init__() - num = int(4000 * scale) - self.num_exc = int(3200 * scale) - self.num_inh = num - self.num_exc - prob = 80 / num - - self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.delay = bp.VarDelay(self.N.spike, entries={'I': None}) - self.E = bp.dyn.HalfProjAlignPost(comm=bp.dnn.EventJitFPHomoLinear(self.num_exc, num, prob=prob, weight=0.6), - syn=bp.dyn.Expon(size=num, tau=5.), - out=bp.dyn.COBA(E=0.), - post=self.N) - self.I = bp.dyn.HalfProjAlignPost(comm=bp.dnn.EventJitFPHomoLinear(self.num_inh, num, prob=prob, weight=6.7), - syn=bp.dyn.Expon(size=num, tau=10.), - out=bp.dyn.COBA(E=-80.), - post=self.N) - - def update(self, input): - spk = self.delay.at('I') - self.E(spk[:self.num_exc]) - self.I(spk[self.num_exc:]) - self.delay(self.N(input)) - return self.N.spike.value - - model = EINet(0.5) - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - bm.clear_buffer_memory() - plt.close() - - -def test_ProjAlignPost2(): - class EINet(bp.DynSysGroup): - def __init__(self, scale, delay=None): - super().__init__() - ne, ni = int(3200 * scale), int(800 * scale) - p = 80 / (ne + ni) - - self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.E2E = bp.dyn.FullProjAlignPost(pre=self.E, - delay=delay, - comm=bp.dnn.EventJitFPHomoLinear(ne, ne, prob=p, weight=0.6), - syn=bp.dyn.Expon(size=ne, tau=5.), - out=bp.dyn.COBA(E=0.), - post=self.E) - self.E2I = bp.dyn.FullProjAlignPost(pre=self.E, - delay=delay, - comm=bp.dnn.EventJitFPHomoLinear(ne, ni, prob=p, weight=0.6), - syn=bp.dyn.Expon(size=ni, tau=5.), - out=bp.dyn.COBA(E=0.), - post=self.I) - self.I2E = bp.dyn.FullProjAlignPost(pre=self.I, - delay=delay, - comm=bp.dnn.EventJitFPHomoLinear(ni, ne, prob=p, weight=6.7), - syn=bp.dyn.Expon(size=ne, tau=10.), - out=bp.dyn.COBA(E=-80.), - post=self.E) - self.I2I = bp.dyn.FullProjAlignPost(pre=self.I, - delay=delay, - comm=bp.dnn.EventJitFPHomoLinear(ni, ni, prob=p, weight=6.7), - syn=bp.dyn.Expon(size=ni, tau=10.), - out=bp.dyn.COBA(E=-80.), - post=self.I) - - def update(self, inp): - self.E2E() - self.E2I() - self.I2E() - self.I2I() - self.E(inp) - self.I(inp) - return self.E.spike - - model = EINet(0.5, delay=1.) 
- indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - model = EINet(0.5, delay=None) - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - bm.clear_buffer_memory() - plt.close() - - -def test_VanillaProj(): - class EINet(bp.DynSysGroup): - def __init__(self, scale=0.5): - super().__init__() - num = int(4000 * scale) - self.ne = int(3200 * scale) - self.ni = num - self.ne - p = 80 / num - - self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.delay = bp.VarDelay(self.N.spike, entries={'I': None}) - self.syn1 = bp.dyn.Expon(size=self.ne, tau=5.) - self.syn2 = bp.dyn.Expon(size=self.ni, tau=10.) - self.E = bp.dyn.VanillaProj(comm=bp.dnn.JitFPHomoLinear(self.ne, num, prob=p, weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.N) - self.I = bp.dyn.VanillaProj(comm=bp.dnn.JitFPHomoLinear(self.ni, num, prob=p, weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.N) - - def update(self, input): - spk = self.delay.at('I') - self.E(self.syn1(spk[:self.ne])) - self.I(self.syn2(spk[self.ne:])) - self.delay(self.N(input)) - return self.N.spike.value - - model = EINet() - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - bm.clear_buffer_memory() - plt.close() - - -def test_ProjAlignPreMg1_v2(): - class EINet(bp.DynSysGroup): - def __init__(self, scale=1., delay=None): - super().__init__() - ne, ni = int(3200 * scale), int(800 * scale) - p = 80 / (4000 * scale) - self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.E2E = bp.dyn.FullProjAlignPreSDMg(pre=self.E, - syn=bp.dyn.Expon.desc(size=ne, tau=5.), - delay=delay, - comm=bp.dnn.JitFPHomoLinear(ne, ne, prob=p, weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.E) - self.E2I = bp.dyn.FullProjAlignPreSDMg(pre=self.E, - syn=bp.dyn.Expon.desc(size=ne, tau=5.), - delay=delay, - comm=bp.dnn.JitFPHomoLinear(ne, ni, prob=p, weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.I) - self.I2E = bp.dyn.FullProjAlignPreSDMg(pre=self.I, - syn=bp.dyn.Expon.desc(size=ni, tau=10.), - delay=delay, - comm=bp.dnn.JitFPHomoLinear(ni, ne, prob=p, weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.E) - self.I2I = bp.dyn.FullProjAlignPreSDMg(pre=self.I, - syn=bp.dyn.Expon.desc(size=ni, tau=10.), - delay=delay, - comm=bp.dnn.JitFPHomoLinear(ni, ni, prob=p, weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.I) - - def update(self, inp): - self.E2E() - self.E2I() - self.I2E() - self.I2I() - self.E(inp) - self.I(inp) - return self.E.spike - - model = EINet() - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - model = EINet(delay=1.) 
- indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - bm.clear_buffer_memory() - plt.close() - - -def test_ProjAlignPreMg2(): - class EINet(bp.DynSysGroup): - def __init__(self, scale=1., delay=None): - super().__init__() - ne, ni = int(3200 * scale), int(800 * scale) - p = 80 / (4000 * scale) - self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 2.)) - self.E2E = bp.dyn.FullProjAlignPreDSMg(pre=self.E, - delay=delay, - syn=bp.dyn.Expon.desc(size=ne, tau=5.), - comm=bp.dnn.JitFPHomoLinear(ne, ne, prob=p, weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.E) - self.E2I = bp.dyn.FullProjAlignPreDSMg(pre=self.E, - delay=delay, - syn=bp.dyn.Expon.desc(size=ne, tau=5.), - comm=bp.dnn.JitFPHomoLinear(ne, ni, prob=p, weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.I) - self.I2E = bp.dyn.FullProjAlignPreDSMg(pre=self.I, - delay=delay, - syn=bp.dyn.Expon.desc(size=ni, tau=10.), - comm=bp.dnn.JitFPHomoLinear(ni, ne, prob=p, weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.E) - self.I2I = bp.dyn.FullProjAlignPreDSMg(pre=self.I, - delay=delay, - syn=bp.dyn.Expon.desc(size=ni, tau=10.), - comm=bp.dnn.JitFPHomoLinear(ni, ni, prob=p, weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.I) - - def update(self, inp): - self.E2E() - self.E2I() - self.I2E() - self.I2I() - self.E(inp) - self.I(inp) - return self.E.spike - - model = EINet(scale=0.2, delay=None) - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - model = EINet(scale=0.2, delay=1.) - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) - bp.visualize.raster_plot(indices, spks, show=True) - - bm.clear_buffer_memory() - plt.close() - - -def test_vanalla_proj_v2(): - class EINet(bp.DynSysGroup): - def __init__(self, scale=1.): - super().__init__() - num = int(4000 * scale) - self.ne = int(3200 * scale) - self.ni = num - self.ne - p = 80 / num - - self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., - V_initializer=bp.init.Normal(-55., 1.)) - self.delay = bp.VarDelay(self.N.spike, entries={'delay': 2}) - self.syn1 = bp.dyn.Expon(size=self.ne, tau=5.) - self.syn2 = bp.dyn.Expon(size=self.ni, tau=10.) 
- self.E = bp.dyn.VanillaProj( - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(p, pre=self.ne, post=num), weight=0.6), - out=bp.dyn.COBA(E=0.), - post=self.N - ) - self.I = bp.dyn.VanillaProj( - comm=bp.dnn.CSRLinear(bp.conn.FixedProb(p, pre=self.ni, post=num), weight=6.7), - out=bp.dyn.COBA(E=-80.), - post=self.N - ) - - def update(self, input): - spk = self.delay.at('delay') - self.E(self.syn1(spk[:self.ne])) - self.I(self.syn2(spk[self.ne:])) - self.delay(self.N(input)) - return self.N.spike.value - - model = EINet() - indices = bm.arange(400) - spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices, progress_bar=True) - bp.visualize.raster_plot(indices, spks, show=True) - plt.close() - bm.clear_buffer_memory() - +import pytest +import matplotlib.pyplot as plt +import numpy as np + +import brainpy as bp +import brainpy.math as bm + +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + +neu_pars = dict(V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + + +def test_ProjAlignPreMg1(): + class EICOBA_PreAlign(bp.DynamicalSystem): + def __init__(self, scale=1., inp=20., delay=None): + super().__init__() + + self.inp = inp + self.E = bp.dyn.LifRefLTC(int(3200 * scale), **neu_pars) + self.I = bp.dyn.LifRefLTC(int(800 * scale), **neu_pars) + + prob = 80 / (4000 * scale) + + self.E2I = bp.dyn.FullProjAlignPreSDMg( + pre=self.E, + syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), + delay=delay, + comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.I.num), 0.6), + out=bp.dyn.COBA(E=0.), + post=self.I, + ) + self.E2E = bp.dyn.FullProjAlignPreSDMg( + pre=self.E, + syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), + delay=delay, + comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.E.num), 0.6), + out=bp.dyn.COBA(E=0.), + post=self.E, + ) + self.I2E = bp.dyn.FullProjAlignPreSDMg( + pre=self.I, + syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), + delay=delay, + comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.E.num), 6.7), + out=bp.dyn.COBA(E=-80.), + post=self.E, + ) + self.I2I = bp.dyn.FullProjAlignPreSDMg( + pre=self.I, + syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), + delay=delay, + comm=bp.dnn.CSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.I.num), 6.7), + out=bp.dyn.COBA(E=-80.), + post=self.I, + ) + + def update(self): + self.E2I() + self.I2I() + self.I2E() + self.E2E() + self.E(self.inp) + self.I(self.inp) + return self.E.spike.value + + net = EICOBA_PreAlign(0.5) + indices = np.arange(400) + spks = bm.for_loop(net.step_run, indices) + bp.visualize.raster_plot(indices * bm.dt, spks, show=True) + + net = EICOBA_PreAlign(0.5, delay=1.) 
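+  # Second pass (delay=1.): re-run the pre-aligned E/I network with a transmission delay to cover the delayed-projection path.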
+ indices = np.arange(400) + spks = bm.for_loop(net.step_run, indices) + bp.visualize.raster_plot(indices * bm.dt, spks, show=True) + + plt.close() + bm.clear_buffer_memory() + + +def test_ProjAlignPostMg2(): + class EICOBA_PostAlign(bp.DynamicalSystem): + def __init__(self, scale, inp=20., ltc=True, delay=None): + super().__init__() + self.inp = inp + + if ltc: + self.E = bp.dyn.LifRefLTC(int(3200 * scale), **neu_pars) + self.I = bp.dyn.LifRefLTC(int(800 * scale), **neu_pars) + else: + self.E = bp.dyn.LifRef(int(3200 * scale), **neu_pars) + self.I = bp.dyn.LifRef(int(800 * scale), **neu_pars) + + prob = 80 / (4000 * scale) + + self.E2E = bp.dyn.FullProjAlignPostMg( + pre=self.E, + delay=delay, + comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.E.num), 0.6), + syn=bp.dyn.Expon.desc(self.E.varshape, tau=5.), + out=bp.dyn.COBA.desc(E=0.), + post=self.E, + ) + self.E2I = bp.dyn.FullProjAlignPostMg( + pre=self.E, + delay=delay, + comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.E.num, post=self.I.num), 0.6), + syn=bp.dyn.Expon.desc(self.I.varshape, tau=5.), + out=bp.dyn.COBA.desc(E=0.), + post=self.I, + ) + self.I2E = bp.dyn.FullProjAlignPostMg( + pre=self.I, + delay=delay, + comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.E.num), 6.7), + syn=bp.dyn.Expon.desc(self.E.varshape, tau=10.), + out=bp.dyn.COBA.desc(E=-80.), + post=self.E, + ) + self.I2I = bp.dyn.FullProjAlignPostMg( + pre=self.I, + delay=delay, + comm=bp.dnn.EventCSRLinear(bp.conn.FixedProb(prob, pre=self.I.num, post=self.I.num), 6.7), + syn=bp.dyn.Expon.desc(self.I.varshape, tau=10.), + out=bp.dyn.COBA.desc(E=-80.), + post=self.I, + ) + + def update(self): + self.E2I() + self.I2I() + self.I2E() + self.E2E() + self.E(self.inp) + self.I(self.inp) + return self.E.spike.value + + net = EICOBA_PostAlign(0.5) + indices = np.arange(400) + spks = bm.for_loop(net.step_run, indices) + bp.visualize.raster_plot(indices * bm.dt, spks, show=True) + + net = EICOBA_PostAlign(0.5, delay=1.) 
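All of these tests drive a network the same way: `step_run(i)` advances the model by one step at time `i * bm.dt`, `bm.for_loop` scans it over the step indices, and the resulting spike matrix feeds the raster plot. A condensed, runnable sketch of that driver (a single LifRefLTC population with a constant 20. input, mirroring the calls above):

    import numpy as np
    import brainpy as bp
    import brainpy.math as bm

    net = bp.dyn.LifRefLTC(10, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5.)
    indices = np.arange(400)                                     # 400 steps of bm.dt each
    spks = bm.for_loop(lambda i: net.step_run(i, 20.), indices)  # (400, 10) spike array
    # multiplying step indices by bm.dt puts physical time on the x-axis
    bp.visualize.raster_plot(indices * bm.dt, spks, show=True)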
+ indices = np.arange(400) + spks = bm.for_loop(net.step_run, indices) + bp.visualize.raster_plot(indices * bm.dt, spks, show=True) + + net = EICOBA_PostAlign(0.5, ltc=False) + indices = np.arange(400) + spks = bm.for_loop(net.step_run, indices) + bp.visualize.raster_plot(indices * bm.dt, spks, show=True) + + plt.close() + bm.clear_buffer_memory() + + +def test_ProjAlignPost1(): + class EINet(bp.DynSysGroup): + def __init__(self, scale=1.): + super().__init__() + num = int(4000 * scale) + self.num_exc = int(3200 * scale) + self.num_inh = num - self.num_exc + prob = 80 / num + + self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.delay = bp.VarDelay(self.N.spike, entries={'I': None}) + self.E = bp.dyn.HalfProjAlignPost(comm=bp.dnn.EventJitFPHomoLinear(self.num_exc, num, prob=prob, weight=0.6), + syn=bp.dyn.Expon(size=num, tau=5.), + out=bp.dyn.COBA(E=0.), + post=self.N) + self.I = bp.dyn.HalfProjAlignPost(comm=bp.dnn.EventJitFPHomoLinear(self.num_inh, num, prob=prob, weight=6.7), + syn=bp.dyn.Expon(size=num, tau=10.), + out=bp.dyn.COBA(E=-80.), + post=self.N) + + def update(self, input): + spk = self.delay.at('I') + self.E(spk[:self.num_exc]) + self.I(spk[self.num_exc:]) + self.delay(self.N(input)) + return self.N.spike.value + + model = EINet(0.5) + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + bm.clear_buffer_memory() + plt.close() + + +def test_ProjAlignPost2(): + class EINet(bp.DynSysGroup): + def __init__(self, scale, delay=None): + super().__init__() + ne, ni = int(3200 * scale), int(800 * scale) + p = 80 / (ne + ni) + + self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.E2E = bp.dyn.FullProjAlignPost(pre=self.E, + delay=delay, + comm=bp.dnn.EventJitFPHomoLinear(ne, ne, prob=p, weight=0.6), + syn=bp.dyn.Expon(size=ne, tau=5.), + out=bp.dyn.COBA(E=0.), + post=self.E) + self.E2I = bp.dyn.FullProjAlignPost(pre=self.E, + delay=delay, + comm=bp.dnn.EventJitFPHomoLinear(ne, ni, prob=p, weight=0.6), + syn=bp.dyn.Expon(size=ni, tau=5.), + out=bp.dyn.COBA(E=0.), + post=self.I) + self.I2E = bp.dyn.FullProjAlignPost(pre=self.I, + delay=delay, + comm=bp.dnn.EventJitFPHomoLinear(ni, ne, prob=p, weight=6.7), + syn=bp.dyn.Expon(size=ne, tau=10.), + out=bp.dyn.COBA(E=-80.), + post=self.E) + self.I2I = bp.dyn.FullProjAlignPost(pre=self.I, + delay=delay, + comm=bp.dnn.EventJitFPHomoLinear(ni, ni, prob=p, weight=6.7), + syn=bp.dyn.Expon(size=ni, tau=10.), + out=bp.dyn.COBA(E=-80.), + post=self.I) + + def update(self, inp): + self.E2E() + self.E2I() + self.I2E() + self.I2I() + self.E(inp) + self.I(inp) + return self.E.spike + + model = EINet(0.5, delay=1.) 
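Note the `.desc(...)` idiom running through these projections: the merging (`Mg`) variants take descriptors such as `bp.dyn.Expon.desc` and `bp.dyn.COBA.desc` rather than ready-made modules, so the projection controls when the post-synaptic dynamics are instantiated and can share one instance among projections requesting identical dynamics (our reading of the `Mg` naming; the sharing is not spelled out in this diff). By contrast, the non-merging `FullProjAlignPost` tests in the same file pass concrete `bp.dyn.Expon(...)` instances. A sketch of the distinction:

    import brainpy as bp

    syn_now = bp.dyn.Expon(size=800, tau=5.)        # concrete module, built immediately
    syn_desc = bp.dyn.Expon.desc(size=800, tau=5.)  # a recipe (class plus arguments) that
                                                    # the receiving projection instantiates,
                                                    # or reuses, on its own schedule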
+ indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + model = EINet(0.5, delay=None) + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + bm.clear_buffer_memory() + plt.close() + + +def test_VanillaProj(): + class EINet(bp.DynSysGroup): + def __init__(self, scale=0.5): + super().__init__() + num = int(4000 * scale) + self.ne = int(3200 * scale) + self.ni = num - self.ne + p = 80 / num + + self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.delay = bp.VarDelay(self.N.spike, entries={'I': None}) + self.syn1 = bp.dyn.Expon(size=self.ne, tau=5.) + self.syn2 = bp.dyn.Expon(size=self.ni, tau=10.) + self.E = bp.dyn.VanillaProj(comm=bp.dnn.JitFPHomoLinear(self.ne, num, prob=p, weight=0.6), + out=bp.dyn.COBA(E=0.), + post=self.N) + self.I = bp.dyn.VanillaProj(comm=bp.dnn.JitFPHomoLinear(self.ni, num, prob=p, weight=6.7), + out=bp.dyn.COBA(E=-80.), + post=self.N) + + def update(self, input): + spk = self.delay.at('I') + self.E(self.syn1(spk[:self.ne])) + self.I(self.syn2(spk[self.ne:])) + self.delay(self.N(input)) + return self.N.spike.value + + model = EINet() + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + bm.clear_buffer_memory() + plt.close() + + +def test_ProjAlignPreMg1_v2(): + class EINet(bp.DynSysGroup): + def __init__(self, scale=1., delay=None): + super().__init__() + ne, ni = int(3200 * scale), int(800 * scale) + p = 80 / (4000 * scale) + self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.E2E = bp.dyn.FullProjAlignPreSDMg(pre=self.E, + syn=bp.dyn.Expon.desc(size=ne, tau=5.), + delay=delay, + comm=bp.dnn.JitFPHomoLinear(ne, ne, prob=p, weight=0.6), + out=bp.dyn.COBA(E=0.), + post=self.E) + self.E2I = bp.dyn.FullProjAlignPreSDMg(pre=self.E, + syn=bp.dyn.Expon.desc(size=ne, tau=5.), + delay=delay, + comm=bp.dnn.JitFPHomoLinear(ne, ni, prob=p, weight=0.6), + out=bp.dyn.COBA(E=0.), + post=self.I) + self.I2E = bp.dyn.FullProjAlignPreSDMg(pre=self.I, + syn=bp.dyn.Expon.desc(size=ni, tau=10.), + delay=delay, + comm=bp.dnn.JitFPHomoLinear(ni, ne, prob=p, weight=6.7), + out=bp.dyn.COBA(E=-80.), + post=self.E) + self.I2I = bp.dyn.FullProjAlignPreSDMg(pre=self.I, + syn=bp.dyn.Expon.desc(size=ni, tau=10.), + delay=delay, + comm=bp.dnn.JitFPHomoLinear(ni, ni, prob=p, weight=6.7), + out=bp.dyn.COBA(E=-80.), + post=self.I) + + def update(self, inp): + self.E2E() + self.E2I() + self.I2E() + self.I2I() + self.E(inp) + self.I(inp) + return self.E.spike + + model = EINet() + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + model = EINet(delay=1.) 
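The `update` methods of the `VanillaProj` tests above encode a read-before-write delay protocol that is easy to miss: delayed spikes are read first, routed through the explicit synapse objects, and only then are the freshly generated spikes pushed back into the delay. The same step, annotated (names exactly as in `test_VanillaProj`):

    def update(self, inp):
      spk = self.delay.at('I')           # read spikes at the registered delay entry
      self.E(self.syn1(spk[:self.ne]))   # excitatory slice -> synapse -> projection
      self.I(self.syn2(spk[self.ne:]))   # inhibitory slice -> synapse -> projection
      self.delay(self.N(inp))            # step the neurons, then store the new spikes
      return self.N.spike.value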
+ indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + bm.clear_buffer_memory() + plt.close() + + +def test_ProjAlignPreMg2(): + class EINet(bp.DynSysGroup): + def __init__(self, scale=1., delay=None): + super().__init__() + ne, ni = int(3200 * scale), int(800 * scale) + p = 80 / (4000 * scale) + self.E = bp.dyn.LifRefLTC(ne, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.I = bp.dyn.LifRefLTC(ni, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 2.)) + self.E2E = bp.dyn.FullProjAlignPreDSMg(pre=self.E, + delay=delay, + syn=bp.dyn.Expon.desc(size=ne, tau=5.), + comm=bp.dnn.JitFPHomoLinear(ne, ne, prob=p, weight=0.6), + out=bp.dyn.COBA(E=0.), + post=self.E) + self.E2I = bp.dyn.FullProjAlignPreDSMg(pre=self.E, + delay=delay, + syn=bp.dyn.Expon.desc(size=ne, tau=5.), + comm=bp.dnn.JitFPHomoLinear(ne, ni, prob=p, weight=0.6), + out=bp.dyn.COBA(E=0.), + post=self.I) + self.I2E = bp.dyn.FullProjAlignPreDSMg(pre=self.I, + delay=delay, + syn=bp.dyn.Expon.desc(size=ni, tau=10.), + comm=bp.dnn.JitFPHomoLinear(ni, ne, prob=p, weight=6.7), + out=bp.dyn.COBA(E=-80.), + post=self.E) + self.I2I = bp.dyn.FullProjAlignPreDSMg(pre=self.I, + delay=delay, + syn=bp.dyn.Expon.desc(size=ni, tau=10.), + comm=bp.dnn.JitFPHomoLinear(ni, ni, prob=p, weight=6.7), + out=bp.dyn.COBA(E=-80.), + post=self.I) + + def update(self, inp): + self.E2E() + self.E2I() + self.I2E() + self.I2I() + self.E(inp) + self.I(inp) + return self.E.spike + + model = EINet(scale=0.2, delay=None) + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + model = EINet(scale=0.2, delay=1.) + indices = bm.arange(400) + spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices) + bp.visualize.raster_plot(indices, spks, show=True) + + bm.clear_buffer_memory() + plt.close() + + +def test_vanalla_proj_v2(): + class EINet(bp.DynSysGroup): + def __init__(self, scale=1.): + super().__init__() + num = int(4000 * scale) + self.ne = int(3200 * scale) + self.ni = num - self.ne + p = 80 / num + + self.N = bp.dyn.LifRefLTC(num, V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5., + V_initializer=bp.init.Normal(-55., 1.)) + self.delay = bp.VarDelay(self.N.spike, entries={'delay': 2}) + self.syn1 = bp.dyn.Expon(size=self.ne, tau=5.) + self.syn2 = bp.dyn.Expon(size=self.ni, tau=10.) 
+      self.E = bp.dyn.VanillaProj(
+        comm=bp.dnn.CSRLinear(bp.conn.FixedProb(p, pre=self.ne, post=num), weight=0.6),
+        out=bp.dyn.COBA(E=0.),
+        post=self.N
+      )
+      self.I = bp.dyn.VanillaProj(
+        comm=bp.dnn.CSRLinear(bp.conn.FixedProb(p, pre=self.ni, post=num), weight=6.7),
+        out=bp.dyn.COBA(E=-80.),
+        post=self.N
+      )
+
+    def update(self, input):
+      spk = self.delay.at('delay')
+      self.E(self.syn1(spk[:self.ne]))
+      self.I(self.syn2(spk[self.ne:]))
+      self.delay(self.N(input))
+      return self.N.spike.value
+
+  model = EINet()
+  indices = bm.arange(400)
+  spks = bm.for_loop(lambda i: model.step_run(i, 20.), indices, progress_bar=True)
+  bp.visualize.raster_plot(indices, spks, show=True)
+  plt.close()
+  bm.clear_buffer_memory()
+
diff --git a/brainpy/_src/dynold/synapses/tests/test_abstract_synapses.py b/brainpy/_src/dynold/synapses/tests/test_abstract_synapses.py
index badb60832..6db945ff2 100644
--- a/brainpy/_src/dynold/synapses/tests/test_abstract_synapses.py
+++ b/brainpy/_src/dynold/synapses/tests/test_abstract_synapses.py
@@ -24,9 +24,7 @@ def test_all2all_synapse(self, name, stp, mode):
       net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
 
       # run the simulation
-      runner = bp.DSRunner(net,
-                           monitors=['pre.V', 'syn.g', 'post.V'],
-                           inputs=('pre.input', 35.))
+      runner = bp.DSRunner(net, monitors=['pre.V', 'syn.g', 'post.V'], inputs=('pre.input', 35.))
       runner(10.)
 
       expected_shape = (100, 5)
@@ -123,4 +121,4 @@ def test_delta_synapse(self, post_ref_key, stp, mode):
         post_expected_shape = (mode.batch_size,) + post_expected_shape
     self.assertTupleEqual(runner.mon['pre.V'].shape, pre_expected_shape)
     self.assertTupleEqual(runner.mon['post.V'].shape, post_expected_shape)
-    bm.clear_buffer_memory()
+    bm.clear_buffer_memory()
\ No newline at end of file
diff --git a/brainpy/_src/dynold/synapses/tests/test_biological_synapses.py b/brainpy/_src/dynold/synapses/tests/test_biological_synapses.py
index 395868092..01a315261 100644
--- a/brainpy/_src/dynold/synapses/tests/test_biological_synapses.py
+++ b/brainpy/_src/dynold/synapses/tests/test_biological_synapses.py
@@ -1,103 +1,108 @@
-# -*- coding: utf-8 -*-
-
-
-from absl.testing import parameterized
-
-import brainpy as bp
-import brainpy.math as bm
-
-biological_models = [
-  bp.synapses.AMPA,
-  bp.synapses.GABAa,
-  bp.synapses.BioNMDA,
-]
-
-
-class Test_Biological_Synapse(parameterized.TestCase):
-  @parameterized.product(
-    synapse=biological_models,
-    delay_step=[None, 5, 1],
-    mode=[bm.NonBatchingMode(), bm.BatchingMode(5)],
-    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
-  )
-  def test_all2all_synapse(self, synapse, delay_step, mode, stp):
-    bm.random.seed()
-    with bm.environment(mode=mode):
-      pre_neu = bp.neurons.LIF(5)
-      post_neu = bp.neurons.LIF(5)
-      syn = synapse(pre_neu, post_neu, conn=bp.conn.All2All(), delay_step=delay_step, stp=stp)
-      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
-
-      # run the simulation
-      runner = bp.DSRunner(net,
-                           monitors=['pre.V', 'syn.g', 'post.V'],
-                           inputs=('pre.input', 35.))
-      runner(10.)
-
-      expected_shape = (100, 5)
-      if isinstance(mode, bm.BatchingMode):
-        expected_shape = (mode.batch_size,) + expected_shape
-
-      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
-      bm.clear_buffer_memory()
-
-  @parameterized.product(
-    synapse=biological_models,
-    delay_step=[None, 10, 1],
-    mode=[bm.NonBatchingMode(), bm.BatchingMode(5), ],
-    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
-  )
-  def test_one2one_synapse(self, synapse, delay_step, mode, stp):
-    bm.random.seed()
-    with bm.environment(mode=mode):
-      pre_neu = bp.neurons.LIF(5)
-      post_neu = bp.neurons.LIF(5)
-      syn = synapse(pre_neu, post_neu, conn=bp.conn.One2One(), delay_step=delay_step, stp=stp)
-      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
-
-      # run the simulation
-      runner = bp.DSRunner(net,
-                           monitors=['pre.V', 'syn.g', 'post.V'],
-                           inputs=('pre.input', 35.))
-      runner(10.)
-
-      expected_shape = (100, 5)
-      if isinstance(mode, bm.BatchingMode):
-        expected_shape = (mode.batch_size,) + expected_shape
-      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
-      bm.clear_buffer_memory()
-
-  @parameterized.product(
-    synapse=biological_models,
-    comp_method=['sparse', 'dense'],
-    delay_step=[None, 10, 1],
-    mode=[bm.NonBatchingMode(), bm.BatchingMode(5)],
-    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
-  )
-  def test_sparse_synapse(self, synapse, comp_method, delay_step, mode, stp):
-    bm.random.seed()
-    with bm.environment(mode=mode):
-      pre_neu = bp.neurons.LIF(10)
-      post_neu = bp.neurons.LIF(10)
-      syn = synapse(pre_neu, post_neu, conn=bp.conn.FixedProb(0.5),
-                    comp_method=comp_method, delay_step=delay_step,
-                    stp=stp)
-      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
-
-      # run the simulation
-      runner = bp.DSRunner(net,
-                           monitors=['pre.V', 'syn.g', 'post.V'],
-                           inputs=('pre.input', 35.))
-      runner(10.)
-
-      expected_shape = (100, 10)
-      if isinstance(mode, bm.BatchingMode):
-        expected_shape = (mode.batch_size,) + expected_shape
-      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
-      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
-      bm.clear_buffer_memory()
+# -*- coding: utf-8 -*-
+
+import pytest
+from absl.testing import parameterized
+
+import brainpy as bp
+import brainpy.math as bm
+
+from brainpy._src.dependency_check import import_taichi
+
+if import_taichi(error_if_not_found=False) is None:
+  pytest.skip('no taichi', allow_module_level=True)
+
+biological_models = [
+  bp.synapses.AMPA,
+  bp.synapses.GABAa,
+  bp.synapses.BioNMDA,
+]
+
+
+class Test_Biological_Synapse(parameterized.TestCase):
+  @parameterized.product(
+    synapse=biological_models,
+    delay_step=[None, 5, 1],
+    mode=[bm.NonBatchingMode(), bm.BatchingMode(5)],
+    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
+  )
+  def test_all2all_synapse(self, synapse, delay_step, mode, stp):
+    bm.random.seed()
+    with bm.environment(mode=mode):
+      pre_neu = bp.neurons.LIF(5)
+      post_neu = bp.neurons.LIF(5)
+      syn = synapse(pre_neu, post_neu, conn=bp.conn.All2All(), delay_step=delay_step, stp=stp)
+      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
+
+      # run the simulation
+      runner = bp.DSRunner(net,
+                           monitors=['pre.V', 'syn.g', 'post.V'],
+                           inputs=('pre.input', 35.))
+      runner(10.)
+
+      expected_shape = (100, 5)
+      if isinstance(mode, bm.BatchingMode):
+        expected_shape = (mode.batch_size,) + expected_shape
+
+      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
+      bm.clear_buffer_memory()
+
+  @parameterized.product(
+    synapse=biological_models,
+    delay_step=[None, 10, 1],
+    mode=[bm.NonBatchingMode(), bm.BatchingMode(5), ],
+    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
+  )
+  def test_one2one_synapse(self, synapse, delay_step, mode, stp):
+    bm.random.seed()
+    with bm.environment(mode=mode):
+      pre_neu = bp.neurons.LIF(5)
+      post_neu = bp.neurons.LIF(5)
+      syn = synapse(pre_neu, post_neu, conn=bp.conn.One2One(), delay_step=delay_step, stp=stp)
+      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
+
+      # run the simulation
+      runner = bp.DSRunner(net,
+                           monitors=['pre.V', 'syn.g', 'post.V'],
+                           inputs=('pre.input', 35.))
+      runner(10.)
+
+      expected_shape = (100, 5)
+      if isinstance(mode, bm.BatchingMode):
+        expected_shape = (mode.batch_size,) + expected_shape
+      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
+      bm.clear_buffer_memory()
+
+  @parameterized.product(
+    synapse=biological_models,
+    comp_method=['sparse', 'dense'],
+    delay_step=[None, 10, 1],
+    mode=[bm.NonBatchingMode(), bm.BatchingMode(5)],
+    stp=[None, bp.synplast.STP(), bp.synplast.STD()]
+  )
+  def test_sparse_synapse(self, synapse, comp_method, delay_step, mode, stp):
+    bm.random.seed()
+    with bm.environment(mode=mode):
+      pre_neu = bp.neurons.LIF(10)
+      post_neu = bp.neurons.LIF(10)
+      syn = synapse(pre_neu, post_neu, conn=bp.conn.FixedProb(0.5),
+                    comp_method=comp_method, delay_step=delay_step,
+                    stp=stp)
+      net = bp.Network(pre=pre_neu, syn=syn, post=post_neu)
+
+      # run the simulation
+      runner = bp.DSRunner(net,
+                           monitors=['pre.V', 'syn.g', 'post.V'],
+                           inputs=('pre.input', 35.))
+      runner(10.)
+
+      expected_shape = (100, 10)
+      if isinstance(mode, bm.BatchingMode):
+        expected_shape = (mode.batch_size,) + expected_shape
+      self.assertTupleEqual(runner.mon['pre.V'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['syn.g'].shape, expected_shape)
+      self.assertTupleEqual(runner.mon['post.V'].shape, expected_shape)
+      bm.clear_buffer_memory()
diff --git a/brainpy/_src/math/defaults.py b/brainpy/_src/math/defaults.py
index 20cae197e..9f3c50454 100644
--- a/brainpy/_src/math/defaults.py
+++ b/brainpy/_src/math/defaults.py
@@ -7,7 +7,7 @@
 
 __all__ = ['mode', 'membrane_scaling', 'dt', 'bool_', 'int_', 'ti_int', 'float_', 'ti_float', 'complex_']
 
-ti = import_taichi()
+ti = import_taichi(error_if_not_found=False)
 
 # Default computation mode.
mode = NonBatchingMode() @@ -24,18 +24,23 @@ # '''Default integer data type.''' int_ = jnp.int64 if config.read('jax_enable_x64') else jnp.int32 -# '''Default integer data type in Taichi.''' -ti_int = ti.int64 if config.read('jax_enable_x64') else ti.int32 - # '''Default float data type.''' float_ = jnp.float64 if config.read('jax_enable_x64') else jnp.float32 -# '''Default float data type in Taichi.''' -ti_float = ti.float64 if config.read('jax_enable_x64') else ti.float32 - # '''Default complex data type.''' complex_ = jnp.complex128 if config.read('jax_enable_x64') else jnp.complex64 # register brainpy object as pytree bp_object_as_pytree = False +if ti is not None: + # '''Default integer data type in Taichi.''' + ti_int = ti.int64 if config.read('jax_enable_x64') else ti.int32 + + # '''Default float data type in Taichi.''' + ti_float = ti.float64 if config.read('jax_enable_x64') else ti.float32 + +else: + ti_int = None + ti_float = None + diff --git a/brainpy/_src/math/environment.py b/brainpy/_src/math/environment.py index e91eca0b8..d49e70f51 100644 --- a/brainpy/_src/math/environment.py +++ b/brainpy/_src/math/environment.py @@ -2,6 +2,7 @@ import functools +import gc import inspect import os import re @@ -16,9 +17,10 @@ from . import modes from . import scales from . import defaults +from .object_transform import naming from brainpy._src.dependency_check import import_taichi -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ # context manage for environment setting @@ -435,13 +437,16 @@ def set_float(dtype: type): """ if dtype in [jnp.float16, 'float16', 'f16']: defaults.__dict__['float_'] = jnp.float16 - defaults.__dict__['ti_float'] = ti.float16 + if ti is not None: + defaults.__dict__['ti_float'] = ti.float16 elif dtype in [jnp.float32, 'float32', 'f32']: defaults.__dict__['float_'] = jnp.float32 - defaults.__dict__['ti_float'] = ti.float32 + if ti is not None: + defaults.__dict__['ti_float'] = ti.float32 elif dtype in [jnp.float64, 'float64', 'f64']: defaults.__dict__['float_'] = jnp.float64 - defaults.__dict__['ti_float'] = ti.float64 + if ti is not None: + defaults.__dict__['ti_float'] = ti.float64 else: raise NotImplementedError @@ -467,16 +472,20 @@ def set_int(dtype: type): """ if dtype in [jnp.int8, 'int8', 'i8']: defaults.__dict__['int_'] = jnp.int8 - defaults.__dict__['ti_int'] = ti.int8 + if ti is not None: + defaults.__dict__['ti_int'] = ti.int8 elif dtype in [jnp.int16, 'int16', 'i16']: defaults.__dict__['int_'] = jnp.int16 - defaults.__dict__['ti_int'] = ti.int16 + if ti is not None: + defaults.__dict__['ti_int'] = ti.int16 elif dtype in [jnp.int32, 'int32', 'i32']: defaults.__dict__['int_'] = jnp.int32 - defaults.__dict__['ti_int'] = ti.int32 + if ti is not None: + defaults.__dict__['ti_int'] = ti.int32 elif dtype in [jnp.int64, 'int64', 'i64']: defaults.__dict__['int_'] = jnp.int64 - defaults.__dict__['ti_int'] = ti.int64 + if ti is not None: + defaults.__dict__['ti_int'] = ti.int64 else: raise NotImplementedError @@ -693,7 +702,9 @@ def set_host_device_count(n): def clear_buffer_memory( platform: str = None, array: bool = True, - compilation: bool = False + transform: bool = True, + compilation: bool = False, + object_name: bool = False, ): """Clear all on-device buffers. @@ -710,9 +721,13 @@ def clear_buffer_memory( platform: str The device to clear its memory. array: bool - Clear all buffer array. + Clear all buffer array. Default is True. compilation: bool - Clear compilation cache. + Clear compilation cache. Default is False. 
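With `import_taichi(error_if_not_found=False)`, every taichi-dependent default above degrades gracefully instead of failing at import time: `ti_int`/`ti_float` become None and `set_float`/`set_int` simply skip the taichi branch. A small runnable check (assumes brainpy is importable; taichi is optional):

    import brainpy.math as bm
    from brainpy._src.math import defaults

    bm.set_float(bm.float_)   # round-trip; must not raise even without taichi
    print(defaults.ti_float)  # a taichi dtype when taichi is installed, else None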
+    transform: bool
+      Clear transform cache. Default is True.
+    object_name: bool
+      Clear name cache. Default is False.
   """
 
   if array:
@@ -720,6 +735,11 @@
       buf.delete()
   if compilation:
     jax.clear_caches()
+  if transform:
+    naming.clear_stack_cache()
+  if object_name:
+    naming.clear_name_cache()
+  gc.collect()
 
 
 def disable_gpu_memory_preallocation(release_memory: bool = True):
diff --git a/brainpy/_src/math/event/__init__.py b/brainpy/_src/math/event/__init__.py
index 631129558..bdd3102a3 100644
--- a/brainpy/_src/math/event/__init__.py
+++ b/brainpy/_src/math/event/__init__.py
@@ -1,4 +1,2 @@
-
-from ._info_collection import *
 from ._csr_matvec import *
diff --git a/brainpy/_src/math/event/_csr_matvec.py b/brainpy/_src/math/event/_csr_matvec.py
index 6e03be463..6b7f7da02 100644
--- a/brainpy/_src/math/event/_csr_matvec.py
+++ b/brainpy/_src/math/event/_csr_matvec.py
@@ -10,34 +10,25 @@
 
 """
 
-from functools import partial
 from typing import Union, Tuple
 
 import jax
 import jax.numpy as jnp
-import numba
 import numpy as np
-from jax.core import ShapedArray, Primitive
-from jax.interpreters import ad, xla
-from jax.lib import xla_client
+from jax.interpreters import ad
 
-from brainpy._src.dependency_check import (import_brainpylib_gpu_ops)
 from brainpy._src.dependency_check import import_taichi
 from brainpy._src.math.interoperability import as_jax
-from brainpy._src.math.op_register import (compile_cpu_signature_with_numba,
-                                           register_general_batching,
-                                           XLACustomOp)
+from brainpy._src.math.op_register import XLACustomOp
-from brainpy._src.math.sparse._csr_mv import csrmv_brainpylib as normal_csrmv
 from brainpy._src.math.sparse._csr_mv import raw_csrmv_taichi as normal_csrmv_taichi
 from brainpy._src.math.sparse._utils import csr_to_coo
-from brainpy.errors import GPUOperatorNotFound
+from brainpy.errors import PackageMissingError
 
 __all__ = [
   'csrmv'
 ]
 
-ti = import_taichi()
-
+ti = import_taichi(error_if_not_found=False)
 
 def csrmv(
     data: Union[float, jax.Array],
     indices: jax.Array,
     indptr: jax.Array,
     events: jax.Array,
     *,
     shape: Tuple[int, int],
     transpose: bool = False
@@ -53,577 +44,6 @@
 ) -> jax.Array:
   """Product of a sparse CSR matrix and a dense event vector.
 
   This function supports JAX transformations, including `jit()`, `grad()`,
   `vmap()` and `pmap()`.
 
-  Parameters
-  ----------
-  data: ndarray, float
-    An array of shape ``(nse,)``.
-  indices: ndarray
-    An array of shape ``(nse,)``.
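Stepping back to the `clear_buffer_memory` change above: besides device buffers and the JAX compilation cache, it can now also drop BrainPy's transform and object-name caches, and it always finishes with a `gc.collect()` pass. A usage sketch with the flags spelled out (values mirror the new defaults):

    import brainpy.math as bm

    bm.clear_buffer_memory(
      array=True,         # delete live on-device buffers (default True)
      transform=True,     # drop the transform/stack cache (default True)
      compilation=False,  # keep JAX's compilation cache (default False)
      object_name=False,  # keep the object-name cache (default False)
    )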
- indptr: ndarray - An array of shape ``(shape[0] + 1,)`` and dtype ``indices.dtype``. - events: ndarray - An array of shape ``(shape[0] if transpose else shape[1],)`` - and dtype ``data.dtype``. - shape: tuple - A length-2 tuple representing the matrix shape. - transpose: bool - A boolean specifying whether to transpose the sparse matrix - before computing. - If ``transpose=True``, the operator will compute based on the - event-driven property of the ``events`` vector. - - Returns - ------- - y : Array - The array of shape ``(shape[1] if transpose else shape[0],)`` representing - the matrix vector product. - """ - data = as_jax(data) - indices = as_jax(indices) - indptr = as_jax(indptr) - events = as_jax(events) - # checking - data = jnp.atleast_1d(data) - if np.ndim(data) == 1: - if data.shape[0] not in [1, indices.shape[0]]: - raise ValueError('The size of data should be 1 or be consistent with indices.' - f'But we got {data.shape} != {indices.shape}, {data.shape} != 1.') - else: - raise ValueError('data should be a scalar or 1D vector. ' - f'But we got {np.ndim(data)}-D array.') - if np.ndim(indices) != 1: - raise ValueError('indices should be a 1D vector with integer type.') - if np.ndim(indptr) != 1: - raise ValueError('indptr should be a 1D vector with integer type.') - if indices.dtype not in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64]: - raise ValueError('indices should be a 1D vector with int32 or int64 type.') - if indptr.dtype not in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64]: - raise ValueError('indptr should be a 1D vector with int32 or int64 type.') - if np.ndim(events) != 1: - raise ValueError('events should be a 1D vector.') - if len(shape) != 2: - raise ValueError('shape should be a length-2 tuple.') - if transpose: - if events.shape[0] != shape[0]: - raise ValueError(f'Shape mismatch, vec ({events.shape[0]},) @ mat {shape}.') - else: - if events.shape[0] != shape[1]: - raise ValueError(f'Shape mismatch, mat {shape} @ vec ({events.shape[0]},).') - - # computing - return event_csr_matvec_p.bind(data, indices, indptr, events, shape=shape, transpose=transpose) - - -# ---------------------------------------------------------- -# event csr matvec -# ---------------------------------------------------------- - -# operator for `event_csr_matvec` batching rule -# -------- - -def _batch_event_csr_matvec_abstract( - values, indices, indptr, events, *, batch_size, shape, transpose=False -): - return ShapedArray(dtype=values.dtype, shape=(batch_size, shape[1] if transpose else shape[0])) - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _batch_event_csr_matvec_transpose_numba_imp(outs, ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, batch_size, shape, _ = ins - batch_size = batch_size[()] - event_batch_dim = events.shape[0] - indices_batch_dim = indices.shape[0] - indptr_batch_dim = indptr.shape[0] - values_batch_dim = values.shape[0] - - if values.shape[1] == 1: # homogeneous value - for bi in numba.prange(batch_size): - event_bi = bi % event_batch_dim - indptr_bi = bi % indptr_batch_dim - indices_bi = bi % indices_batch_dim - values_bi = bi % values_batch_dim - for row_i in range(shape[0]): - if events[event_bi, row_i]: - value = values[values_bi, 0] - for j in range(indptr[indptr_bi, row_i], indptr[indptr_bi, row_i + 1]): - col_i = indices[indices_bi, j] - res_val[bi, col_i] += value - - else: # heterogeneous values - for bi in numba.prange(batch_size): - event_bi = bi % event_batch_dim - indptr_bi = bi % indptr_batch_dim - 
indices_bi = bi % indices_batch_dim - value_bi = bi % values_batch_dim - for row_i in range(shape[0]): - if events[event_bi, row_i]: - for j in range(indptr[indptr_bi, row_i], indptr[indptr_bi, row_i + 1]): - col_i = indices[indices_bi, j] - res_val[bi, col_i] += values[value_bi, j] - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _batch_event_csr_matvec_numba_imp(outs, ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, batch_size, shape, transpose = ins - batch_size = batch_size[()] - event_batch_dim = events.shape[0] - indices_batch_dim = indices.shape[0] - indptr_batch_dim = indptr.shape[0] - values_batch_dim = values.shape[0] - - if values.shape[1] == 1: # homogeneous value - for bi in numba.prange(batch_size): - event_bi = bi % event_batch_dim - indptr_bi = bi % indptr_batch_dim - indices_bi = bi % indices_batch_dim - value_bi = bi % values_batch_dim - value = values[value_bi, 0] - for row_i in numba.prange(shape[0]): - r = 0. - for j in range(indptr[indptr_bi, row_i], indptr[indptr_bi, row_i + 1]): - col_i = indices[indices_bi, j] - if events[event_bi, col_i]: - r += value - res_val[bi, row_i] = r - - else: # heterogeneous values - for bi in numba.prange(batch_size): - event_bi = bi % event_batch_dim - indptr_bi = bi % indptr_batch_dim - indices_bi = bi % indices_batch_dim - value_bi = bi % values_batch_dim - for row_i in numba.prange(shape[0]): - r = 0. - for j in range(indptr[indptr_bi, row_i], indptr[indptr_bi, row_i + 1]): - col_i = indices[indices_bi, j] - if events[event_bi, col_i]: - r += values[value_bi, j] - res_val[bi, row_i] = r - - -def _batch_event_csr_matvec_cpu_translation(c, values, indices, indptr, events, *, - batch_size, shape, transpose): - inputs = (values, indices, indptr, events) - description = dict(batch_size=batch_size, shape=shape, transpose=transpose) - if transpose: - name, inputs, in_layouts, out_layouts = compile_cpu_signature_with_numba( - c, - _batch_event_csr_matvec_transpose_numba_imp, - _batch_event_csr_matvec_abstract, - False, - inputs=inputs, - description=description - ) - else: - name, inputs, in_layouts, out_layouts = compile_cpu_signature_with_numba( - c, - _batch_event_csr_matvec_numba_imp, - _batch_event_csr_matvec_abstract, - False, - inputs=inputs, - description=description - ) - return xla_client.ops.CustomCallWithLayout( - c, - name, - operands=inputs, - operand_shapes_with_layout=in_layouts, - shape_with_layout=out_layouts, - ) - - -def _batch_event_csr_matvec_gpu_translation(c, values, indices, indptr, events, *, - batch_size, shape, transpose): - pass - - -def _batch_event_csr_matvec_jvp_values(values_dot, values, indices, indptr, events, *, - batch_size, shape, transpose): - return event_csr_matvec_batching_p.bind(values_dot, indices, indptr, events, - batch_size=batch_size, shape=shape, transpose=transpose) - - -def _batch_csr_matvec(values, indices, indptr, vectors, *, shape, transpose): - f = jax.vmap(partial(normal_csrmv, shape=shape, transpose=transpose), - in_axes=(0 if values.shape[0] > 1 else None, - 0 if indices.shape[0] > 1 else None, - 0 if indptr.shape[0] > 1 else None, - 0 if vectors.shape[0] > 1 else None)) - return f(values if values.shape[0] > 1 else values[0], - indices if indices.shape[0] > 1 else indices[0], - indptr if indptr.shape[0] > 1 else indptr[0], - vectors if vectors.shape[0] > 1 else vectors[0]) - - -def _batch_event_csr_matvec_jvp_events(events_dot, values, indices, indptr, events, *, - batch_size, shape, transpose): - return _batch_csr_matvec(values, indices, 
indptr, events_dot, - shape=shape, transpose=transpose) - - -def _batch_event_csr_matvec_transpose(ct, values, indices, indptr, events, *, - batch_size, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - - if ad.is_undefined_primal(events): - ct_events = ( - ad.Zero(events.aval) if type(ct) is ad.Zero else - _batch_csr_matvec(ct, indices, indptr, values, - shape=shape, transpose=not transpose) - ) - return values, indices, indptr, ct_events - else: - if values.aval.shape[1] == 1: # scalar - temp = event_csr_matvec_batching_p.bind(jnp.ones((1, 1)), indices, indptr, events, - batch_size=batch_size, shape=shape, - transpose=transpose) - ct_values = jax.vmap(jnp.inner)(ct, temp) - else: # heterogeneous values - if type(ct) is ad.Zero: - ct_values = ad.Zero(values.aval) - else: - - def _f(ct, indices, indptr, events, *, transpose): - row, col = csr_to_coo(indices, indptr) - ct_values = events[row] * ct[col] if transpose else events[col] * ct[row] - return ct_values - - f = jax.vmap(partial(_f, transpose=transpose), - in_axes=(0, - 0 if indices.shape[0] > 1 else None, - 0 if indptr.shape[0] > 1 else None, - 0 if events.shape[0] > 1 else None)) - ct_values = f(ct, - indices if indices.shape[0] > 1 else indices[0], - indptr if indptr.shape[0] > 1 else indptr[0], - events if events.shape[0] > 1 else events[0]) - return ct_values, indices, indptr, events - - -event_csr_matvec_batching_p = Primitive('event_csr_matvec_batching') -event_csr_matvec_batching_p.def_abstract_eval(_batch_event_csr_matvec_abstract) -event_csr_matvec_batching_p.def_impl(partial(xla.apply_primitive, event_csr_matvec_batching_p)) -# xla.backend_specific_translations['cpu'][event_csr_matvec_batching_p] = _batch_event_csr_matvec_cpu_translation -ad.defjvp(event_csr_matvec_batching_p, _batch_event_csr_matvec_jvp_values, - None, None, _batch_event_csr_matvec_jvp_events) -ad.primitive_transposes[event_csr_matvec_batching_p] = _batch_event_csr_matvec_transpose - - -# operator for `event_csr_matvec` # -# ------------------------------- # - - -def _event_csr_matvec_abstract(values, indices, indptr, events, *, shape, transpose=False): - return ShapedArray(dtype=values.dtype, shape=(shape[1] if transpose else shape[0],)) - - -@numba.njit(fastmath=True) -def _event_csr_matvec_transpose_numba_imp1_bool(outs, ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, shape, _ = ins - if values.shape[0] > 1: # heter - for row_i, event in enumerate(events): - if event: - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - res_val[col_i] += values[j] - - else: # homo - values = values[0] - for row_i, event in enumerate(events): - if event: - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - res_val[col_i] += values - - -@numba.njit(fastmath=True) -def _event_csr_matvec_transpose_numba_imp2(outs, ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, shape, _ = ins - if values.shape[0] > 1: # heter - for row_i, event in enumerate(events): - if event > 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - res_val[col_i] += values[j] - - else: # homo - values = values[0] - for row_i, event in enumerate(events): - if event > 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - res_val[col_i] += values - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _event_csr_matvec_numba_imp1_bool(outs, 
ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, shape, _ = ins - - if values.shape[0] > 1: # heter - for row_i in range(shape[0]): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - if events[col_i]: - r += values[j] - res_val[row_i] = r - - else: # homo - values = values[0] - for row_i in numba.prange(shape[0]): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - if events[col_i]: - r += values - res_val[row_i] = r - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _event_csr_matvec_numba_imp2(outs, ins): - res_val = outs - res_val.fill(0) - values, indices, indptr, events, shape, _ = ins - - if values.shape[0] > 1: # heter - for row_i in range(shape[0]): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - if events[col_i] > 0.: - r += values[j] - res_val[row_i] = r - - else: # homo - values = values[0] - for row_i in numba.prange(shape[0]): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - col_i = indices[j] - if events[col_i] > 0.: - r += values - res_val[row_i] = r - - -def _event_csr_matvec_cpu_translation(c, values, indices, indptr, events, *, shape, transpose): - inputs = (values, indices, indptr, events) - event_type = c.get_shape(events) - description = dict(shape=shape, transpose=transpose) - if transpose: - if event_type.element_type() == jnp.bool_: - imp = _event_csr_matvec_transpose_numba_imp1_bool - else: - imp = _event_csr_matvec_transpose_numba_imp2 - name, inputs, in_layouts, out_layouts = compile_cpu_signature_with_numba( - c, - imp, - abs_eval_fn=_event_csr_matvec_abstract, - multiple_results=False, - inputs=inputs, - description=description - ) - else: - if event_type.element_type() == jnp.bool_: - imp = _event_csr_matvec_numba_imp1_bool - else: - imp = _event_csr_matvec_numba_imp2 - name, inputs, in_layouts, out_layouts = compile_cpu_signature_with_numba( - c, - imp, - abs_eval_fn=_event_csr_matvec_abstract, - multiple_results=False, - inputs=inputs, - description=description - ) - return xla_client.ops.CustomCallWithLayout( - c, name, - operands=inputs, - operand_shapes_with_layout=in_layouts, - shape_with_layout=out_layouts, - ) - - -def _event_csr_matvec_gpu_translation(c, data, indices, indptr, vector, *, shape, transpose): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(event_csr_matvec_p.name) - - # shape checking - data_shape = c.get_shape(data) - indices_shape = c.get_shape(indices) - indptr_shape = c.get_shape(indptr) - vec_shape = c.get_shape(vector) - if data_shape.element_type() == jnp.float32: - ftype = b'_float' - elif data_shape.element_type() == jnp.float64: - ftype = b'_double' - else: - raise ValueError - assert indices_shape.element_type() == indptr_shape.element_type() - if indices_shape.element_type() == jnp.int32: - itype = b'_int' - elif indices_shape.element_type() == jnp.int64: - itype = b'_long' - else: - raise ValueError - data_name = b'_homo' if data_shape.dimensions() == (1,) else b'_heter' - tran_type = b'_transpose' if transpose else b'' - if vec_shape.element_type() == jnp.bool_: - vec_type = b'_bool' - else: - assert vec_shape.element_type() == data_shape.element_type() - vec_type = b'' - - # opaque - opaque = gpu_ops.build_double_size_descriptor(shape[0], shape[1]) - - # call - return xla_client.ops.CustomCallWithLayout( - c, - b'event_csrmv' + data_name + ftype + itype + vec_type + tran_type, - operands=(data, indices, indptr, vector), - 
operand_shapes_with_layout=(c.get_shape(data), - c.get_shape(indices), - c.get_shape(indptr), - c.get_shape(vector)), - shape_with_layout=xla_client.Shape.array_shape(data_shape.element_type(), - (shape[1] if transpose else shape[0],), - (0,)), - opaque=opaque, - ) - - -def _event_csr_matvec_batching_rule(args, axes, *, shape, transpose): - batch_size = 0 - args_processed = [] - for arg, axis in zip(args, axes): - if axis is None: - arg = jnp.expand_dims(jnp.atleast_1d(arg), 0) - else: - batch_size = arg.shape[axis] - if axis > 0: - arg = jnp.moveaxis(arg, axis, 0) - args_processed.append(arg) - - r = event_csr_matvec_batching_p.bind(*args_processed, - batch_size=batch_size, - shape=shape, - transpose=transpose) - return r, 0 - - -def _event_csr_matvec_jvp_values_brainpylib(values_dot, values, indices, indptr, events, *, shape, transpose): - return normal_csrmv(values_dot, indices, indptr, events, shape=shape, transpose=transpose) - - -def _event_csr_matvec_jvp_events_brainpylib(events_dot, values, indices, indptr, events, *, shape, transpose): - return normal_csrmv(values, indices, indptr, events_dot, shape=shape, transpose=transpose) - - -def _event_csr_matvec_transpose_brainpylib(ct, values, indices, indptr, events, *, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - if ad.is_undefined_primal(events): - ct_events = normal_csrmv(values, indices, indptr, ct, shape=shape, transpose=not transpose) - return values, indices, indptr, (ad.Zero(events) if type(ct) is ad.Zero else ct_events) - else: - if type(ct) is ad.Zero: - ct_values = ad.Zero(values) - else: - if values.aval.shape[0] == 1: # scalar - ct_values = csrmv_brainpylib(jnp.ones(1), indices, indptr, events, shape=shape, transpose=transpose) - ct_values = jnp.inner(ct, ct_values) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_values = events[row] * ct[col] if transpose else events[col] * ct[row] - return ct_values, indices, indptr, events - - -event_csr_matvec_p = Primitive('event_csr_matvec') -event_csr_matvec_p.def_abstract_eval(_event_csr_matvec_abstract) -event_csr_matvec_p.def_impl(partial(xla.apply_primitive, event_csr_matvec_p)) -# xla.backend_specific_translations['cpu'][event_csr_matvec_p] = _event_csr_matvec_cpu_translation -# xla.backend_specific_translations['gpu'][event_csr_matvec_p] = _event_csr_matvec_gpu_translation -ad.defjvp(event_csr_matvec_p, _event_csr_matvec_jvp_values_brainpylib, None, None, - _event_csr_matvec_jvp_events_brainpylib) -ad.primitive_transposes[event_csr_matvec_p] = _event_csr_matvec_transpose_brainpylib -register_general_batching(event_csr_matvec_p) - - -# batching.primitive_batchers[event_csr_matvec_p] = _event_csr_matvec_batching_rule - - -### TAICHI ### - -def csrmv_taichi( - data: Union[float, jax.Array], - indices: jax.Array, - indptr: jax.Array, - events: jax.Array, - *, - shape: Tuple[int, int], - transpose: bool = False -) -> jax.Array: - """Product of a sparse CSR matrix and a dense event vector. - - This function supports JAX transformations, including `jit()`, `grad()`, - `vmap()` and `pmap()`. - Parameters ---------- data: ndarray, float @@ -691,298 +111,6 @@ def csrmv_taichi( return raw_csrmv_taichi(data, indices, indptr, events, shape=shape, transpose=transpose)[0] -# ------------- -# CPU operators -# ------------- - -# 1. 
The benchmarking shows that the performance of the following transpose -# kernels is maximized when using serialized mode -# 2. Since our Taichi-JAX kernel does not support the non-differentiable/non-jittable -# arguments, we have to define each kernel separately when the -# non-differentiable/non-jittable arguments are different. - - -@ti.kernel -def _event_csr_matvec_transpose_bool_homo_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - if events[row_i]: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += value - - -@ti.kernel -def _event_csr_matvec_transpose_bool_heter_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - if events[row_i]: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += values[j] - - -@ti.kernel -def _event_csr_matvec_transpose_homo_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - if events[row_i] != 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += value - - -@ti.kernel -def _event_csr_matvec_transpose_heter_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - if events[row_i] != 0.: - for j in range(indptr[row_i], indptr[row_i + 1]): - out[indices[j]] += values[j] - - -@ti.kernel -def _event_csr_matvec_bool_homo_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - # ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]]: - r += value - out[row_i] = r - - -@ti.kernel -def _event_csr_matvec_bool_heter_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - # ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]]: - r += values[j] - out[row_i] = r - - -@ti.kernel -def _event_csr_matvec_homo_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - # ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - r = 0. 
- for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]] != 0.: - r += value - out[row_i] = r - - -@ti.kernel -def _event_csr_matvec_heter_cpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - # ti.loop_config(serialize=True) - for row_i in range(indptr.shape[0] - 1): - r = 0. - for j in range(indptr[row_i], indptr[row_i + 1]): - if events[indices[j]] != 0.: - r += values[j] - out[row_i] = r - - -# ------------- -# GPU operators -# ------------- - -# 1. GPU kernels are different from the CPU ones, since the GPU kernels need -# to use warp-level parallelism to achieve the best performance. - - -@ti.kernel -def _event_csr_matvec_transpose_bool_homo_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - if events[row_i]: - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - out[indices[j]] += value - j += 32 - - -@ti.kernel -def _event_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - if events[row_i] != 0.: - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - out[indices[j]] += value - j += 32 - - -# TODO -# It is important to note that the following warp-based kernels -# should be improved, since the atomic_add for each thread is not -# very efficient. Instead, the warp-level reduction primitive -# should be used. -# see ``warp_reduce_sum()`` function in tifunc.py. -# However, currently Taichi does not support general warp-level primitives. - - -@ti.kernel -def _event_csr_matvec_bool_homo_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - if events[indices[j]]: - r += value - j += 32 - out[row_i] += r # TODO: warp-level primitive - - -@ti.kernel -def _event_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. 
- j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - if events[indices[j]] != 0.: - r += value - j += 32 - out[row_i] += r # TODO: warp-level primitive - - -@ti.kernel -def _event_csr_matvec_transpose_bool_heter_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - if events[row_i]: - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - out[indices[j]] += values[j] - j += 32 - - -@ti.kernel -def _event_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - if events[row_i] != 0.: - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - out[indices[j]] += values[j] - j += 32 - - -@ti.kernel -def _event_csr_matvec_bool_heter_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - if events[indices[j]]: - r += values[j] - j += 32 - out[row_i] += r # TODO: warp-level primitive - - -@ti.kernel -def _event_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1), - indices: ti.types.ndarray(ndim=1), - indptr: ti.types.ndarray(ndim=1), - events: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((indptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. - j = indptr[row_i] + index - end_index = indptr[row_i + 1] - while j < end_index: - if events[indices[j]] != 0.: - r += values[j] - j += 32 - out[row_i] += r # TODO: warp-level primitive - - def raw_csrmv_taichi( data: Union[float, jax.Array], indices: jax.Array, @@ -992,6 +120,9 @@ def raw_csrmv_taichi( shape: Tuple[int, int], transpose: bool = False ): + if ti is None: + raise PackageMissingError.by_purpose(name='taichi==1.7.0', purpose='customized operators') + if transpose: if events.dtype == jnp.bool_: if data.shape[0] == 1: @@ -1025,65 +156,361 @@ def raw_csrmv_taichi( shape=shape) -def _event_csr_matvec_jvp_values_taichi(val_dot, values, indices, indptr, events, *, outs, transpose, shape): - return normal_csrmv_taichi(val_dot, indices, indptr, events, shape=shape, transpose=transpose) +if ti is not None: + + # ------------- + # CPU operators + # ------------- + + # 1. The benchmarking shows that the performance of the following transpose + # kernels is maximized when using serialized mode + # 2. Since our Taichi-JAX kernel does not support the non-differentiable/non-jittable + # arguments, we have to define each kernel separately when the + # non-differentiable/non-jittable arguments are different. 
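Two implementation notes in the restored taichi block deserve a concrete illustration. On CPU, the transpose kernels scatter into `out[indices[j]]`, so `ti.loop_config(serialize=True)` pins the outer loop to a single thread; the benchmarking note above reports this as fastest, plausibly because it sidesteps atomic adds on the shared output. (On GPU, the `i >> 5` / `i & 31` arithmetic in the kernels further below instead gives each row 32 lanes that stride through its nonzeros.) A minimal standalone sketch of the serialized scatter on toy data (assumes taichi is installed; this is not BrainPy's registered operator):

    import numpy as np
    import taichi as ti

    ti.init(arch=ti.cpu)

    @ti.kernel
    def scatter_add(indices: ti.types.ndarray(ndim=1),
                    out: ti.types.ndarray(ndim=1)):
      ti.loop_config(serialize=True)  # run the next for-loop on one thread
      for i in range(indices.shape[0]):
        out[indices[i]] += 1.0

    idx = np.array([0, 2, 2, 1], dtype=np.int32)
    res = np.zeros(3, dtype=np.float32)
    scatter_add(idx, res)
    print(res)  # [1. 1. 2.]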
+ + @ti.kernel + def _event_csr_matvec_transpose_bool_homo_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + if events[row_i]: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += value + + + @ti.kernel + def _event_csr_matvec_transpose_bool_heter_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + if events[row_i]: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += values[j] + + + @ti.kernel + def _event_csr_matvec_transpose_homo_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + if events[row_i] != 0.: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += value + + + @ti.kernel + def _event_csr_matvec_transpose_heter_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + if events[row_i] != 0.: + for j in range(indptr[row_i], indptr[row_i + 1]): + out[indices[j]] += values[j] + + + @ti.kernel + def _event_csr_matvec_bool_homo_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + # ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]]: + r += value + out[row_i] = r + + + @ti.kernel + def _event_csr_matvec_bool_heter_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + # ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]]: + r += values[j] + out[row_i] = r + + + @ti.kernel + def _event_csr_matvec_homo_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + # ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]] != 0.: + r += value + out[row_i] = r + + + @ti.kernel + def _event_csr_matvec_heter_cpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + # ti.loop_config(serialize=True) + for row_i in range(indptr.shape[0] - 1): + r = 0. + for j in range(indptr[row_i], indptr[row_i + 1]): + if events[indices[j]] != 0.: + r += values[j] + out[row_i] = r + + + # ------------- + # GPU operators + # ------------- + + # 1. 
GPU kernels are different from the CPU ones, since the GPU kernels need + # to use warp-level parallelism to achieve the best performance. + + @ti.kernel + def _event_csr_matvec_transpose_bool_homo_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + if events[row_i]: + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + out[indices[j]] += value + j += 32 + + + @ti.kernel + def _event_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + if events[row_i] != 0.: + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + out[indices[j]] += value + j += 32 + + + # TODO + # It is important to note that the following warp-based kernels + # should be improved, since the atomic_add for each thread is not + # very efficient. Instead, the warp-level reduction primitive + # should be used. + # see ``warp_reduce_sum()`` function in tifunc.py. + # However, currently Taichi does not support general warp-level primitives. + + @ti.kernel + def _event_csr_matvec_bool_homo_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + if events[indices[j]]: + r += value + j += 32 + out[row_i] += r # TODO: warp-level primitive + + + @ti.kernel + def _event_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. 
+ j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + if events[indices[j]] != 0.: + r += value + j += 32 + out[row_i] += r # TODO: warp-level primitive + + + @ti.kernel + def _event_csr_matvec_transpose_bool_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + if events[row_i]: + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + out[indices[j]] += values[j] + j += 32 + + + @ti.kernel + def _event_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + if events[row_i] != 0.: + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + out[indices[j]] += values[j] + j += 32 + + + @ti.kernel + def _event_csr_matvec_bool_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. + j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + if events[indices[j]]: + r += values[j] + j += 32 + out[row_i] += r # TODO: warp-level primitive + + + @ti.kernel + def _event_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1), + indices: ti.types.ndarray(ndim=1), + indptr: ti.types.ndarray(ndim=1), + events: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((indptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. 
+ j = indptr[row_i] + index + end_index = indptr[row_i + 1] + while j < end_index: + if events[indices[j]] != 0.: + r += values[j] + j += 32 + out[row_i] += r # TODO: warp-level primitive -def _event_csr_matvec_jvp_events_taichi(evt_dot, values, indices, indptr, events, *, outs, transpose, shape): - return normal_csrmv_taichi(values, indices, indptr, evt_dot, shape=shape, transpose=transpose) + def _event_csr_matvec_jvp_values_taichi(val_dot, values, indices, indptr, events, *, outs, transpose, shape): + return normal_csrmv_taichi(val_dot, indices, indptr, events, shape=shape, transpose=transpose) -def _event_csr_matvec_transpose_taichi( - ct, values, indices, indptr, events, *, outs, transpose, shape -): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - if ad.is_undefined_primal(events): - ct_events = normal_csrmv_taichi(values, indices, indptr, ct[0], shape=shape, transpose=transpose)[0] - return values, indices, indptr, (ad.Zero(events) if type(ct[0]) is ad.Zero else ct_events) - else: - if type(ct[0]) is ad.Zero: - ct_values = ad.Zero(values) + def _event_csr_matvec_jvp_events_taichi(evt_dot, values, indices, indptr, events, *, outs, transpose, shape): + return normal_csrmv_taichi(values, indices, indptr, evt_dot, shape=shape, transpose=transpose) + + + def _event_csr_matvec_transpose_taichi( + ct, values, indices, indptr, events, *, outs, transpose, shape + ): + if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): + raise ValueError("Cannot transpose with respect to sparse indices.") + if ad.is_undefined_primal(events): + ct_events = normal_csrmv_taichi(values, indices, indptr, ct[0], shape=shape, transpose=transpose)[0] + return values, indices, indptr, (ad.Zero(events) if type(ct[0]) is ad.Zero else ct_events) else: - if values.aval.shape[0] == 1: # scalar - ct_values = raw_csrmv_taichi(jnp.ones(1), indices, indptr, events, shape=shape, transpose=transpose)[0] - ct_values = jnp.inner(ct[0], ct_values) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_values = events[row] * ct[0][col] if transpose else events[col] * ct[0][row] - return ct_values, indices, indptr, events + if type(ct[0]) is ad.Zero: + ct_values = ad.Zero(values) + else: + if values.aval.shape[0] == 1: # scalar + ct_values = raw_csrmv_taichi(jnp.ones(1), indices, indptr, events, shape=shape, transpose=transpose)[0] + ct_values = jnp.inner(ct[0], ct_values) + else: # heterogeneous values + row, col = csr_to_coo(indices, indptr) + ct_values = events[row] * ct[0][col] if transpose else events[col] * ct[0][row] + return ct_values, indices, indptr, events -def _define_op(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_event_csr_matvec_jvp_values_taichi, None, None, _event_csr_matvec_jvp_events_taichi) - prim.def_transpose_rule(_event_csr_matvec_transpose_taichi) - return prim + def _define_op(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_event_csr_matvec_jvp_values_taichi, None, None, _event_csr_matvec_jvp_events_taichi) + prim.def_transpose_rule(_event_csr_matvec_transpose_taichi) + return prim -# transpose bool homo -_event_csrmv_transpose_bool_homo_p = _define_op(_event_csr_matvec_transpose_bool_homo_cpu, - _event_csr_matvec_transpose_bool_homo_gpu) + # transpose bool homo + _event_csrmv_transpose_bool_homo_p = _define_op(_event_csr_matvec_transpose_bool_homo_cpu, + 
_event_csr_matvec_transpose_bool_homo_gpu) -# transpose homo -_event_csrmv_transpose_homo_p = _define_op(_event_csr_matvec_transpose_homo_cpu, _event_csr_matvec_transpose_homo_gpu) + # transpose homo + _event_csrmv_transpose_homo_p = _define_op(_event_csr_matvec_transpose_homo_cpu, + _event_csr_matvec_transpose_homo_gpu) -# not transpose bool homo -_event_csrmv_bool_homo_p = _define_op(_event_csr_matvec_bool_homo_cpu, _event_csr_matvec_bool_homo_gpu) + # not transpose bool homo + _event_csrmv_bool_homo_p = _define_op(_event_csr_matvec_bool_homo_cpu, + _event_csr_matvec_bool_homo_gpu) -# not transpose homo -_event_csrmv_homo_p = _define_op(_event_csr_matvec_homo_cpu, _event_csr_matvec_homo_gpu) + # not transpose homo + _event_csrmv_homo_p = _define_op(_event_csr_matvec_homo_cpu, + _event_csr_matvec_homo_gpu) -# transpose bool heter -_event_csrmv_transpose_bool_heter_p = _define_op(_event_csr_matvec_transpose_bool_heter_cpu, - _event_csr_matvec_transpose_bool_heter_gpu) + # transpose bool heter + _event_csrmv_transpose_bool_heter_p = _define_op(_event_csr_matvec_transpose_bool_heter_cpu, + _event_csr_matvec_transpose_bool_heter_gpu) -# transpose heter -_event_csrmv_transpose_heter_p = _define_op(_event_csr_matvec_transpose_heter_cpu, - _event_csr_matvec_transpose_heter_gpu) + # transpose heter + _event_csrmv_transpose_heter_p = _define_op(_event_csr_matvec_transpose_heter_cpu, + _event_csr_matvec_transpose_heter_gpu) -# not transpose bool heter -_event_csrmv_bool_heter_p = _define_op(_event_csr_matvec_bool_heter_cpu, _event_csr_matvec_bool_heter_gpu) + # not transpose bool heter + _event_csrmv_bool_heter_p = _define_op(_event_csr_matvec_bool_heter_cpu, + _event_csr_matvec_bool_heter_gpu) -# not transpose heter -_event_csrmv_heter_p = _define_op(_event_csr_matvec_heter_cpu, _event_csr_matvec_heter_gpu) + # not transpose heter + _event_csrmv_heter_p = _define_op(_event_csr_matvec_heter_cpu, + _event_csr_matvec_heter_gpu) diff --git a/brainpy/_src/math/event/_info_collection.py b/brainpy/_src/math/event/_info_collection.py deleted file mode 100644 index 7bb043e3e..000000000 --- a/brainpy/_src/math/event/_info_collection.py +++ /dev/null @@ -1,198 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Tuple, Union - -import jax -import numba -from jax import dtypes, numpy as jnp -from jax.core import ShapedArray -from jax.lib import xla_client - -from brainpy._src.dependency_check import import_brainpylib_gpu_ops -from brainpy._src.dependency_check import import_taichi -from brainpy._src.math.interoperability import as_jax -from brainpy._src.math.ndarray import Array -from brainpy._src.math.op_register.base import XLACustomOp -from brainpy.errors import GPUOperatorNotFound - -ti = import_taichi() - -__all__ = [ - 'info' -] - - -def info(events: Union[Array, jax.Array]) -> Tuple[jax.Array, jax.Array]: - """Collect event information, including event indices, and event number. - - This function supports JAX transformations, including `jit()`, - `vmap()` and `pmap()`. - - Parameters - ---------- - events: jax.Array - The events. - - Returns - ------- - res: tuple - A tuple with two elements, denoting the event indices and the event number. 
- """ - events = as_jax(events) - if events.ndim != 1: - raise TypeError('Only support 1D boolean vector.') - return event_info_p(events) - - -def _batch_event_info_abstract(events): - assert events.ndim == 2 - # assert events.dtype == jnp.bool_ - event_ids = ShapedArray(dtype=dtypes.canonicalize_dtype(int), shape=events.shape) - event_num = ShapedArray(dtype=dtypes.canonicalize_dtype(int), shape=(events.shape[0],)) - return event_ids, event_num - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _batch_event_info(outs, ins): - event_ids, event_num = outs - event_num.fill(0) - event_ids.fill(-1) - events = ins - for batch_idx in range(event_ids.shape[0]): - num = 0 - for i in range(event_ids.shape[1]): - if events[batch_idx, i]: - event_ids[batch_idx, num] = i - num += 1 - event_num[batch_idx] = num - - -@ti.kernel -def _batch_event_info_taichi(events: ti.types.ndarray(ndim=2), - event_ids: ti.types.ndarray(ndim=2), - event_num: ti.types.ndarray(ndim=1)): - for i, j in ti.grouped(ti.ndrange(event_ids.shape)): - event_ids[i, j] = -1 - for batch_idx in range(event_ids.shape[0]): - num = 0 - for i in range(event_ids.shape[1]): - if events[batch_idx, i]: - event_ids[batch_idx, num] = i - num += 1 - event_num[batch_idx] = num - - -def _batch_event_info_batching_rule(args, axes): - arg = jnp.moveaxis(args[0], axes[0], 0) - shape = arg.shape - arg = jnp.reshape(arg, (shape[0] * shape[1], shape[2])) - event_ids, event_num = batch_event_info_p(arg) - return ((jnp.reshape(event_ids, shape), jnp.reshape(event_num, shape[:2])), - (0, 0)) - - -def _event_info_gpu_translation(c, events): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(event_info_p.name) - - e_shape = c.get_shape(events).dimensions() - e_type = c.get_shape(events).element_type() - if len(e_shape) == 1: - event_size = e_shape[0] - batch_size = 1 - event_ids_shape = xla_client.Shape.array_shape(dtypes.canonicalize_dtype(int), - (event_size,), - (0,)) - else: - batch_size, event_size = e_shape - event_ids_shape = xla_client.Shape.array_shape(dtypes.canonicalize_dtype(int), - (batch_size, event_size), - (1, 0)) - event_num_shape = xla_client.Shape.array_shape(dtypes.canonicalize_dtype(int), - (batch_size,), - (0,)) - opaque = gpu_ops.build_nonzero_descriptor(event_size, batch_size) - - if e_type == jnp.bool_: - type_name = b'_bool' - elif e_type == jnp.int32: - type_name = b'_int' - elif e_type == jnp.int64: - type_name = b'_long' - elif e_type == jnp.float32: - type_name = b'_float' - elif e_type == jnp.float64: - type_name = b'_double' - else: - raise ValueError - - return xla_client.ops.CustomCallWithLayout( - c, - b'nonzero' + type_name, - operands=(events,), - operand_shapes_with_layout=(c.get_shape(events),), - shape_with_layout=xla_client.Shape.tuple_shape((event_ids_shape, event_num_shape)), - opaque=opaque, - ) - - -batch_event_info_p = XLACustomOp( - name='batched_event_info', - cpu_kernel=_batch_event_info_taichi, - gpu_kernel=_batch_event_info_taichi, - outs=_batch_event_info_abstract, -) -batch_event_info_p.def_batching_rule(_batch_event_info_batching_rule) - - -def _event_info_abstract(events, **kwargs): - assert events.ndim == 1 - # assert events.dtype == jnp.bool_ - event_ids = ShapedArray(dtype=dtypes.canonicalize_dtype(int), shape=events.shape) - event_num = ShapedArray(dtype=dtypes.canonicalize_dtype(int), shape=(1,)) - return event_ids, event_num - - -# TODO: first parallel evaluate the sub-sections, then serially event the sub-results. 
-@numba.njit(fastmath=True) -def _event_info(outs, ins): - event_ids, event_num = outs - event_num.fill(0) - event_ids.fill(-1) - events = ins - num = 0 - for i in range(event_ids.shape[0]): - if events[i]: - event_ids[num] = i - num += 1 - event_num[0] = num - - -@ti.kernel -def _event_info_taichi(events: ti.types.ndarray(ndim=1), - event_ids: ti.types.ndarray(ndim=1), - event_num: ti.types.ndarray(ndim=1)): - for i in range(event_ids.shape[0]): - event_ids[i] = -1 - num = 0 - for i in range(event_ids.shape[0]): - if events[i]: - event_ids[num] = i - num += 1 - event_num[0] = num - - -def _event_info_batching_rule(args, axes): - arg = jnp.moveaxis(args[0], axes[0], 0) - return (batch_event_info_p(arg), (0, 0)) - - -event_info_p = XLACustomOp( - name='event_info', - cpu_kernel=_event_info_taichi, - gpu_kernel=_event_info_taichi, - outs=_event_info_abstract, - # gpu_func_translation=_event_info_gpu_translation, -) -event_info_p.def_batching_rule(_event_info_batching_rule) diff --git a/brainpy/_src/math/event/tests/event_info_VS_jax_operators.py b/brainpy/_src/math/event/tests/event_info_VS_jax_operators.py deleted file mode 100644 index 74cc6b7f9..000000000 --- a/brainpy/_src/math/event/tests/event_info_VS_jax_operators.py +++ /dev/null @@ -1,275 +0,0 @@ -from time import time - -from jax import jit, vmap, numpy as jnp - -import brainpy.math as bm - - -def compare_argsort_and_sum(platform='cpu'): - """ - CPU - --- - - shape = (100, 10000) - brainpylib 0.1872694492340088 s - JAX argsort + sum 5.297466516494751 s - - shape = (100, 100000) - brainpylib 2.333505153656006 s - JAX argsort + sum 65.20281910896301 s - - shape = (1000, 10000) - brainpylib 2.0739688873291016 s - JAX argsort + sum 53.70602822303772 s - - shape = (10000, 1000) - brainpylib 1.7262670993804932 s - JAX argsort + sum 43.92174816131592 s - - GPU - --- - shape = (100, 100000) - brainpylib 0.14670848846435547 s - JAX argsort + sum 1.001936435699463 s - - shape = (100, 1000000) - brainpylib 0.27660632133483887 s - JAX argsort + sum 16.390073776245117 s - - shape = (1000, 100000) - brainpylib 0.2619345188140869 s - JAX argsort + sum 9.715844869613647 s - - shape = (1000, 500000) - brainpylib 1.201209306716919 s - JAX argsort + sum 71.19761657714844 s - - """ - - bm.set_platform(platform) - - rng = bm.random.RandomState(123) - bp_event_info = jit(vmap(bm.event.info)) - jax_event_info = jit(vmap(lambda events: (jnp.argsort(events), jnp.sum(events)))) - - if platform == 'cpu': - all_shapes = [ - (100, 10000), - (100, 100000), - (1000, 10000), - (10000, 1000), - ] - else: - all_shapes = [ - (100, 100000), - (100, 1000000), - (1000, 100000), - (1000, 500000), - ] - - for shape in all_shapes: - print(f'shape = {shape}') - - events = rng.random(shape).value < 0.1 - event_ids1, event_num1 = bp_event_info(events) - event_ids2, event_num2 = jax_event_info(events) - assert jnp.allclose(event_num1, event_num2) - event_ids1.block_until_ready() - event_ids2.block_until_ready() - - t0 = time() - for _ in range(100): - a, b = bp_event_info(events) - r = a.block_until_ready() - print(f'brainpylib {time() - t0} s') - - t0 = time() - for _ in range(100): - a, b = jax_event_info(events) - r = a.block_until_ready() - print(f'JAX argsort + sum {time() - t0} s') - - print() - - -def compare_argsort(platform='cpu'): - """ - - CPU - --- - - shape = (100, 10000) - brainpylib 0.19738531112670898 s - JAX argsort 5.301469087600708 s - - shape = (100, 100000) - brainpylib 2.3321938514709473 s - JAX argsort 65.13460850715637 s - - shape = (1000, 10000) - 
brainpylib 2.0956876277923584 s - JAX argsort 53.863110065460205 s - - shape = (10000, 1000) - brainpylib 1.7127799987792969 s - JAX argsort 44.05547475814819 s - - GPU - --- - shape = (100, 100000) - brainpylib 0.1415419578552246 s - JAX argsort 0.9982438087463379 s - - shape = (100, 1000000) - brainpylib 0.3224947452545166 s - JAX argsort 16.504750967025757 s - - shape = (1000, 100000) - brainpylib 0.2781648635864258 s - JAX argsort 9.691488981246948 s - - shape = (1000, 500000) - brainpylib 1.2167487144470215 s - JAX argsort 71.68716263771057 s - - """ - - bm.set_platform(platform) - - rng = bm.random.RandomState(123) - bp_event_info = jit(vmap(bm.event.info)) - jax_event_info = jit(vmap(lambda events: jnp.argsort(events))) - - if platform == 'cpu': - all_shapes = [ - (100, 10000), - (100, 100000), - (1000, 10000), - (10000, 1000), - ] - else: - all_shapes = [ - (100, 100000), - (100, 1000000), - (1000, 100000), - (1000, 500000), - ] - - for shape in all_shapes: - print(f'shape = {shape}') - - events = rng.random(shape).value < 0.1 - event_ids1, event_num1 = bp_event_info(events) - event_ids1.block_until_ready() - event_ids2 = jax_event_info(events) - event_ids2.block_until_ready() - - t0 = time() - for _ in range(100): - a, b = bp_event_info(events) - r = a.block_until_ready() - print(f'brainpylib {time() - t0} s') - - t0 = time() - for _ in range(100): - a = jax_event_info(events) - r = a.block_until_ready() - print(f'JAX argsort {time() - t0} s') - - print() - - -def compare_where(platform='cpu'): - """ - - CPU - --- - - shape = (100, 10000) - brainpylib 0.20480966567993164 s - JAX where 0.7068588733673096 s - - shape = (100, 100000) - brainpylib 2.3373026847839355 s - JAX where 5.862265348434448 s - - shape = (1000, 10000) - brainpylib 2.105764865875244 s - JAX where 5.914586067199707 s - - shape = (10000, 1000) - brainpylib 1.724682331085205 s - JAX where 5.718563795089722 s - - GPU - --- - shape = (100, 100000) - brainpylib 0.15492558479309082 s - JAX where 0.3146538734436035 s - - shape = (100, 1000000) - brainpylib 0.3290700912475586 s - JAX where 1.7064015865325928 s - - shape = (1000, 100000) - brainpylib 0.2895216941833496 s - JAX where 1.6910102367401123 s - - shape = (1000, 500000) - brainpylib 1.173649787902832 s - JAX where 7.868000268936157 s - - """ - - bm.set_platform(platform) - - rng = bm.random.RandomState(123) - bp_event_info = jit(vmap(bm.event.info)) - jax_event_info = jit(vmap(lambda events: jnp.where(events, size=events.shape[0]))) - - if platform == 'cpu': - all_shapes = [ - (100, 10000), - (100, 100000), - (1000, 10000), - (10000, 1000), - ] - else: - all_shapes = [ - (100, 100000), - (100, 1000000), - (1000, 100000), - (1000, 500000), - ] - - for shape in all_shapes: - print(f'shape = {shape}') - - events = rng.random(shape).value < 0.1 - event_ids1, event_num1 = bp_event_info(events) - event_ids1.block_until_ready() - event_ids2, = jax_event_info(events) - event_ids2.block_until_ready() - - t0 = time() - for _ in range(100): - a, b = bp_event_info(events) - r = a.block_until_ready() - print(f'brainpylib {time() - t0} s') - - t0 = time() - for _ in range(100): - a, = jax_event_info(events) - r = a.block_until_ready() - print(f'JAX where {time() - t0} s') - - print() - - -if __name__ == '__main__': - # compare_argsort_and_sum('cpu') - # compare_argsort_and_sum('gpu') - # compare_argsort('cpu') - compare_argsort('gpu') - # compare_where('cpu') - # compare_where('gpu') diff --git a/brainpy/_src/math/event/tests/test_event_csrmv.py 
b/brainpy/_src/math/event/tests/test_event_csrmv.py index e0f38490f..67e09d0a4 100644 --- a/brainpy/_src/math/event/tests/test_event_csrmv.py +++ b/brainpy/_src/math/event/tests/test_event_csrmv.py @@ -4,11 +4,18 @@ from functools import partial import jax +import pytest from absl.testing import parameterized import brainpy as bp import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + + seed = 1234 diff --git a/brainpy/_src/math/event/tests/test_event_csrmv_old.py b/brainpy/_src/math/event/tests/test_event_csrmv_old.py deleted file mode 100644 index 31a6527a2..000000000 --- a/brainpy/_src/math/event/tests/test_event_csrmv_old.py +++ /dev/null @@ -1,324 +0,0 @@ -# -*- coding: utf-8 -*- - - -from functools import partial - -import jax -from absl.testing import parameterized - -import brainpy as bp -import brainpy.math as bm -import platform - -import pytest -pytest.skip('Old implementation.', allow_module_level=True) - -is_manual_test = False -# if platform.system() == 'Windows' and not is_manual_test: -# pytest.skip('brainpy.math package may need manual tests.', allow_module_level=True) - -brainpylib_csr_matvec = partial(bm.event.csrmv, method='brainpylib') -taichi_csr_matvec = partial(bm.event.csrmv, method='taichi') - -def sum_op(op): - def func(*args, **kwargs): - r = op(*args, **kwargs) - return r.sum() - - return func - - -class Test_event_csr_matvec(parameterized.TestCase): - def __init__(self, *args, platform='cpu', **kwargs): - super(Test_event_csr_matvec, self).__init__(*args, **kwargs) - bm.set_platform(platform) - print() - - @parameterized.named_parameters( - dict( - testcase_name=f'transpose={transpose}, shape={shape}, homo_data={homo_data}', - transpose=transpose, - shape=shape, - homo_data=homo_data, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (10000, 2)] - for homo_data in [-1., 0., 1.] - ) - def test_homo(self, shape, transpose, homo_data): - print(f'test_homo: shape = {shape}, transpose = {transpose}, homo_data = {homo_data}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - events = rng.random(shape[0] if transpose else shape[1]) < 0.1 - heter_data = bm.ones(indices.shape) * homo_data - - r1 = brainpylib_csr_matvec(homo_data, indices, indptr, events, shape=shape, transpose=transpose) - r2 = brainpylib_csr_matvec(heter_data, indices, indptr, events, shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r2)) - - r3 = brainpylib_csr_matvec(homo_data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r3)) - - dense = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape) - r4 = (events @ dense) if transpose else (dense @ events) - self.assertTrue(bm.allclose(r1, r4)) - - r5 = brainpylib_csr_matvec(heter_data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r5)) - - bm.clear_buffer_memory() - - @parameterized.named_parameters( - dict( - testcase_name=f'transpose={transpose}, shape={shape}, homo_data={homo_data}', - transpose=transpose, - shape=shape, - homo_data=homo_data, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (100000, 2)] - for homo_data in [-1., 0., 1.] 
- ) - def test_homo_vmap(self, shape, transpose, homo_data): - print(f'test_homo_vamp: shape = {shape}, transpose = {transpose}, homo_data = {homo_data}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - - # vmap 'data' - events = bm.as_jax(rng.random(shape[0] if transpose else shape[1])) < 0.1 - f1 = jax.vmap(partial(brainpylib_csr_matvec, indices=indices, indptr=indptr, events=events, - shape=shape, transpose=transpose)) - f2 = jax.vmap( - partial(partial(bm.sparse.csrmv, method='cusparse'), indices=indices, indptr=indptr, vector=events.astype(float), - shape=shape, transpose=transpose)) - vmap_data = bm.as_jax([homo_data] * 10) - self.assertTrue(bm.allclose(f1(vmap_data), f2(vmap_data))) - - # vmap 'events' - f3 = jax.vmap(partial(brainpylib_csr_matvec, homo_data, indices, indptr, - shape=shape, transpose=transpose)) - f4 = jax.vmap(partial(partial(bm.sparse.csrmv, method='cusparse'), homo_data, indices, indptr, - shape=shape, transpose=transpose)) - vmap_data = bm.as_jax(rng.random((10, shape[0] if transpose else shape[1]))) < 0.1 - self.assertTrue(bm.allclose(f3(vmap_data), f4(vmap_data.astype(float)))) - - # vmap 'data' and 'events' - f5 = jax.vmap(lambda dd, ee: brainpylib_csr_matvec(dd, indices, indptr, ee, shape=shape, transpose=transpose)) - f6 = jax.vmap(lambda dd, ee: bm.sparse.csrmv(dd, indices, indptr, ee, shape=shape, transpose=transpose, - method='cusparse')) - vmap_data1 = bm.as_jax([homo_data] * 10) - vmap_data2 = bm.as_jax(rng.random((10, shape[0] if transpose else shape[1]))) < 0.2 - self.assertTrue(bm.allclose(f5(vmap_data1, vmap_data2), - f6(vmap_data1, vmap_data2.astype(float)))) - - bm.clear_buffer_memory() - - @parameterized.named_parameters( - dict( - testcase_name=f'transpose={transpose},shape={shape},homo_data={homo_data}', - homo_data=homo_data, - shape=shape, - transpose=transpose, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (100000, 2)] - for homo_data in [-1., 0., 1.] 
- ) - def test_homo_grad(self, shape, transpose, homo_data): - print(f'test_homo_grad: shape = {shape}, transpose = {transpose}, homo_data = {homo_data}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - events = bm.as_jax(rng.random(shape[0] if transpose else shape[1])) < 0.1 - dense_conn = bm.sparse.csr_to_dense(bm.ones(indices.shape).value, indices, indptr, shape=shape) - - # grad 'data' - r1 = jax.grad(sum_op(brainpylib_csr_matvec))( - homo_data, indices, indptr, events, shape=shape, transpose=transpose) - r2 = jax.grad(sum_op(partial(bm.sparse.csrmv, method='cusparse')))( - homo_data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r2)) - r3 = jax.grad(sum_op(lambda a: (events @ (dense_conn * a) if transpose else - ((dense_conn * a) @ events))))(homo_data) - self.assertTrue(bm.allclose(r1, r3)) - - # grad 'events' - r4 = jax.grad(sum_op(brainpylib_csr_matvec), argnums=3)( - homo_data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - r5 = jax.grad(sum_op(partial(bm.sparse.csrmv, method='cusparse')), argnums=3)( - homo_data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - r6 = jax.grad(sum_op(lambda e: (e @ (dense_conn * homo_data) if transpose else - ((dense_conn * homo_data) @ e))))(events.astype(float)) - self.assertTrue(bm.allclose(r4, r5)) - self.assertTrue(bm.allclose(r4, r6)) - - bm.clear_buffer_memory() - - @parameterized.named_parameters( - dict( - testcase_name=f'transpose={transpose}, shape={shape}', - shape=shape, - transpose=transpose, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (10000, 2)] - ) - def test_heter(self, shape, transpose): - print(f'test_heter: shape = {shape}, transpose = {transpose}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - events = bm.as_jax(rng.random(shape[0] if transpose else shape[1])) < 0.1 - heter_data = bm.as_jax(rng.random(indices.shape)) - - r1 = brainpylib_csr_matvec(heter_data, indices, indptr, events, - shape=shape, transpose=transpose) - r2 = partial(bm.sparse.csrmv, method='cusparse')(heter_data, indices, indptr, events.astype(float), - shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r2)) - - dense = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape) - r3 = (events @ dense) if transpose else (dense @ events) - self.assertTrue(bm.allclose(r1, r3)) - - r4 = brainpylib_csr_matvec(heter_data, indices, indptr, events.astype(float), - shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r4)) - - bm.clear_buffer_memory() - - @parameterized.named_parameters( - dict( - testcase_name=f"transpose={transpose}, shape={shape}", - shape=shape, - transpose=transpose, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (100000, 2)] - ) - def test_heter_vmap(self, shape, transpose): - print(f'test_heter_vamp: shape = {shape}, transpose = {transpose}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - - # vmap 'data' - events = bm.as_jax(rng.random(shape[0] if transpose else 
shape[1])) < 0.1 - f1 = jax.vmap(partial(brainpylib_csr_matvec, indices=indices, indptr=indptr, events=events, - shape=shape, transpose=transpose)) - f2 = jax.vmap( - partial(partial(bm.sparse.csrmv, method='cusparse'), indices=indices, indptr=indptr, vector=events.astype(float), - shape=shape, transpose=transpose)) - vmap_data = bm.as_jax(rng.random((10, indices.shape[0]))) - self.assertTrue(bm.allclose(f1(vmap_data), f2(vmap_data))) - - # vmap 'events' - data = bm.as_jax(rng.random(indices.shape)) - f3 = jax.vmap(partial(brainpylib_csr_matvec, data, indices, indptr, - shape=shape, transpose=transpose)) - f4 = jax.vmap(partial(partial(bm.sparse.csrmv, method='cusparse'), data, indices, indptr, - shape=shape, transpose=transpose)) - vmap_data = bm.as_jax(rng.random((10, shape[0] if transpose else shape[1]))) < 0.1 - self.assertTrue(bm.allclose(f3(vmap_data), f4(vmap_data.astype(float)))) - - # vmap 'data' and 'events' - f5 = jax.vmap(lambda dd, ee: brainpylib_csr_matvec(dd, indices, indptr, ee, - shape=shape, transpose=transpose)) - f6 = jax.vmap(lambda dd, ee: partial(bm.sparse.csrmv, method='cusparse')(dd, indices, indptr, ee, - shape=shape, transpose=transpose)) - vmap_data1 = bm.as_jax(rng.random((10, indices.shape[0]))) - vmap_data2 = bm.as_jax(rng.random((10, shape[0] if transpose else shape[1]))) < 0.2 - self.assertTrue(bm.allclose(f5(vmap_data1, vmap_data2), - f6(vmap_data1, vmap_data2.astype(float)))) - - bm.clear_buffer_memory() - - @parameterized.named_parameters( - dict(testcase_name=f'transpose={transpose},shape={shape}', - shape=shape, - transpose=transpose, - ) - for transpose in [True, False] - for shape in [(100, 200), - (200, 200), - (200, 100), - (10, 1000), - (2, 10000), - (1000, 10), - (100000, 2)] - ) - def test_heter_grad(self, shape, transpose): - print(f'test_heter_grad: shape = {shape}, transpose = {transpose}') - - rng = bm.random.RandomState() - indices, indptr = bp.conn.FixedProb(0.4)(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - events = rng.random(shape[0] if transpose else shape[1]) < 0.1 - events = bm.as_jax(events) - dense_conn = bm.sparse.csr_to_dense(bm.ones(indices.shape).value, indices, indptr, shape=shape) - - # grad 'data' - data = bm.as_jax(rng.random(indices.shape)) - r1 = jax.grad(sum_op(brainpylib_csr_matvec))( - data, indices, indptr, events, shape=shape, transpose=transpose) - r2 = jax.grad(sum_op(partial(bm.sparse.csrmv, method='cusparse')))( - data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r2)) - - dense_data = bm.sparse.csr_to_dense(data, indices, indptr, shape=shape) - r3 = jax.grad(sum_op(lambda a: ((events @ a) if transpose else - (a @ events))))(dense_data) - rows, cols = bm.sparse.csr_to_coo(indices, indptr) - r3 = r3[rows, cols] - self.assertTrue(bm.allclose(r1, r3)) - - # grad 'events' - r4 = jax.grad(sum_op(brainpylib_csr_matvec), argnums=3)( - data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - r5 = jax.grad(sum_op(partial(bm.sparse.csrmv, method='cusparse')), argnums=3)( - data, indices, indptr, events.astype(float), shape=shape, transpose=transpose) - r6 = jax.grad(sum_op(lambda e: ((e @ dense_data) if transpose else - (dense_data @ e))))(events.astype(float)) - self.assertTrue(bm.allclose(r4, r5)) - self.assertTrue(bm.allclose(r4, r6)) - - bm.clear_buffer_memory() diff --git a/brainpy/_src/math/event/tests/test_info.py b/brainpy/_src/math/event/tests/test_info.py deleted file mode 100644 index 
c326b0f76..000000000 --- a/brainpy/_src/math/event/tests/test_info.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- - -import jax.numpy as jnp -import unittest - -import brainpy.math as bm -from jax import vmap - -import pytest - - -class Test_event_info(unittest.TestCase): - def __init__(self, *args, platform='cpu', **kwargs): - super(Test_event_info, self).__init__(*args, **kwargs) - - print() - bm.set_platform(platform) - - def _base_test(self, length): - print(f'{self._base_test.__name__}: length = {length}') - - rng = bm.random.RandomState() - events = bm.as_jax(rng.random(length)) < 0.1 - event_ids, event_num = bm.event.info(events) - self.assertTrue(jnp.allclose(jnp.sum(events, keepdims=True), event_num)) - - bm.clear_buffer_memory() - - def _base_vmap(self, length): - print(f'{self._base_vmap.__name__}: length = {length}') - - rng = bm.random.RandomState() - events = bm.as_jax(rng.random((10, length))) < 0.1 - event_ids, event_num = vmap(bm.event.info)(events) - self.assertTrue(jnp.allclose(jnp.sum(events, axis=-1), event_num)) - - bm.clear_buffer_memory() - - def _base_vmap_vmap(self, length): - print(f'{self._base_vmap_vmap.__name__}: length = {length}') - - rng = bm.random.RandomState() - events = bm.as_jax(rng.random((10, length))) < 0.1 - event_ids, event_num = vmap(vmap(bm.event.info))(events) - self.assertTrue(jnp.allclose(jnp.sum(events, axis=-1), event_num)) - - bm.clear_buffer_memory() - - def test(self): - for length in [1, 3, 8, 10, 100, 200, 500, 1000, 10000, 100000]: - self._base_test(length) - - def test_vmap(self): - for length in [1, 3, 8, 10, 100, 200, 500, 1000, 10000, 100000]: - self._base_test(length) - - def test_vmap_vmap(self): - for length in [1, 3, 8, 10, 100, 200, 500, 1000, 10000, 100000]: - self._base_test(length) - - - diff --git a/brainpy/_src/math/event/tests/test_info_gpu.py b/brainpy/_src/math/event/tests/test_info_gpu.py deleted file mode 100644 index 55bdd15cd..000000000 --- a/brainpy/_src/math/event/tests/test_info_gpu.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -import jax -import pytest - -import test_info - -if jax.default_backend() != 'gpu': - pytest.skip("No gpu available.", allow_module_level=True) - - -class Test_event_info_GPU(test_info.Test_event_info): - def __init__(self, *args, **kwargs): - super(Test_event_info_GPU, self).__init__(*args, **kwargs, platform='gpu') diff --git a/brainpy/_src/math/index_tricks.py b/brainpy/_src/math/index_tricks.py deleted file mode 100644 index 6c71b4b06..000000000 --- a/brainpy/_src/math/index_tricks.py +++ /dev/null @@ -1,305 +0,0 @@ -# -*- coding: utf-8 -*- - -import abc - -from jax import core -from .compat_numpy import arange, array, concatenate, expand_dims, linspace, meshgrid, stack, transpose -import numpy as np - -__all__ = ["c_", "index_exp", "mgrid", "ogrid", "r_", "s_"] - - -def _make_1d_grid_from_slice(s: slice, op_name: str): - start = core.concrete_or_error(None, s.start, - f"slice start of jnp.{op_name}") or 0 - stop = core.concrete_or_error(None, s.stop, - f"slice stop of jnp.{op_name}") - step = core.concrete_or_error(None, s.step, - f"slice step of jnp.{op_name}") or 1 - if np.iscomplex(step): - newobj = linspace(start, stop, int(abs(step))) - else: - newobj = arange(start, stop, step) - - return newobj - - -class _IndexGrid(abc.ABC): - """Creates multi-dimensional grids of indices.""" - sparse: bool - op_name: str - - def __getitem__(self, key): - if isinstance(key, slice): - return _make_1d_grid_from_slice(key, op_name=self.op_name) - output = 
(_make_1d_grid_from_slice(k, op_name=self.op_name) for k in key) - output = meshgrid(*output, indexing='ij', sparse=self.sparse) - return output if self.sparse else stack(output, 0) - - -class _Mgrid(_IndexGrid): - """Return dense multi-dimensional "meshgrid". - - LAX-backend implementation of :obj:`numpy.mgrid`. This is a convenience wrapper for - functionality provided by :func:`jax.numpy.meshgrid` with ``sparse=False``. - - See Also: - jnp.ogrid: open/sparse version of jnp.mgrid - - Examples: - Pass ``[start:stop:step]`` to generate values similar to :func:`jax.numpy.arange`: - - >>> import brainpy.math as bm - >>> bm.mgrid[0:4:1] - DeviceArray([0, 1, 2, 3], dtype=int32) - - Passing an imaginary step generates values similar to :func:`jax.numpy.linspace`: - - >>> bm.mgrid[0:1:4j] - DeviceArray([0. , 0.33333334, 0.6666667 , 1. ], dtype=float32) - - Multiple slices can be used to create broadcasted grids of indices: - - >>> bm.mgrid[:2, :3] - DeviceArray([[[0, 0, 0], - [1, 1, 1]], - [[0, 1, 2], - [0, 1, 2]]], dtype=int32) - """ - sparse = False - op_name = "mgrid" - - -mgrid = _Mgrid() - - -class _Ogrid(_IndexGrid): - """Return open multi-dimensional "meshgrid". - - LAX-backend implementation of :obj:`numpy.ogrid`. This is a convenience wrapper for - functionality provided by :func:`jax.numpy.meshgrid` with ``sparse=True``. - - See Also: - jnp.mgrid: dense version of jnp.ogrid - - Examples: - Pass ``[start:stop:step]`` to generate values similar to :func:`jax.numpy.arange`: - - >>> bm.ogrid[0:4:1] - DeviceArray([0, 1, 2, 3], dtype=int32) - - Passing an imaginary step generates values similar to :func:`jax.numpy.linspace`: - - >>> bm.ogrid[0:1:4j] - DeviceArray([0. , 0.33333334, 0.6666667 , 1. ], dtype=float32) - - Multiple slices can be used to create sparse grids of indices: - - >>> bm.ogrid[:2, :3] - [DeviceArray([[0], - [1]], dtype=int32), - DeviceArray([[0, 1, 2]], dtype=int32)] - """ - sparse = True - op_name = "ogrid" - - -ogrid = _Ogrid() - - -class _AxisConcat(abc.ABC): - """Concatenates slices, scalars and array-like objects along a given axis.""" - axis: int - ndmin: int - trans1d: int - op_name: str - - def __getitem__(self, key): - if not isinstance(key, tuple): - key = (key,) - - params = [self.axis, self.ndmin, self.trans1d, -1] - - if isinstance(key[0], str): - # split off the directive - directive, *key = key # pytype: disable=bad-unpacking - # check two special cases: matrix directives - if directive == "r": - params[-1] = 0 - elif directive == "c": - params[-1] = 1 - else: - vec = directive.split(",") - k = len(vec) - if k < 4: - vec += params[k:] - else: - # ignore everything after the first three comma-separated ints - vec = vec[:3] + params[-1] - try: - params = list(map(int, vec)) - except ValueError as err: - raise ValueError( - "could not understand directive {!r}".format(directive) - ) from err - - axis, ndmin, trans1d, matrix = params - - output = [] - for item in key: - if isinstance(item, slice): - newobj = _make_1d_grid_from_slice(item, op_name=self.op_name) - elif isinstance(item, str): - raise ValueError("string directive must be placed at the beginning") - else: - newobj = item - - newobj = array(newobj, copy=False, ndmin=ndmin) - - if trans1d != -1 and ndmin - np.ndim(item) > 0: - shape_obj = list(range(ndmin)) - # Calculate number of left shifts, with overflow protection by mod - num_lshifts = ndmin - abs(ndmin + trans1d + 1) % ndmin - shape_obj = tuple(shape_obj[num_lshifts:] + shape_obj[:num_lshifts]) - - newobj = transpose(newobj, shape_obj) - - 
output.append(newobj) - - res = concatenate(tuple(output), axis=axis) - - if matrix != -1 and res.ndim == 1: - # insert 2nd dim at axis 0 or 1 - res = expand_dims(res, matrix) - - return res - - def __len__(self): - return 0 - - -class RClass(_AxisConcat): - """Concatenate slices, scalars and array-like objects along the first axis. - - LAX-backend implementation of :obj:`numpy.r_`. - - See Also: - ``jnp.c_``: Concatenates slices, scalars and array-like objects along the last axis. - - Examples: - Passing slices in the form ``[start:stop:step]`` generates ``jnp.arange`` objects: - - >>> bm.r_[-1:5:1, 0, 0, bm.array([1,2,3])] - DeviceArray([-1, 0, 1, 2, 3, 4, 0, 0, 1, 2, 3], dtype=int32) - - An imaginary value for ``step`` will create a ``jnp.linspace`` object instead, - which includes the right endpoint: - - >>> bm.r_[-1:1:6j, 0, bm.array([1,2,3])] - DeviceArray([-1. , -0.6 , -0.20000002, 0.20000005, - 0.6 , 1. , 0. , 1. , - 2. , 3. ], dtype=float32) - - Use a string directive of the form ``"axis,dims,trans1d"`` as the first argument to - specify concatenation axis, minimum number of dimensions, and the position of the - upgraded array's original dimensions in the resulting array's shape tuple: - - >>> bm.r_['0,2', [1,2,3], [4,5,6]] # concatenate along first axis, 2D output - DeviceArray([[1, 2, 3], - [4, 5, 6]], dtype=int32) - - >>> bm.r_['0,2,0', [1,2,3], [4,5,6]] # push last input axis to the front - DeviceArray([[1], - [2], - [3], - [4], - [5], - [6]], dtype=int32) - - Negative values for ``trans1d`` offset the last axis towards the start - of the shape tuple: - - >>> bm.r_['0,2,-2', [1,2,3], [4,5,6]] - DeviceArray([[1], - [2], - [3], - [4], - [5], - [6]], dtype=int32) - - Use the special directives ``"r"`` or ``"c"`` as the first argument on flat inputs - to create an array with an extra row or column axis, respectively: - - >>> bm.r_['r',[1,2,3], [4,5,6]] - DeviceArray([[1, 2, 3, 4, 5, 6]], dtype=int32) - - >>> bm.r_['c',[1,2,3], [4,5,6]] - DeviceArray([[1], - [2], - [3], - [4], - [5], - [6]], dtype=int32) - - For higher-dimensional inputs (``dim >= 2``), both directives ``"r"`` and ``"c"`` - give the same result. - """ - axis = 0 - ndmin = 1 - trans1d = -1 - op_name = "r_" - - -r_ = RClass() - - -class CClass(_AxisConcat): - """Concatenate slices, scalars and array-like objects along the last axis. - - LAX-backend implementation of :obj:`numpy.c_`. - - See Also: - ``jnp.r_``: Concatenates slices, scalars and array-like objects along the first axis. 
- - Examples: - - >>> a = bm.arange(6).reshape((2,3)) - >>> bm.c_[a,a] - DeviceArray([[0, 1, 2, 0, 1, 2], - [3, 4, 5, 3, 4, 5]], dtype=int32) - - Use a string directive of the form ``"axis:dims:trans1d"`` as the first argument to specify - concatenation axis, minimum number of dimensions, and the position of the upgraded array's - original dimensions in the resulting array's shape tuple: - - >>> bm.c_['0,2', [1,2,3], [4,5,6]] - DeviceArray([[1], - [2], - [3], - [4], - [5], - [6]], dtype=int32) - - >>> bm.c_['0,2,-1', [1,2,3], [4,5,6]] - DeviceArray([[1, 2, 3], - [4, 5, 6]], dtype=int32) - - Use the special directives ``"r"`` or ``"c"`` as the first argument on flat inputs - to create an array with inputs stacked along the last axis: - - >>> jnp.c_['r',[1,2,3], [4,5,6]] - DeviceArray([[1, 4], - [2, 5], - [3, 6]], dtype=int32) - """ - axis = -1 - ndmin = 2 - trans1d = 0 - op_name = "c_" - - -c_ = CClass() - -s_ = np.s_ - -index_exp = np.index_exp diff --git a/brainpy/_src/math/jitconn/__init__.py b/brainpy/_src/math/jitconn/__init__.py index a79cdc982..6f7cddf6a 100644 --- a/brainpy/_src/math/jitconn/__init__.py +++ b/brainpy/_src/math/jitconn/__init__.py @@ -1,3 +1,2 @@ - -from ._matvec import * -from ._event_matvec import * \ No newline at end of file +from ._matvec import * +from ._event_matvec import * diff --git a/brainpy/_src/math/jitconn/_event_matvec.py b/brainpy/_src/math/jitconn/_event_matvec.py index 3671755a9..ac62bbfaf 100644 --- a/brainpy/_src/math/jitconn/_event_matvec.py +++ b/brainpy/_src/math/jitconn/_event_matvec.py @@ -1,21 +1,14 @@ # -*- coding: utf-8 -*- -from functools import partial from typing import Tuple, Optional import jax import numpy as np -from jax import numpy as jnp, dtypes -from jax.core import ShapedArray, Primitive -from jax.interpreters import xla, ad -from jax.lib import xla_client +from jax import numpy as jnp -from brainpy._src.dependency_check import import_brainpylib_gpu_ops, import_brainpylib_cpu_ops, import_taichi +from brainpy._src.dependency_check import import_taichi from brainpy._src.math.interoperability import as_jax -from brainpy._src.math.jitconn._matvec import (mv_prob_homo_p, - mv_prob_uniform_p, - mv_prob_normal_p, - mv_prob_homo, +from brainpy._src.math.jitconn._matvec import (mv_prob_homo, mv_prob_uniform, mv_prob_normal, _general_checking, @@ -27,11 +20,10 @@ _mv_prob_normal_transpose, _reverse) from brainpy._src.math.ndarray import _get_dtype -from brainpy._src.math.op_register import register_general_batching, XLACustomOp -from brainpy._src.math.tifunc import (lfsr88_key, lfsr88_random_integers, lfsr88_uniform, lfsr88_normal) -from brainpy.errors import GPUOperatorNotFound +from brainpy._src.math.op_register import XLACustomOp +from brainpy.errors import PackageMissingError -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ 'event_mv_prob_homo', @@ -50,8 +42,8 @@ def event_mv_prob_homo( transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - return event_mv_prob_homo_taichi(events, weight, conn_prob, seed, shape=shape, transpose=transpose, - outdim_parallel=outdim_parallel) + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') event_mv_prob_homo.__doc__ = mv_prob_homo.__doc__ @@ -103,8 +95,8 @@ def event_mv_prob_homo_brainpylib( outdim_parallel: bool = True, ) -> jax.Array: events = as_jax(events) - weight = jnp.atleast_1d(as_jax(weight)) - conn_prob = jnp.atleast_1d(as_jax(conn_prob)) + weight = jnp.atleast_1d(jnp.asarray(weight)) + conn_prob 
= jnp.atleast_1d(jnp.asarray(conn_prob)) clen = jnp.asarray(jnp.ceil(1 / conn_prob) * 2 - 1, dtype=jnp.int32) with jax.ensure_compile_time_eval(): if seed is None: @@ -791,19 +783,25 @@ def event_mv_prob_homo_taichi( The output of :math:`y = M @ v`. """ events = as_jax(events) - if isinstance(weight, float): weight = as_jax(weight) - weight = jnp.atleast_1d(as_jax(weight)) + weight = as_jax(weight) + if jnp.ndim(weight) < 1: + weight = jnp.expand_dims(weight, axis=0) conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 conn_len = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) if seed is None: with jax.ensure_compile_time_eval(): seed = np.random.randint(0, int(1e8), 1) seed = jnp.atleast_1d(jnp.asarray(seed, dtype=jnp.uint32)) - return raw_event_mv_prob_homo(events, weight, conn_len, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel)[0] + return raw_event_mv_prob_homo(events, weight, conn_len, seed, + shape=shape, + transpose=transpose, + outdim_parallel=outdim_parallel)[0] + + +event_mv_prob_homo.__doc__ = mv_prob_homo.__doc__ -def event_mv_prob_uniform_taichi( +def event_mv_prob_uniform( events: jax.Array, w_low: float, w_high: float, @@ -814,56 +812,9 @@ def event_mv_prob_uniform_taichi( transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a uniform distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). - - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') - Parameters - ---------- - events: Array, ndarray - The events. - w_low: float - Lower boundary of the output interval. - w_high: float - Upper boundary of the output interval. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
- """ events = as_jax(events) if isinstance(w_low, float): w_low = as_jax(w_low) if isinstance(w_high, float): w_high = as_jax(w_high) @@ -879,7 +830,10 @@ def event_mv_prob_uniform_taichi( transpose=transpose, outdim_parallel=outdim_parallel)[0] -def event_mv_prob_normal_taichi( +event_mv_prob_uniform.__doc__ = mv_prob_uniform.__doc__ + + +def event_mv_prob_normal( events: jax.Array, w_mu: float, w_sigma: float, @@ -890,56 +844,9 @@ def event_mv_prob_normal_taichi( transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a normal distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). - - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. - - Parameters - ---------- - events: Array, ndarray - The events. - w_mu: float - Mean (centre) of the distribution. - w_sigma: float - Standard deviation (spread or “width”) of the distribution. Must be non-negative. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. - """ events = as_jax(events) if isinstance(w_mu, float): w_mu = as_jax(w_mu) if isinstance(w_sigma, float): w_sigma = as_jax(w_sigma) @@ -955,1034 +862,1036 @@ def event_mv_prob_normal_taichi( transpose=transpose, outdim_parallel=outdim_parallel)[0] -# ------------- -# CPU function -# ------------- -# For each non-zero event value, it generates a random key using a -# function lfsr88_key and then uses this key to compute random integers -# and update the out array based on the computed indices and weight. -# -# The function is likely designed to be parallelized. 
- - -@ti.kernel -def _event_mv_prob_homo_bool_cpu( - events: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - if events[i_col]: - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: - out[i_row] += weight0 +event_mv_prob_normal.__doc__ = mv_prob_normal.__doc__ + +if ti is not None: + from brainpy._src.math.tifunc import (lfsr88_key, lfsr88_random_integers, lfsr88_uniform, lfsr88_normal) + + + # ------------- + # CPU function + # ------------- + # For each non-zero event value, it generates a random key using a + # function lfsr88_key and then uses this key to compute random integers + # and update the out array based on the computed indices and weight. + # + # The function is likely designed to be parallelized. + + @ti.kernel + def _event_mv_prob_homo_bool_cpu( + events: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + if events[i_col]: + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + out[i_row] += weight0 + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_homo_outdim_parallel_bool_cpu( + events: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. + key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: + if events[i_col]: + r += weight0 key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_homo_outdim_parallel_bool_cpu( - events: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. - key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: + i_col += inc + out[i_row] = r + + + # ------------- + # GPU function + # ------------- + # Contrary to the CPU functions, for each column, + # this function will 32 threads (one warp) to make + # the just-in-time random generation parallelized. 
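# NOTE (editorial, illustrative only): the GPU kernels below launch 32
# virtual threads (one warp) per event and split the `num_row` candidate
# rows into 32 contiguous slices, which is exactly what the `i >> 5` /
# `i & 31` arithmetic in their bodies encodes. A plain-Python sketch of that
# index decomposition (the helper name `warp_lane_bounds` is assumed, not
# part of this patch):
#
#     def warp_lane_bounds(i, num_row):
#         i_col = i >> 5                     # which event/column thread i serves
#         lane = i & 31                      # warp lane id, 0..31
#         step = max((num_row + 1) >> 5, 1)  # rows assigned to each lane
#         first = step * lane - 1            # cursor position before the first skip
#         end = min(first + step, num_row)   # exclusive bound of this lane's slice
#         return i_col, first, end
#
# Each lane then advances its cursor by random skips of 1..clen and updates
# every row it lands on while the cursor stays below `end`.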
+ + @ti.kernel + def _event_mv_prob_homo_bool_gpu( + events: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 if events[i_col]: - r += weight0 - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r - - -# ------------- -# GPU function -# ------------- -# Contrary to the CPU functions, for each column, -# this function will 32 threads (one warp) to make -# the just-in-time random generation parallelized. - - -@ti.kernel -def _event_mv_prob_homo_bool_gpu( - events: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - if events[i_col]: - index = i & 31 - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: - out[i_row] += weight0 + index = i & 31 + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) i_row += inc - - -@ti.kernel -def _event_mv_prob_homo_outdim_parallel_bool_gpu( - events: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - index = i & 31 - i_col = step * index - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - r += weight0 * events[i_col] # TODO: speed comparison without if else + while i_row < end: + out[i_row] += weight0 + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_homo_outdim_parallel_bool_gpu( + events: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 + index = i & 31 + i_col = step * index - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. + key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -def _reverse(shape): - return shape[::-1] - - -# ------------- -# CPU function -# ------------- -# For each non-zero event value, it generates a random key using a -# function lfsr88_key and then uses this key to compute random integers -# and update the out array based on the computed indices and weight. -# -# The function is likely designed to be parallelized. 
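# NOTE (editorial, illustrative only): the LFSR88-based traversal described
# above never touches unconnected entries. With connection probability p the
# wrapper passes clen = ceil(1 / p) * 2 - 1, so a uniform skip in [1, clen]
# has mean (1 + clen) / 2, roughly 1/p, and each index is hit with
# probability about p. A NumPy stand-in for the LFSR88 helpers (the function
# name `sampled_indices` is assumed, not part of this patch):
#
#     import numpy as np
#
#     def sampled_indices(n, conn_prob, rng):
#         clen = int(np.ceil(1.0 / conn_prob)) * 2 - 1
#         hits = []
#         i = int(rng.integers(0, clen))           # like lfsr88_random_integers(key, 0, clen - 1)
#         while i < n:
#             hits.append(i)                       # treat this index as a connection
#             i += int(rng.integers(1, clen + 1))  # like lfsr88_random_integers(key, 1, clen)
#         return hits
#
#     # e.g. len(sampled_indices(10000, 0.1, np.random.default_rng(0))) is ~1000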
-
-
-@ti.kernel
-def _event_mv_prob_homo_cpu(
-    events: ti.types.ndarray(ndim=1),
-    weight: ti.types.ndarray(ndim=1),
-    clen: ti.types.ndarray(ndim=1),
-    seed: ti.types.ndarray(ndim=1),
-    out: ti.types.ndarray(ndim=1)
-):
-  num_row = out.shape[0]
-  num_col = events.shape[0]
-  weight0 = weight[0]
-  clen0 = clen[0]
-  seed0 = seed[0]
-
-  for i_col in range(num_col):
-    if events[i_col] != 0.:
-      key = lfsr88_key(seed0 + i_col)
-      key, i_row = lfsr88_random_integers(key, 0, clen0 - 1)
-      while i_row < num_row:
-        out[i_row] += weight0
+      while i_col < end_col:
+        r += weight0 * events[i_col]  # TODO: speed comparison without if else
        key, inc = lfsr88_random_integers(key, 1, clen0)
-        i_row += inc
-
-
-@ti.kernel
-def _event_mv_prob_homo_outdim_parallel_cpu(
-    events: ti.types.ndarray(ndim=1),
-    weight: ti.types.ndarray(ndim=1),
-    clen: ti.types.ndarray(ndim=1),
-    seed: ti.types.ndarray(ndim=1),
-    out: ti.types.ndarray(ndim=1)
-):
-  num_row = out.shape[0]
-  num_col = events.shape[0]
-  weight0 = weight[0]
-  clen0 = clen[0]
-  seed0 = seed[0]
-
-  for i_row in range(num_row):
-    r = 0.
-    key = lfsr88_key(seed0 + i_row)
-    key, i_col = lfsr88_random_integers(key, 0, clen0 - 1)
-    while i_col < num_col:
+        i_col += inc
+      out[i_row] += r  # TODO: warp-level reduction
+
+
+  def _reverse(shape):
+    return shape[::-1]
+
+
+  # -------------
+  # CPU function
+  # -------------
+  # For each non-zero event, the kernel generates a random key with
+  # lfsr88_key, then uses that key to draw a sequence of random row
+  # indices and adds the weight to the out array at those indices.
+  #
+  # Taichi auto-parallelizes the outermost loop of a kernel, so the
+  # per-column work (or per-row work, in the outdim_parallel variant)
+  # runs in parallel.
+
+  @ti.kernel
+  def _event_mv_prob_homo_cpu(
+      events: ti.types.ndarray(ndim=1),
+      weight: ti.types.ndarray(ndim=1),
+      clen: ti.types.ndarray(ndim=1),
+      seed: ti.types.ndarray(ndim=1),
+      out: ti.types.ndarray(ndim=1)
+  ):
+    num_row = out.shape[0]
+    num_col = events.shape[0]
+    weight0 = weight[0]
+    clen0 = clen[0]
+    seed0 = seed[0]
+
+    for i_col in range(num_col):
      if events[i_col] != 0.:
-        r += weight0
-      key, inc = lfsr88_random_integers(key, 1, clen0)
-      i_col += inc
-    out[i_row] = r  # TODO: warp-level reduction
-
-
-# -------------
-# GPU function
-# -------------
-# Contrary to the CPU functions, for each column,
-# this function will 32 threads (one warp) to make
-# the just-in-time random generation parallelized.
-
-
-@ti.kernel
-def _event_mv_prob_homo_gpu(
-    events: ti.types.ndarray(ndim=1),
-    weight: ti.types.ndarray(ndim=1),
-    clen: ti.types.ndarray(ndim=1),
-    seed: ti.types.ndarray(ndim=1),
-    out: ti.types.ndarray(ndim=1)
-):
-  num_row = out.shape[0]
-  num_col = events.shape[0]
-  weight0 = weight[0]
-  clen0 = clen[0]
-  seed0 = seed[0]
-  step = ti.uint32(ti.max((num_row + 1) >> 5, 1))
-
-  for i in range(num_col * 32):
-    i_col = i >> 5
-    if events[i_col] != 0.:
+        key = lfsr88_key(seed0 + i_col)
+        key, i_row = lfsr88_random_integers(key, 0, clen0 - 1)
+        while i_row < num_row:
+          out[i_row] += weight0
+          key, inc = lfsr88_random_integers(key, 1, clen0)
+          i_row += inc
+
+
+  @ti.kernel
+  def _event_mv_prob_homo_outdim_parallel_cpu(
+      events: ti.types.ndarray(ndim=1),
+      weight: ti.types.ndarray(ndim=1),
+      clen: ti.types.ndarray(ndim=1),
+      seed: ti.types.ndarray(ndim=1),
+      out: ti.types.ndarray(ndim=1)
+  ):
+    num_row = out.shape[0]
+    num_col = events.shape[0]
+    weight0 = weight[0]
+    clen0 = clen[0]
+    seed0 = seed[0]
+
+    for i_row in range(num_row):
+      r = 0.
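+      # Each output row derives its own LFSR88 stream from (seed0 + i_row),
+      # so rows are mutually independent and the sampled connectivity is
+      # reproducible for a fixed seed.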
+      key = lfsr88_key(seed0 + i_row)
+      key, i_col = lfsr88_random_integers(key, 0, clen0 - 1)
+      while i_col < num_col:
+        if events[i_col] != 0.:
+          r += weight0
+        key, inc = lfsr88_random_integers(key, 1, clen0)
+        i_col += inc
+      out[i_row] = r  # TODO: warp-level reduction
+
+
+  # -------------
+  # GPU function
+  # -------------
+  # Contrary to the CPU functions, this kernel uses 32 threads (one
+  # warp) per column, so that the just-in-time random number generation
+  # itself runs in parallel.
+
+  @ti.kernel
+  def _event_mv_prob_homo_gpu(
+      events: ti.types.ndarray(ndim=1),
+      weight: ti.types.ndarray(ndim=1),
+      clen: ti.types.ndarray(ndim=1),
+      seed: ti.types.ndarray(ndim=1),
+      out: ti.types.ndarray(ndim=1)
+  ):
+    num_row = out.shape[0]
+    num_col = events.shape[0]
+    weight0 = weight[0]
+    clen0 = clen[0]
+    seed0 = seed[0]
+    step = ti.uint32(ti.max((num_row + 1) >> 5, 1))
+
+    for i in range(num_col * 32):
+      i_col = i >> 5
+      if events[i_col] != 0.:
+        index = i & 31
+        i_row = step * index - 1
+        end = ti.min(i_row + step, num_row)
+        key = lfsr88_key(seed0 + i)
+        key, inc = lfsr88_random_integers(key, 1, clen0)
+        i_row += inc
+        while i_row < end:
+          out[i_row] += weight0
+          key, inc = lfsr88_random_integers(key, 1, clen0)
+          i_row += inc
+
+
+  @ti.kernel
+  def _event_mv_prob_homo_outdim_parallel_gpu(
+      events: ti.types.ndarray(ndim=1),
+      weight: ti.types.ndarray(ndim=1),
+      clen: ti.types.ndarray(ndim=1),
+      seed: ti.types.ndarray(ndim=1),
+      out: ti.types.ndarray(ndim=1)
+  ):
+    num_row = out.shape[0]
+    num_col = events.shape[0]
+    weight0 = weight[0]
+    clen0 = clen[0]
+    seed0 = seed[0]
+    step = ti.uint32(ti.max((num_row + 1) >> 5, 1))
+
+    for i in range(num_row * 32):
+      i_row = i >> 5
       index = i & 31
-      i_row = step * index - 1
-      end = ti.min(i_row + step, num_row)
+      i_col = step * index - 1
+      end_col = ti.min(i_col + step, num_col)
+      r = 0.
      key = lfsr88_key(seed0 + i)
      key, inc = lfsr88_random_integers(key, 1, clen0)
-      i_row += inc
-      while i_row < end:
+      i_col += inc
+      while i_col < end_col:
+        r += weight0 * events[i_col]  # TODO: speed comparison with if else
        key, inc = lfsr88_random_integers(key, 1, clen0)
-        i_row += inc
+        i_col += inc
+      out[i_row] += r  # TODO: warp-level reduction

-@ti.kernel
-def _event_mv_prob_homo_outdim_parallel_gpu(
-    events: ti.types.ndarray(ndim=1),
-    weight: ti.types.ndarray(ndim=1),
-    clen: ti.types.ndarray(ndim=1),
-    seed: ti.types.ndarray(ndim=1),
-    out: ti.types.ndarray(ndim=1)
-):
-  num_row = out.shape[0]
-  num_col = events.shape[0]
-  weight0 = weight[0]
-  clen0 = clen[0]
-  seed0 = seed[0]
-  step = ti.uint32(ti.max((num_row + 1) >> 5, 1))
-
-  for i in range(num_row * 32):
-    i_row = i >> 5
-    index = i & 31
-    i_col = step * index - 1
-    end_col = ti.min(i_col + step, num_col)
-    r = 0.
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - r += weight0 * events[i_col] # TODO: speed comparison with if else - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - + def _event_mv_prob_homo_jvp_events( + evt_dot, events, weight, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_homo(evt_dot, weight, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) -def _event_mv_prob_homo_jvp_events( - evt_dot, events, weight, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_homo(evt_dot, weight, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + def _event_mv_prob_homo_jvp_weight( + w_dot, events, weight, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_homo(events, w_dot, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) -def _event_mv_prob_homo_jvp_weight( - w_dot, events, weight, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_homo(events, w_dot, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + def _event_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights): + assert _get_dtype(vector) in [jnp.bool_, jnp.float16, jnp.float32, jnp.float64] + return _general_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights) + + + def raw_event_mv_prob_homo( + events: jax.Array, + weight: jax.Array, # vector with size 1 + conn_len: jax.Array, # vector with size 1 + seed: jax.Array, # vector with size 1 + *, + shape: Tuple[int, int], + transpose: bool = False, + outdim_parallel: bool = True, + ) -> jax.Array: + mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, weight) + + if outdim_parallel: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_homo_outdim_parallel_bool_p + else: + prim = _event_mv_prob_homo_outdim_parallel_p + else: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_homo_bool_p + else: + prim = _event_mv_prob_homo_p + + return prim(events, + weight, + conn_len, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=weight.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) + + + def _define_event_mv_prob_homo_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_event_mv_prob_homo_jvp_events, + _event_mv_prob_homo_jvp_weight, + None, + None) + prim.def_transpose_rule(_mv_prob_homo_transpose) + return prim + + + # outdim_parallel = True, events.dtype = jnp.bool_ + _event_mv_prob_homo_outdim_parallel_bool_p = _define_event_mv_prob_homo_prim( + cpu_kernel=_event_mv_prob_homo_outdim_parallel_bool_cpu, + gpu_kernel=_event_mv_prob_homo_outdim_parallel_bool_gpu + ) -def _event_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights): - assert _get_dtype(vector) in [jnp.bool_, jnp.float16, jnp.float32, jnp.float64] - return _general_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights) + # outdim_parallel = False, events.dtype = jnp.bool_ + _event_mv_prob_homo_bool_p = 
_define_event_mv_prob_homo_prim( + cpu_kernel=_event_mv_prob_homo_bool_cpu, + gpu_kernel=_event_mv_prob_homo_bool_gpu + ) + # outdim_parallel = True, events.dtype != jnp.bool_ + _event_mv_prob_homo_outdim_parallel_p = _define_event_mv_prob_homo_prim( + cpu_kernel=_event_mv_prob_homo_outdim_parallel_cpu, + gpu_kernel=_event_mv_prob_homo_outdim_parallel_gpu + ) -def raw_event_mv_prob_homo( - events: jax.Array, - weight: jax.Array, # vector with size 1 - conn_len: jax.Array, # vector with size 1 - seed: jax.Array, # vector with size 1 - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, weight) + # outdim_parallel = False, events.dtype != jnp.bool_ + _event_mv_prob_homo_p = _define_event_mv_prob_homo_prim( + cpu_kernel=_event_mv_prob_homo_cpu, + gpu_kernel=_event_mv_prob_homo_gpu + ) - if outdim_parallel: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_homo_outdim_parallel_bool_p - else: - prim = _event_mv_prob_homo_outdim_parallel_p - else: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_homo_bool_p - else: - prim = _event_mv_prob_homo_p - return prim(events, - weight, - conn_len, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=weight.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - - -def _define_event_mv_prob_homo_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_event_mv_prob_homo_jvp_events, - _event_mv_prob_homo_jvp_weight, - None, - None) - prim.def_transpose_rule(_mv_prob_homo_transpose) - return prim - - -# outdim_parallel = True, events.dtype = jnp.bool_ -_event_mv_prob_homo_outdim_parallel_bool_p = _define_event_mv_prob_homo_prim( - cpu_kernel=_event_mv_prob_homo_outdim_parallel_bool_cpu, - gpu_kernel=_event_mv_prob_homo_outdim_parallel_bool_gpu -) - -# outdim_parallel = False, events.dtype = jnp.bool_ -_event_mv_prob_homo_bool_p = _define_event_mv_prob_homo_prim( - cpu_kernel=_event_mv_prob_homo_bool_cpu, - gpu_kernel=_event_mv_prob_homo_bool_gpu -) - -# outdim_parallel = True, events.dtype != jnp.bool_ -_event_mv_prob_homo_outdim_parallel_p = _define_event_mv_prob_homo_prim( - cpu_kernel=_event_mv_prob_homo_outdim_parallel_cpu, - gpu_kernel=_event_mv_prob_homo_outdim_parallel_gpu -) - -# outdim_parallel = False, events.dtype != jnp.bool_ -_event_mv_prob_homo_p = _define_event_mv_prob_homo_prim( - cpu_kernel=_event_mv_prob_homo_cpu, - gpu_kernel=_event_mv_prob_homo_gpu -) - - -@ti.kernel -def _event_mv_prob_uniform_bool_cpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - if events[i_col]: - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: + @ti.kernel + def _event_mv_prob_uniform_bool_cpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = 
seed[0] + + for i_col in range(num_col): + if events[i_col]: + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_uniform_outdim_parallel_bool_cpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. + key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: key, row_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += row_v + if events[i_col]: + r += row_v key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_uniform_outdim_parallel_bool_cpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. - key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) + i_col += inc + out[i_row] = r + + + @ti.kernel + def _event_mv_prob_uniform_bool_gpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 if events[i_col]: - r += row_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r - - -@ti.kernel -def _event_mv_prob_uniform_bool_gpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - if events[i_col]: + index = i & 31 + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_uniform_outdim_parallel_bool_gpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = 
clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 index = i & 31 - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) + i_col = step * index - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: + i_col += inc + while i_col < end_col: key, row_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += row_v + r += row_v * events[i_col] # TODO: speed comparison without if else key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_uniform_outdim_parallel_bool_gpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - index = i & 31 - i_col = step * index - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) - r += row_v * events[i_col] # TODO: speed comparison without if else - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -@ti.kernel -def _event_mv_prob_uniform_cpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - if events[i_col] != 0.: - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: + i_col += inc + out[i_row] += r # TODO: warp-level reduction + + + @ti.kernel + def _event_mv_prob_uniform_cpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + if events[i_col] != 0.: + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_uniform_outdim_parallel_cpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. 
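+      # As in the homogeneous kernels, row i_row owns the RNG stream seeded
+      # with (seed0 + i_row). A uniform weight is drawn for every visited
+      # position and only added to r when the event is non-zero, which keeps
+      # the sampled matrix independent of the event pattern.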
+ key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: key, row_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += row_v + if events[i_col] != 0.: + r += row_v key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_uniform_outdim_parallel_cpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. - key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) + i_col += inc + out[i_row] = r # TODO: warp-level reduction + + + @ti.kernel + def _event_mv_prob_uniform_gpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 if events[i_col] != 0.: - r += row_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r # TODO: warp-level reduction - - -@ti.kernel -def _event_mv_prob_uniform_gpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - if events[i_col] != 0.: + index = i & 31 + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_uniform_outdim_parallel_gpu( + events: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 index = i & 31 - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) + i_col = step * index - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. 
key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: + i_col += inc + while i_col < end_col: key, row_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += row_v + r += row_v * events[i_col] # TODO: speed comparison with if else key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_uniform_outdim_parallel_gpu( - events: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - index = i & 31 - i_col = step * index - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) - r += row_v * events[i_col] # TODO: speed comparison with if else - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -def _event_mv_prob_uniform_jvp_events( - evt_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(evt_dot, w_low, w_high, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - - -def _event_mv_prob_uniform_jvp_w_low( - w_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(events, w_dot, w_high, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - + i_col += inc + out[i_row] += r # TODO: warp-level reduction + + + def _event_mv_prob_uniform_jvp_events( + evt_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(evt_dot, w_low, w_high, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def _event_mv_prob_uniform_jvp_w_low( + w_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(events, w_dot, w_high, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def _event_mv_prob_uniform_jvp_w_high( + w_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(events, w_low, w_dot, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def raw_event_mv_prob_uniform( + events: jax.Array, + w_low: jax.Array, # vector with size 1 + w_high: jax.Array, # vector with size 1 + conn_len: jax.Array, # vector with size 1 + seed: jax.Array, # vector with size 1 + *, + shape: Tuple[int, int], + transpose: bool = False, + outdim_parallel: bool = True, + ) -> jax.Array: + mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, w_low, w_high) + + if outdim_parallel: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_uniform_outdim_parallel_bool_p + else: + prim = 
_event_mv_prob_uniform_outdim_parallel_p + else: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_uniform_bool_p + else: + prim = _event_mv_prob_uniform_p + + return prim(events, + w_low, + w_high, + conn_len, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=w_low.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) + + + def _define_event_mv_prob_uniform_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_event_mv_prob_uniform_jvp_events, + _event_mv_prob_uniform_jvp_w_low, + _event_mv_prob_uniform_jvp_w_high, + None, + None) + prim.def_transpose_rule(_mv_prob_uniform_transpose) + return prim + + + # outdim_parallel = True, events.dtype = jnp.bool_ + _event_mv_prob_uniform_outdim_parallel_bool_p = _define_event_mv_prob_uniform_prim( + cpu_kernel=_event_mv_prob_uniform_outdim_parallel_bool_cpu, + gpu_kernel=_event_mv_prob_uniform_outdim_parallel_bool_gpu + ) -def _event_mv_prob_uniform_jvp_w_high( - w_dot, events, w_low, w_high, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(events, w_low, w_dot, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + # outdim_parallel = False, events.dtype = jnp.bool_ + _event_mv_prob_uniform_bool_p = _define_event_mv_prob_uniform_prim( + cpu_kernel=_event_mv_prob_uniform_bool_cpu, + gpu_kernel=_event_mv_prob_uniform_bool_gpu + ) + # outdim_parallel = True, events.dtype != jnp.bool_ + _event_mv_prob_uniform_outdim_parallel_p = _define_event_mv_prob_uniform_prim( + cpu_kernel=_event_mv_prob_uniform_outdim_parallel_cpu, + gpu_kernel=_event_mv_prob_uniform_outdim_parallel_gpu + ) -def raw_event_mv_prob_uniform( - events: jax.Array, - w_low: jax.Array, # vector with size 1 - w_high: jax.Array, # vector with size 1 - conn_len: jax.Array, # vector with size 1 - seed: jax.Array, # vector with size 1 - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, w_low, w_high) + # outdim_parallel = False, events.dtype != jnp.bool_ + _event_mv_prob_uniform_p = _define_event_mv_prob_uniform_prim( + cpu_kernel=_event_mv_prob_uniform_cpu, + gpu_kernel=_event_mv_prob_uniform_gpu + ) - if outdim_parallel: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_uniform_outdim_parallel_bool_p - else: - prim = _event_mv_prob_uniform_outdim_parallel_p - else: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_uniform_bool_p - else: - prim = _event_mv_prob_uniform_p - return prim(events, - w_low, - w_high, - conn_len, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=w_low.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - - -def _define_event_mv_prob_uniform_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_event_mv_prob_uniform_jvp_events, - _event_mv_prob_uniform_jvp_w_low, - _event_mv_prob_uniform_jvp_w_high, - None, - None) - prim.def_transpose_rule(_mv_prob_uniform_transpose) - return prim - - -# outdim_parallel = True, events.dtype = jnp.bool_ -_event_mv_prob_uniform_outdim_parallel_bool_p = _define_event_mv_prob_uniform_prim( - cpu_kernel=_event_mv_prob_uniform_outdim_parallel_bool_cpu, - gpu_kernel=_event_mv_prob_uniform_outdim_parallel_bool_gpu -) - -# outdim_parallel = False, 
events.dtype = jnp.bool_ -_event_mv_prob_uniform_bool_p = _define_event_mv_prob_uniform_prim( - cpu_kernel=_event_mv_prob_uniform_bool_cpu, - gpu_kernel=_event_mv_prob_uniform_bool_gpu -) - -# outdim_parallel = True, events.dtype != jnp.bool_ -_event_mv_prob_uniform_outdim_parallel_p = _define_event_mv_prob_uniform_prim( - cpu_kernel=_event_mv_prob_uniform_outdim_parallel_cpu, - gpu_kernel=_event_mv_prob_uniform_outdim_parallel_gpu -) - -# outdim_parallel = False, events.dtype != jnp.bool_ -_event_mv_prob_uniform_p = _define_event_mv_prob_uniform_prim( - cpu_kernel=_event_mv_prob_uniform_cpu, - gpu_kernel=_event_mv_prob_uniform_gpu -) - - -@ti.kernel -def _event_mv_prob_normal_bool_cpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - if events[i_col]: - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: + @ti.kernel + def _event_mv_prob_normal_bool_cpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + if events[i_col]: + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_normal_outdim_parallel_bool_cpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. + key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += row_v + if events[i_col]: + r += row_v key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_normal_outdim_parallel_bool_cpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. 
- key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + i_col += inc + out[i_row] = r + + + @ti.kernel + def _event_mv_prob_normal_bool_gpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 if events[i_col]: - r += row_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r - - -@ti.kernel -def _event_mv_prob_normal_bool_gpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - if events[i_col]: + index = i & 31 + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_normal_outdim_parallel_bool_gpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 index = i & 31 - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) + i_col = step * index - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: + i_col += inc + while i_col < end_col: key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += row_v + r += row_v * events[i_col] # TODO: speed comparison without if else key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_normal_outdim_parallel_bool_gpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - index = i & 31 - i_col = step * index - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. 
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - r += row_v * events[i_col] # TODO: speed comparison without if else - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -@ti.kernel -def _event_mv_prob_normal_cpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - if events[i_col] != 0.: - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: + i_col += inc + out[i_row] += r # TODO: warp-level reduction + + + @ti.kernel + def _event_mv_prob_normal_cpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + if events[i_col] != 0.: + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_normal_outdim_parallel_cpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. + key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += row_v + if events[i_col] != 0.: + r += row_v key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_normal_outdim_parallel_cpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. 
- key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + i_col += inc + out[i_row] = r + + + @ti.kernel + def _event_mv_prob_normal_gpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 if events[i_col] != 0.: - r += row_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r - - -@ti.kernel -def _event_mv_prob_normal_gpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - if events[i_col] != 0.: + index = i & 31 + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _event_mv_prob_normal_outdim_parallel_gpu( + events: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = events.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 index = i & 31 - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) + i_col = step * index - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: + i_col += inc + while i_col < end_col: key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += row_v + r += row_v * events[i_col] # TODO: speed comparison with if else key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _event_mv_prob_normal_outdim_parallel_gpu( - events: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = events.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - index = i & 31 - i_col = step * index - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. 
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - r += row_v * events[i_col] # TODO: speed comparison with if else - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -def _event_mv_prob_normal_jvp_events( - evt_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(evt_dot, w_mu, w_sigma, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - - -def _event_mv_prob_normal_jvp_w_mu( - w_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(events, w_dot, w_sigma, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - - -def _event_mv_prob_normal_jvp_w_sigma( - w_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(events, w_mu, w_dot, clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - + i_col += inc + out[i_row] += r # TODO: warp-level reduction + + + def _event_mv_prob_normal_jvp_events( + evt_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(evt_dot, w_mu, w_sigma, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def _event_mv_prob_normal_jvp_w_mu( + w_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(events, w_dot, w_sigma, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def _event_mv_prob_normal_jvp_w_sigma( + w_dot, events, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel + ): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(events, w_mu, w_dot, clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) + + + def raw_event_mv_prob_normal( + events: jax.Array, + w_mu: jax.Array, # vector with size 1 + w_sigma: jax.Array, # vector with size 1 + conn_len: jax.Array, # vector with size 1 + seed: jax.Array, # vector with size 1 + *, + shape: Tuple[int, int], + transpose: bool = False, + outdim_parallel: bool = True, + ) -> jax.Array: + mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, w_mu, w_sigma) + + if outdim_parallel: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_normal_outdim_parallel_bool_p + else: + prim = _event_mv_prob_normal_outdim_parallel_p + else: + if events.dtype == jnp.bool_: + prim = _event_mv_prob_normal_bool_p + else: + prim = _event_mv_prob_normal_p + + return prim(events, + w_mu, + w_sigma, + conn_len, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=w_mu.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) + + + def _define_event_mv_prob_normal_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_event_mv_prob_normal_jvp_events, + _event_mv_prob_normal_jvp_w_mu, + _event_mv_prob_normal_jvp_w_sigma, + None, + None) + 
prim.def_transpose_rule(_mv_prob_normal_transpose) + return prim + + + # outdim_parallel = True, events.dtype = jnp.bool_ + _event_mv_prob_normal_outdim_parallel_bool_p = _define_event_mv_prob_normal_prim( + cpu_kernel=_event_mv_prob_normal_outdim_parallel_bool_cpu, + gpu_kernel=_event_mv_prob_normal_outdim_parallel_bool_gpu + ) -def raw_event_mv_prob_normal( - events: jax.Array, - w_mu: jax.Array, # vector with size 1 - w_sigma: jax.Array, # vector with size 1 - conn_len: jax.Array, # vector with size 1 - seed: jax.Array, # vector with size 1 - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _event_checking(events, conn_len, seed, shape, outdim_parallel, transpose, w_mu, w_sigma) + # outdim_parallel = False, events.dtype = jnp.bool_ + _event_mv_prob_normal_bool_p = _define_event_mv_prob_normal_prim( + cpu_kernel=_event_mv_prob_normal_bool_cpu, + gpu_kernel=_event_mv_prob_normal_bool_gpu + ) - if outdim_parallel: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_normal_outdim_parallel_bool_p - else: - prim = _event_mv_prob_normal_outdim_parallel_p - else: - if events.dtype == jnp.bool_: - prim = _event_mv_prob_normal_bool_p - else: - prim = _event_mv_prob_normal_p + # outdim_parallel = True, events.dtype != jnp.bool_ + _event_mv_prob_normal_outdim_parallel_p = _define_event_mv_prob_normal_prim( + cpu_kernel=_event_mv_prob_normal_outdim_parallel_cpu, + gpu_kernel=_event_mv_prob_normal_outdim_parallel_gpu + ) - return prim(events, - w_mu, - w_sigma, - conn_len, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=w_mu.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - - -def _define_event_mv_prob_normal_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_event_mv_prob_normal_jvp_events, - _event_mv_prob_normal_jvp_w_mu, - _event_mv_prob_normal_jvp_w_sigma, - None, - None) - prim.def_transpose_rule(_mv_prob_normal_transpose) - return prim - - -# outdim_parallel = True, events.dtype = jnp.bool_ -_event_mv_prob_normal_outdim_parallel_bool_p = _define_event_mv_prob_normal_prim( - cpu_kernel=_event_mv_prob_normal_outdim_parallel_bool_cpu, - gpu_kernel=_event_mv_prob_normal_outdim_parallel_bool_gpu -) - -# outdim_parallel = False, events.dtype = jnp.bool_ -_event_mv_prob_normal_bool_p = _define_event_mv_prob_normal_prim( - cpu_kernel=_event_mv_prob_normal_bool_cpu, - gpu_kernel=_event_mv_prob_normal_bool_gpu -) - -# outdim_parallel = True, events.dtype != jnp.bool_ -_event_mv_prob_normal_outdim_parallel_p = _define_event_mv_prob_normal_prim( - cpu_kernel=_event_mv_prob_normal_outdim_parallel_cpu, - gpu_kernel=_event_mv_prob_normal_outdim_parallel_gpu -) - -# outdim_parallel = False, events.dtype != jnp.bool_ -_event_mv_prob_normal_p = _define_event_mv_prob_normal_prim( - cpu_kernel=_event_mv_prob_normal_cpu, - gpu_kernel=_event_mv_prob_normal_gpu -) + # outdim_parallel = False, events.dtype != jnp.bool_ + _event_mv_prob_normal_p = _define_event_mv_prob_normal_prim( + cpu_kernel=_event_mv_prob_normal_cpu, + gpu_kernel=_event_mv_prob_normal_gpu + ) diff --git a/brainpy/_src/math/jitconn/_matvec.py b/brainpy/_src/math/jitconn/_matvec.py index 0caa9c996..00e5778f9 100644 --- a/brainpy/_src/math/jitconn/_matvec.py +++ b/brainpy/_src/math/jitconn/_matvec.py @@ -1,24 +1,20 @@ # -*- coding: utf-8 -*- -from functools import partial from typing import Tuple, Optional, Union import jax import numpy as np -from jax 
import numpy as jnp, dtypes -from jax.core import ShapedArray, Primitive -from jax.interpreters import xla, ad -from jax.lib import xla_client +from jax import numpy as jnp +from jax.interpreters import ad -from brainpy._src.dependency_check import import_brainpylib_gpu_ops, import_brainpylib_cpu_ops, import_taichi +from brainpy._src.dependency_check import import_taichi from brainpy._src.math.interoperability import as_jax from brainpy._src.math.ndarray import Array, _get_dtype -from brainpy._src.math.op_register import register_general_batching, XLACustomOp -from brainpy._src.math.tifunc import (lfsr88_key, lfsr88_random_integers, lfsr88_uniform, lfsr88_normal) -from brainpy.errors import GPUOperatorNotFound +from brainpy._src.math.op_register import XLACustomOp +from brainpy.errors import PackageMissingError -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ 'mv_prob_homo', @@ -85,8 +81,22 @@ def mv_prob_homo( out: Array, ndarray The output of :math:`y = M @ v`. """ - return mv_prob_homo_taichi(vector, weight, conn_prob, seed, shape=shape, transpose=transpose, - outdim_parallel=outdim_parallel) + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') + + vector = as_jax(vector) + if isinstance(weight, float): + weight = as_jax(weight, dtype=vector.dtype) + weight = jnp.atleast_1d(as_jax(weight)) + conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 + clen = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) + if seed is None: + with jax.ensure_compile_time_eval(): + seed = np.random.randint(0, int(1e8), 1) + seed = jnp.asarray(seed, dtype=jnp.uint32) + seed = jnp.atleast_1d(seed) + return raw_mv_prob_homo(vector, weight, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel)[0] def mv_prob_uniform( @@ -150,8 +160,22 @@ def mv_prob_uniform( out: Array, ndarray The output of :math:`y = M @ v`. """ - return mv_prob_uniform_taichi(vector, w_low, w_high, conn_prob, seed, shape=shape, transpose=transpose, - outdim_parallel=outdim_parallel) + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') + + vector = as_jax(vector) + if isinstance(w_low, float): w_low = as_jax(w_low, dtype=vector.dtype) + if isinstance(w_high, float): w_high = as_jax(w_high, dtype=vector.dtype) + w_low = jnp.atleast_1d(as_jax(w_low)) + w_high = jnp.atleast_1d(as_jax(w_high)) + conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 + conn_len = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) + if seed is None: + with jax.ensure_compile_time_eval(): + seed = np.random.randint(0, int(1e8), 1) + seed = jnp.atleast_1d(jnp.asarray(seed, dtype=jnp.uint32)) + return raw_mv_prob_uniform(vector, w_low, w_high, conn_len, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel)[0] def mv_prob_normal( @@ -215,1188 +239,110 @@ def mv_prob_normal( out: Array, ndarray The output of :math:`y = M @ v`. 
""" - return mv_prob_uniform_taichi(vector, w_mu, w_sigma, conn_prob, seed, shape=shape, transpose=transpose, - outdim_parallel=outdim_parallel) + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') + vector = as_jax(vector) + if isinstance(w_mu, float): w_mu = as_jax(w_mu, dtype=vector.dtype) + if isinstance(w_sigma, float): w_sigma = as_jax(w_sigma, dtype=vector.dtype) + w_mu = jnp.atleast_1d(as_jax(w_mu)) + w_sigma = jnp.atleast_1d(as_jax(w_sigma)) + conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 + conn_len = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) + if seed is None: + with jax.ensure_compile_time_eval(): + seed = np.random.randint(0, int(1e8), 1) + seed = jnp.atleast_1d(jnp.asarray(seed, dtype=jnp.uint32)) + return raw_mv_prob_normal(vector, w_mu, w_sigma, conn_len, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel)[0] -### BRAINYPLIB ### -def mv_prob_homo_brainpylib( - vector: Union[Array, jax.Array], - weight: float, - conn_prob: float, - seed: Optional[int] = None, +def raw_mv_prob_homo( + vector: jax.Array, + weight: jax.Array, # vector with size 1 + clen: jax.Array, # vector with size 1 + seed: jax.Array, # vector with size 1 *, shape: Tuple[int, int], transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a scalar `weight` at each position. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: + mat_shape, out_shape = _non_event_checking(vector, clen, seed, shape, outdim_parallel, transpose, weight) - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). + if outdim_parallel: + prim = _mv_prob_homo_outdim_parallel_p + else: + prim = _mv_prob_homo_p - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. + return prim(vector, + weight, + clen, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) - Parameters - ---------- - vector: Array, ndarray - The vector. - weight: float - The value of the random matrix. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
- """ - vector = as_jax(vector) - weight = jnp.atleast_1d(as_jax(weight)) - conn_prob = jnp.atleast_1d(as_jax(conn_prob)) - clen = jnp.asarray(jnp.ceil(1 / conn_prob) * 2 - 1, dtype=jnp.int32) - with jax.ensure_compile_time_eval(): - if seed is None: - seed = int(np.random.randint(0, int(1e8))) - seed = jnp.atleast_1d(as_jax(seed, dtype=jnp.int32)) - return mv_prob_homo_p.bind(vector, - weight, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel, - )[0] - - -def mv_prob_uniform_brainpylib( +def raw_mv_prob_uniform( vector: jax.Array, - w_low: float, - w_high: float, - conn_prob: float, - seed: Optional[int] = None, + w_low: jax.Array, + w_high: jax.Array, + conn_len: jax.Array, + seed: jax.Array, *, shape: Tuple[int, int], transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a uniform distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: + mat_shape, out_shape = _non_event_checking(vector, conn_len, seed, shape, outdim_parallel, transpose, w_low, w_high) - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). + if outdim_parallel: + prim = _mv_prob_uniform_outdim_parallel_p + else: + prim = _mv_prob_uniform_p - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. + return prim(vector, + w_low, + w_high, + conn_len, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) - Parameters - ---------- - vector: Array, ndarray - The vector. - w_low: float - Lower boundary of the output interval. - w_high: float - Upper boundary of the output interval. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
- """ - vector = as_jax(vector) - w_low = jnp.atleast_1d(as_jax(w_low)) - w_high = jnp.atleast_1d(as_jax(w_high)) - conn_prob = jnp.atleast_1d(as_jax(conn_prob)) - clen = jnp.asarray(jnp.ceil(1 / conn_prob) * 2 - 1, dtype=jnp.int32) - with jax.ensure_compile_time_eval(): - if seed is None: - seed = int(np.random.randint(0, int(1e8))) - seed = jnp.atleast_1d(as_jax(seed, dtype=jnp.int32)) - return mv_prob_uniform_p.bind(vector, - w_low, - w_high, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel)[0] - - -def mv_prob_normal_brainpylib( +def raw_mv_prob_normal( vector: jax.Array, - w_mu: float, - w_sigma: float, - conn_prob: float, - seed: Optional[int] = None, + w_mu: jax.Array, + w_sigma: jax.Array, + conn_len: jax.Array, + seed: jax.Array, *, shape: Tuple[int, int], transpose: bool = False, outdim_parallel: bool = True, ) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a normal distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). + mat_shape, out_shape = _non_event_checking(vector, conn_len, seed, shape, outdim_parallel, transpose, w_mu, w_sigma) - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. + if outdim_parallel: + prim = _mv_prob_normal_outdim_parallel_p + else: + prim = _mv_prob_normal_p - Parameters - ---------- - vector: Array, ndarray - The vector. - w_mu: float - Mean (centre) of the distribution. - w_sigma: float - Standard deviation (spread or “width”) of the distribution. Must be non-negative. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. + return prim(vector, + w_mu, + w_sigma, + conn_len, + seed, + outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], + shape=mat_shape, + transpose=transpose, + outdim_parallel=outdim_parallel) - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
- """ - vector = as_jax(vector) - w_mu = jnp.atleast_1d(as_jax(w_mu)) - w_sigma = jnp.atleast_1d(as_jax(w_sigma)) - conn_prob = jnp.atleast_1d(as_jax(conn_prob)) - clen = jnp.asarray(jnp.ceil(1 / conn_prob) * 2 - 1, dtype=jnp.int32) - with jax.ensure_compile_time_eval(): - if seed is None: - seed = int(np.random.randint(0, int(1e8))) - seed = jnp.atleast_1d(as_jax(seed, dtype=jnp.int32)) - return mv_prob_normal_p.bind(vector, - w_mu, - w_sigma, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel)[0] - - -def _matvec_prob_homo_abstract( - vector, weight, clen, seed, *, shape, transpose, outdim_parallel -): - assert _get_dtype(vector) in [jnp.float32, jnp.float64] - assert _get_dtype(weight) in [jnp.float32, jnp.float64], '"weight" must be float valued.' - assert _get_dtype(clen) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - assert _get_dtype(seed) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - if vector.ndim != 1: - raise ValueError('vector should be a 1D vector.') - if len(shape) != 2: - raise ValueError('shape should be a length-2 tuple.') - if seed.ndim != 1: - raise ValueError('seed must be a 1D scalar.') - if clen.ndim != 1: - raise ValueError('conn_prob must be a 1D scalar.') - if weight.ndim != 1: - raise ValueError('weight must be a 1D scalar.') - - if len(shape) != 2: - raise ValueError('shape should be a length-2 tuple.') - if not isinstance(outdim_parallel, bool): - raise ValueError('outdim_parallel must be boolean value.') - if not isinstance(transpose, bool): - raise ValueError('transpose must be boolean value.') - if transpose: - if vector.shape[0] != shape[0]: - raise ValueError(f'Shape mismatch, vec ({vector.shape[0]},) @ mat {shape}.') - else: - if vector.shape[0] != shape[1]: - raise ValueError(f'Shape mismatch, mat {shape} @ vec ({vector.shape[0]},).') - out = ShapedArray(dtype=dtypes.canonicalize_dtype(float), - shape=(shape[1] if transpose else shape[0],)) - return [out] - - -def _matvec_prob_homo_cpu_translation( - c, vector, weight, clen, seed, *, shape, transpose, outdim_parallel -): - import_brainpylib_cpu_ops() - n_row, n_col = (shape[1], shape[0]) if transpose else shape - - vec_shape = c.get_shape(vector) - out_dtype = vec_shape.element_type() - if out_dtype == jnp.float32: - out_type = b'_float' - elif out_dtype == jnp.float64: - out_type = b'_double' - else: - raise TypeError - - if outdim_parallel: - fn = b'cpu_matvec_prob_homo' + out_type - else: - fn = b'cpu_matvec_atomic_prob_homo' + out_type - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, - weight, - clen, - seed, - xla_client.ops.ConstantLiteral(c, n_row), - xla_client.ops.ConstantLiteral(c, n_col)), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(weight), - c.get_shape(clen), - c.get_shape(seed), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ()), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ())), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - ) - - -def _matvec_prob_homo_gpu_translation( - c, vector, weight, clen, seed, *, shape, transpose, outdim_parallel -): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(mv_prob_homo_p.name) - - vec_shape = c.get_shape(vector) - out_dtype = vec_shape.element_type() - if out_dtype == jnp.float32: - type_name = b'_float' - elif out_dtype == jnp.float64: - type_name = b'_double' - else: - raise TypeError - - 
opaque = gpu_ops.build_double_size_descriptor(shape[1] if transpose else shape[0], - shape[0] if transpose else shape[1]) - - if outdim_parallel: - fn = b'gpu_jit_csrmv_prob_homo_v2' + type_name - else: - fn = b'gpu_jit_csrmv_atomic_prob_homo_v2' + type_name - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, weight, clen, seed), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(weight), - c.get_shape(clen), - c.get_shape(seed)), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - opaque=opaque, - ) - - -def _matvec_prob_homo_jvp( - primals, tangents, *, shape, transpose, outdim_parallel -): - vector, weight, clen, seed = primals - vector_dot, weight_dot, clen_dot, seed_dot = tangents - r = mv_prob_homo_p.bind(vector, - weight, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - assert type(clen_dot) is ad.Zero - assert type(seed_dot) is ad.Zero - if type(weight_dot) is ad.Zero: - if type(vector_dot) is ad.Zero: - raise ValueError - r_dot = mv_prob_homo_p.bind(vector_dot, - weight, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - elif type(vector_dot) is ad.Zero: - r_dot = mv_prob_homo_p.bind(vector, - weight_dot, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - else: - r_dot = mv_prob_homo_p.bind(vector_dot, - weight_dot, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - - return r, r_dot - - -def _matvec_prob_homo_transpose( - ct, vector, weight, clen, seed, *, shape, transpose, outdim_parallel -): - assert type(weight) is not ad.UndefinedPrimal - assert type(clen) is not ad.UndefinedPrimal - assert type(seed) is not ad.UndefinedPrimal - assert type(vector) is ad.UndefinedPrimal - r = mv_prob_homo_p.bind(ct[0], - weight, - clen, - seed, - shape=shape, - transpose=not transpose, - outdim_parallel=not outdim_parallel)[0] - return r, weight, clen, seed - - -mv_prob_homo_p = Primitive('matvec_prob_homo') -mv_prob_homo_p.multiple_results = True -mv_prob_homo_p.def_abstract_eval(_matvec_prob_homo_abstract) -mv_prob_homo_p.def_impl(partial(xla.apply_primitive, mv_prob_homo_p)) -# xla.backend_specific_translations['cpu'][mv_prob_homo_p] = _matvec_prob_homo_cpu_translation -# xla.backend_specific_translations['gpu'][mv_prob_homo_p] = _matvec_prob_homo_gpu_translation -register_general_batching(mv_prob_homo_p) -ad.primitive_jvps[mv_prob_homo_p] = _matvec_prob_homo_jvp -ad.primitive_transposes[mv_prob_homo_p] = _matvec_prob_homo_transpose - - -def _matvec_prob_uniform_abstract( - vector, w_low, w_high, clen, seed, *, shape, transpose, outdim_parallel -): - assert _get_dtype(vector) in [jnp.float32, jnp.float64] - _w_low_dtype = _get_dtype(w_low) - _w_high_dtype = _get_dtype(w_low) - assert _w_low_dtype == _w_high_dtype, '"w_low" and "w_high" must be same typed.' - assert _w_low_dtype in [jnp.float32, jnp.float64], '"w_low" must be float valued.' - assert _w_high_dtype in [jnp.float32, jnp.float64], '"w_high" must be float valued.' 
- assert _get_dtype(clen) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - assert _get_dtype(seed) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - - if vector.ndim != 1: - raise ValueError('vector should be a 1D vector.') - if len(shape) != 2: - raise ValueError('shape should be a length-2 tuple.') - if w_low.ndim != 1: - raise ValueError('w_low must be a 1D scalar.') - if w_high.ndim != 1: - raise ValueError('w_high must be a 1D scalar.') - if clen.ndim != 1: - raise ValueError('clen must be a 1D scalar.') - if seed.ndim != 1: - raise ValueError('seed must be a 1D scalar.') - - if not isinstance(transpose, bool): - raise ValueError('transpose must be a boolean value.') - if not isinstance(outdim_parallel, bool): - raise ValueError('outdim_parallel must be a boolean value.') - assert w_low.dtype == w_high.dtype == vector.dtype - - out = ShapedArray(dtype=dtypes.canonicalize_dtype(float), - shape=(shape[1] if transpose else shape[0],)) - return [out] - - -def _matvec_prob_uniform_cpu_translation( - c, vector, w_low, w_high, clen, seed, *, shape, transpose, outdim_parallel -): - import_brainpylib_cpu_ops() - n_row, n_col = (shape[1], shape[0]) if transpose else shape - - vec_shape = c.get_shape(vector) - out_dtype = vec_shape.element_type() - - if out_dtype == jnp.float32: - type_name = b'_float' - elif out_dtype == jnp.float64: - type_name = b'_double' - else: - raise TypeError - - if outdim_parallel: - fn = b'cpu_matvec_prob_uniform' + type_name - else: - fn = b'cpu_matvec_atomic_prob_uniform' + type_name - - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, - w_low, - w_high, - clen, - seed, - xla_client.ops.ConstantLiteral(c, n_row), - xla_client.ops.ConstantLiteral(c, n_col)), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(w_low), - c.get_shape(w_high), - c.get_shape(clen), - c.get_shape(seed), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ()), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ())), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - ) - - -def _matvec_prob_uniform_gpu_translation( - c, vector, w_low, w_high, clen, seed, *, shape, transpose, outdim_parallel -): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(mv_prob_homo_p.name) - - vec_shape = c.get_shape(vector) - out_dtype = vec_shape.element_type() - if out_dtype == jnp.float32: - type_name = b'_float' - elif out_dtype == jnp.float64: - type_name = b'_double' - else: - raise TypeError(f'Only support float or double, while got {out_dtype}') - - opaque = gpu_ops.build_double_size_descriptor(shape[1] if transpose else shape[0], - shape[0] if transpose else shape[1]) - - if outdim_parallel: - fn = b'gpu_jit_csrmv_prob_uniform_v2' + type_name - else: - fn = b'gpu_jit_csrmv_atomic_prob_uniform_v2' + type_name - - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, w_low, w_high, clen, seed), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(w_low), - c.get_shape(w_high), - c.get_shape(clen), - c.get_shape(seed),), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - opaque=opaque, - ) - - -def _matvec_prob_uniform_jvp( - primals, tangents, *, shape, transpose, outdim_parallel -): - vector, w_low, w_high, clen, seed = primals - vector_dot, w_low_dot, w_high_dot, clen_dot, seed_dot 
= tangents - r = mv_prob_uniform_p.bind(vector, - w_low, - w_high, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - assert type(w_low_dot) is ad.Zero - assert type(w_high_dot) is ad.Zero - assert type(clen_dot) is ad.Zero - assert type(seed_dot) is ad.Zero - r_dot = mv_prob_uniform_p.bind(vector_dot, - w_low, - w_high, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - return r, r_dot - - -def _matvec_prob_uniform_transpose( - ct, vector, w_low, w_high, clen, seed, *, shape, transpose, outdim_parallel -): - assert type(vector) is ad.UndefinedPrimal - assert type(w_low) is not ad.UndefinedPrimal - assert type(w_high) is not ad.UndefinedPrimal - assert type(clen) is not ad.UndefinedPrimal - assert type(seed) is not ad.UndefinedPrimal - - r = mv_prob_uniform_p.bind(ct[0], - w_low, - w_high, - clen, - seed, - shape=shape, - transpose=not transpose, - outdim_parallel=not outdim_parallel)[0] - return r, w_low, w_high, clen, seed - - -mv_prob_uniform_p = Primitive('matvec_prob_uniform') -mv_prob_uniform_p.multiple_results = True -mv_prob_uniform_p.def_abstract_eval(_matvec_prob_uniform_abstract) -mv_prob_uniform_p.def_impl(partial(xla.apply_primitive, mv_prob_uniform_p)) -# xla.backend_specific_translations['cpu'][mv_prob_uniform_p] = _matvec_prob_uniform_cpu_translation -# xla.backend_specific_translations['gpu'][mv_prob_uniform_p] = _matvec_prob_uniform_gpu_translation -register_general_batching(mv_prob_uniform_p) -ad.primitive_jvps[mv_prob_uniform_p] = _matvec_prob_uniform_jvp -ad.primitive_transposes[mv_prob_uniform_p] = _matvec_prob_uniform_transpose - - -def _matvec_prob_normal_abstract( - vector, w_mu, w_sigma, clen, seed, *, shape, transpose, outdim_parallel -): - assert _get_dtype(vector) in [jnp.float32, jnp.float64] - _w_mu_dtype = _get_dtype(w_mu) - _w_sigma_dtype = _get_dtype(w_sigma) - assert _w_mu_dtype == _w_sigma_dtype, '"w_mu" and "w_sigma" must be same typed.' - assert _w_mu_dtype in [jnp.float32, jnp.float64], '"w_mu" must be float valued.' - assert _w_sigma_dtype in [jnp.float32, jnp.float64], '"w_sigma" must be float valued.' 
- assert _get_dtype(clen) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - assert _get_dtype(seed) in [jnp.int32, jnp.int64, jnp.uint32, jnp.uint64] - - if w_mu.ndim != 1: - raise ValueError('w_mu should be a 1D scalar.') - if w_sigma.ndim != 1: - raise ValueError('w_sigma should be a 1D scalar.') - if clen.ndim != 1: - raise ValueError('clen should be a 1D scalar.') - if vector.ndim != 1: - raise ValueError('vector should be a 1D vector.') - if seed.ndim != 1: - raise ValueError('seed must be a 1D scalar.') - - if len(shape) != 2: - raise ValueError('shape should be a length-2 tuple.') - if not isinstance(transpose, bool): - raise ValueError('transpose must be a boolean value.') - if not isinstance(outdim_parallel, bool): - raise ValueError('outdim_parallel must be a boolean value.') - - out = ShapedArray(dtype=dtypes.canonicalize_dtype(float), - shape=(shape[1] if transpose else shape[0],)) - return [out] - - -def _matvec_prob_normal_cpu_translation( - c, vector, w_mu, w_sigma, clen, seed, *, shape, transpose, outdim_parallel -): - import_brainpylib_cpu_ops() - n_row, n_col = (shape[1], shape[0]) if transpose else shape - - vec_shape = c.get_shape(vector) - out_dtype = vec_shape.element_type() - - if out_dtype == jnp.float32: - type_name = b'_float' - elif out_dtype == jnp.float64: - type_name = b'_double' - else: - raise TypeError - - if outdim_parallel: - fn = b'cpu_matvec_prob_normal' + type_name - else: - fn = b'cpu_matvec_atomic_prob_normal' + type_name - - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, - w_mu, - w_sigma, - clen, - seed, - xla_client.ops.ConstantLiteral(c, n_row), - xla_client.ops.ConstantLiteral(c, n_col)), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(w_mu), - c.get_shape(w_sigma), - c.get_shape(clen), - c.get_shape(seed), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ()), - xla_client.Shape.array_shape(np.dtype(np.uint32), (), ())), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - ) - - -def _matvec_prob_normal_gpu_translation( - c, vector, w_mu, w_sigma, clen, seed, *, shape, transpose, outdim_parallel -): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(mv_prob_homo_p.name) - - event_shape = c.get_shape(vector) - out_dtype = event_shape.element_type() - - if out_dtype == jnp.float32: - type_name = b'_float' - elif out_dtype == jnp.float64: - type_name = b'_double' - else: - raise TypeError(f'Only support float or double, while got {out_dtype}') - opaque = gpu_ops.build_double_size_descriptor(shape[1] if transpose else shape[0], - shape[0] if transpose else shape[1]) - - if outdim_parallel: - fn = b'gpu_jit_csrmv_prob_normal_v2' + type_name - else: - fn = b'gpu_jit_csrmv_atomic_prob_normal_v2' + type_name - - return xla_client.ops.CustomCallWithLayout( - c, - fn, - operands=(vector, - w_mu, - w_sigma, - clen, - seed,), - operand_shapes_with_layout=(c.get_shape(vector), - c.get_shape(w_mu), - c.get_shape(w_sigma), - c.get_shape(clen), - c.get_shape(seed),), - shape_with_layout=xla_client.Shape.tuple_shape( - ( - xla_client.Shape.array_shape(out_dtype, (shape[1] if transpose else shape[0],), (0,)), - ) - ), - opaque=opaque, - ) - - -def _matvec_prob_normal_jvp( - primals, tangents, *, shape, transpose, outdim_parallel -): - vector, w_mu, w_sigma, clen, seed = primals - vector_dot, w_mu_dot, w_sigma_dot, clen_dot, seed_dot = tangents - r = 
mv_prob_normal_p.bind(vector, - w_mu, - w_sigma, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - assert type(w_mu_dot) is ad.Zero - assert type(w_sigma_dot) is ad.Zero - assert type(clen_dot) is ad.Zero - assert type(seed_dot) is ad.Zero - r_dot = mv_prob_normal_p.bind(vector_dot, - w_mu, - w_sigma, - clen, - seed, - shape=shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - return r, r_dot - - -def _matvec_prob_normal_transpose( - ct, vector, w_mu, w_sigma, clen, seed, *, shape, transpose, outdim_parallel -): - assert type(vector) is ad.UndefinedPrimal - assert type(w_mu) is not ad.UndefinedPrimal - assert type(w_sigma) is not ad.UndefinedPrimal - assert type(clen) is not ad.UndefinedPrimal - assert type(seed) is not ad.UndefinedPrimal - - r = mv_prob_normal_p.bind(ct[0], - w_mu, - w_sigma, - clen, - seed, - shape=shape, - transpose=not transpose, - outdim_parallel=not outdim_parallel)[0] - return r, w_mu, w_sigma, clen, seed - - -mv_prob_normal_p = Primitive('matvec_prob_normal') -mv_prob_normal_p.multiple_results = True -mv_prob_normal_p.def_abstract_eval(_matvec_prob_normal_abstract) -mv_prob_normal_p.def_impl(partial(xla.apply_primitive, mv_prob_normal_p)) -# xla.backend_specific_translations['cpu'][mv_prob_normal_p] = _matvec_prob_normal_cpu_translation -# xla.backend_specific_translations['gpu'][mv_prob_normal_p] = _matvec_prob_normal_gpu_translation -register_general_batching(mv_prob_normal_p) -ad.primitive_jvps[mv_prob_normal_p] = _matvec_prob_normal_jvp -ad.primitive_transposes[mv_prob_normal_p] = _matvec_prob_normal_transpose - - -### TAICHI ### -def mv_prob_homo_taichi( - vector: Union[Array, jax.Array], - weight: float, - conn_prob: float, - seed: Optional[int] = None, - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a scalar `weight` at each position. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). - - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. - - Generally, the :math:`M` in ``f(outdim_parallel=True, transpose=False)`` is the same of - the :math:`M^T` used in ``f(outdim_parallel=False, transpose=True)``. - - Similarly, the :math:`M^T` in ``f(outdim_parallel=True, transpose=True)`` is the same - of the :math:`M` used in ``f(outdim_parallel=False, transpose=False)``. - - Parameters - ---------- - vector: Array, ndarray - The vector. - weight: float - The value of the random matrix. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. 
- It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. - """ - vector = as_jax(vector) - if isinstance(weight, float): - weight = as_jax(weight, dtype=vector.dtype) - weight = jnp.atleast_1d(as_jax(weight)) - conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 - clen = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) - if seed is None: - with jax.ensure_compile_time_eval(): - seed = np.random.randint(0, int(1e8), 1) - seed = jnp.asarray(seed, dtype=jnp.uint32) - seed = jnp.atleast_1d(seed) - return raw_mv_prob_homo(vector, weight, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel)[0] - - -def mv_prob_uniform_taichi( - vector: jax.Array, - w_low: float, - w_high: float, - conn_prob: float, - seed: Optional[int] = None, - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a uniform distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). - - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. - - Parameters - ---------- - vector: Array, ndarray - The vector. - w_low: float - Lower boundary of the output interval. - w_high: float - Upper boundary of the output interval. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
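A usage sketch for the uniform variant, again assuming the public brainpy.math.jitconn exports; at each sampled connection the weight is drawn independently between w_low and w_high:

import jax
import jax.numpy as jnp
import brainpy.math as bm

@jax.jit
def step(v):
    # y = M @ v with M regenerated on the fly from (conn_prob, seed)
    return bm.jitconn.mv_prob_uniform(v, w_low=0., w_high=1., conn_prob=0.05,
                                      seed=2024, shape=(1000, 800))

print(step(jnp.ones(800)).shape)   # (1000,)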
- """ - vector = as_jax(vector) - if isinstance(w_low, float): w_low = as_jax(w_low, dtype=vector.dtype) - if isinstance(w_high, float): w_high = as_jax(w_high, dtype=vector.dtype) - w_low = jnp.atleast_1d(as_jax(w_low)) - w_high = jnp.atleast_1d(as_jax(w_high)) - conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 - conn_len = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) - if seed is None: - with jax.ensure_compile_time_eval(): - seed = np.random.randint(0, int(1e8), 1) - seed = jnp.atleast_1d(jnp.asarray(seed, dtype=jnp.uint32)) - return raw_mv_prob_uniform(vector, w_low, w_high, conn_len, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel)[0] - - -def mv_prob_normal_taichi( - vector: jax.Array, - w_mu: float, - w_sigma: float, - conn_prob: float, - seed: Optional[int] = None, - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - r"""Perform the :math:`y=M@v` operation, - where :math:`M` is just-in-time randomly generated with a normal distribution for its value. - - This operator support ``jit()``, ``vmap()``, ``grad()`` and ``pmap()`` etc. transformations - on CPU and GPU devices. - - .. warning:: - - This API may change in the future. - - In this operation, :math:`M` is the random matrix with a connection probability - `conn_prob`, and at each connection the value is the same scalar `weight`. - - When ``transpose=True``, we perform an operation of :math:`y=M^T@v`. - - .. note:: - - Note that the just-in-time generated :math:`M` (`transpose=False`) is - different from the generated :math:`M^T` (`transpose=True`). - - If you pursue the same :math:`M` and :math:`M^T` when performing the just-in-time - matrix generation, you should set ``outdim_parallel=True``, with the sacrifice of - the speed compared with ``outdim_parallel=False``. - - Parameters - ---------- - vector: Array, ndarray - The vector. - w_mu: float - Mean (centre) of the distribution. - w_sigma: float - Standard deviation (spread or “width”) of the distribution. Must be non-negative. - conn_prob: float - The connection probability. - shape: tuple of int - The matrix shape. - seed: int - The random number generation seed. - transpose: bool - Transpose the random matrix or not. - outdim_parallel: bool - Perform the parallel random generations along the out dimension or not. - It can be used to set the just-in-time generated :math:M^T: is the same - as the just-in-time generated :math:`M` when ``transpose=True``. - - Returns - ------- - out: Array, ndarray - The output of :math:`y = M @ v`. 
- """ - vector = as_jax(vector) - if isinstance(w_mu, float): w_mu = as_jax(w_mu, dtype=vector.dtype) - if isinstance(w_sigma, float): w_sigma = as_jax(w_sigma, dtype=vector.dtype) - w_mu = jnp.atleast_1d(as_jax(w_mu)) - w_sigma = jnp.atleast_1d(as_jax(w_sigma)) - conn_len = jnp.ceil(1 / conn_prob) * 2 - 1 - conn_len = jnp.asarray(jnp.atleast_1d(conn_len), dtype=jnp.int32) - if seed is None: - with jax.ensure_compile_time_eval(): - seed = np.random.randint(0, int(1e8), 1) - seed = jnp.atleast_1d(jnp.asarray(seed, dtype=jnp.uint32)) - return raw_mv_prob_normal(vector, w_mu, w_sigma, conn_len, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel)[0] - - -def _reverse(shape): - return shape[::-1] - - -@ti.kernel -def _mv_prob_homo_cpu( - vector: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - v = vector[i_col] * weight0 - while i_row < num_row: - out[i_row] += v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _mv_prob_homo_outdim_parallel_cpu( - vector: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. - key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - r += vector[i_col] - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r * weight0 - - -@ti.kernel -def _mv_prob_homo_gpu( - vector: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - index = i & 31 - col_v = vector[i_col] - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: - out[i_row] += weight0 * col_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _mv_prob_homo_outdim_parallel_gpu( - vector: ti.types.ndarray(ndim=1), - weight: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - weight0 = weight[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - i_thread = i & 31 - i_col = step * i_thread - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. 
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - r += vector[i_col] - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += weight0 * r # TODO: warp-level reduction - - -def _mv_prob_homo_jvp_vector(v_dot, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_homo(v_dot, weight, clen, seed, shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - - -def _mv_prob_homo_jvp_weight(w_dot, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_homo(vector, w_dot, clen, seed, shape=shape, transpose=transpose, outdim_parallel=outdim_parallel) - - -def _mv_prob_homo_transpose( - ct, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - if ad.is_undefined_primal(vector): - if type(ct) is ad.Zero: - return ad.Zero(vector), weight, clen, seed - else: - dv = raw_mv_prob_homo(ct[0], weight, clen, seed, shape=shape, - transpose=not transpose, outdim_parallel=not outdim_parallel)[0] - return dv, weight, clen, seed - elif ad.is_undefined_primal(weight): - if type(ct) is ad.Zero: - return vector, ad.Zero(weight), clen, seed - else: - row = raw_mv_prob_homo(ct[0], jnp.ones(1, dtype=ct[0].dtype), clen, seed, - shape=shape, transpose=transpose, outdim_parallel=outdim_parallel)[0] - dw = jnp.sum(row * vector, keepdims=True) - return vector, dw, clen, seed - else: - assert type(clen) is not ad.UndefinedPrimal, 'Cannot differentiate through clen.' - assert type(seed) is not ad.UndefinedPrimal, 'Cannot differentiate through seed.' 
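For readers without Taichi installed, the homogeneous-weight CPU kernel boils down to the following pure-NumPy mock (illustrative only: NumPy's generator stands in for the per-column LFSR88 streams, so the numbers differ, but the sampling structure is the same):

import numpy as np

def mv_prob_homo_reference(vector, weight, clen, seed, num_row):
    vector = np.asarray(vector, dtype=float)
    out = np.zeros(num_row)
    for i_col, v in enumerate(vector):
        rng = np.random.default_rng(seed + i_col)    # one independent stream per column
        i_row = int(rng.integers(0, clen))           # first hit in [0, clen - 1]
        while i_row < num_row:
            out[i_row] += v * weight                 # every hit adds the same scalar weight
            i_row += int(rng.integers(1, clen + 1))  # skip ahead by [1, clen]
    return out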
- - -def _general_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights): +def _general_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights): if vector.ndim != 1: raise ValueError('vector should be a 1D vector.') if len(shape) != 2: @@ -1437,190 +383,28 @@ def _non_event_checking(vector, clen, seed, shape, outdim_parallel, transpose, * return _general_checking(vector, clen, seed, shape, outdim_parallel, transpose, *weights) -def raw_mv_prob_homo( - vector: jax.Array, - weight: jax.Array, # vector with size 1 - clen: jax.Array, # vector with size 1 - seed: jax.Array, # vector with size 1 - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _non_event_checking(vector, clen, seed, shape, outdim_parallel, transpose, weight) - - if outdim_parallel: - prim = _mv_prob_homo_outdim_parallel_p - else: - prim = _mv_prob_homo_p - - return prim(vector, - weight, - clen, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) - - -def _define_mv_prob_homo_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_mv_prob_homo_jvp_vector, _mv_prob_homo_jvp_weight, None, None) - prim.def_transpose_rule(_mv_prob_homo_transpose) - return prim - - -# outdim_parallel = True -_mv_prob_homo_outdim_parallel_p = _define_mv_prob_homo_prim(cpu_kernel=_mv_prob_homo_outdim_parallel_cpu, - gpu_kernel=_mv_prob_homo_outdim_parallel_gpu) - -# outdim_parallel = False -_mv_prob_homo_p = _define_mv_prob_homo_prim(cpu_kernel=_mv_prob_homo_cpu, - gpu_kernel=_mv_prob_homo_gpu) - - -@ti.kernel -def _mv_prob_uniform_cpu( - vector: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - col_v = vector[i_col] - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: - key, raw_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += col_v * raw_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _mv_prob_uniform_outdim_parallel_cpu( - vector: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. 
- key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, raw_v = lfsr88_uniform(key, w_min0, w_max0) - r += vector[i_col] * raw_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] = r - - -@ti.kernel -def _mv_prob_uniform_gpu( - vector: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - index = i & 31 - col_v = vector[i_col] - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) - out[i_row] += row_v * col_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - - -@ti.kernel -def _mv_prob_uniform_outdim_parallel_gpu( - vector: ti.types.ndarray(ndim=1), - w_min: ti.types.ndarray(ndim=1), - w_max: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) +def _mv_prob_homo_transpose( + ct, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel ): - num_row = out.shape[0] - num_col = vector.shape[0] - w_min0 = w_min[0] - w_max0 = w_max[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - i_thread = i & 31 - i_col = step * i_thread - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. 
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_uniform(key, w_min0, w_max0) - r += vector[i_col] * row_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - out[i_row] += r # TODO: warp-level reduction - - -def _mv_prob_uniform_jvp_vector(v_dot, vector, w_low, w_high, clen, seed, *, - outs, shape, transpose, outdim_parallel): shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(v_dot, w_low, w_high, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel) - - -def _mv_prob_uniform_jvp_wlow(w_dot, vector, w_low, w_high, clen, seed, *, - outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(vector, w_dot, w_high, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel) - - -def _mv_prob_uniform_jvp_whigh(w_dot, vector, w_low, w_high, clen, seed, *, - outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_uniform(vector, w_low, w_dot, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel) + if ad.is_undefined_primal(vector): + if type(ct) is ad.Zero: + return ad.Zero(vector), weight, clen, seed + else: + dv = raw_mv_prob_homo(ct[0], weight, clen, seed, shape=shape, + transpose=not transpose, outdim_parallel=not outdim_parallel)[0] + return dv, weight, clen, seed + elif ad.is_undefined_primal(weight): + if type(ct) is ad.Zero: + return vector, ad.Zero(weight), clen, seed + else: + row = raw_mv_prob_homo(ct[0], jnp.ones(1, dtype=ct[0].dtype), clen, seed, + shape=shape, transpose=transpose, outdim_parallel=outdim_parallel)[0] + dw = jnp.sum(row * vector, keepdims=True) + return vector, dw, clen, seed + else: + assert type(clen) is not ad.UndefinedPrimal, 'Cannot differentiate through clen.' + assert type(seed) is not ad.UndefinedPrimal, 'Cannot differentiate through seed.' def _mv_prob_uniform_transpose( @@ -1641,265 +425,496 @@ def _mv_prob_uniform_transpose( assert type(seed) is not ad.UndefinedPrimal, 'Cannot differentiate through seed.' -def raw_mv_prob_uniform( - vector: jax.Array, - w_low: jax.Array, - w_high: jax.Array, - conn_len: jax.Array, - seed: jax.Array, - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _non_event_checking(vector, conn_len, seed, shape, outdim_parallel, transpose, w_low, w_high) - - if outdim_parallel: - prim = _mv_prob_uniform_outdim_parallel_p +def _mv_prob_normal_transpose( + ct, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel +): + shape = _reverse(shape) if transpose else shape + if ad.is_undefined_primal(vector): + if type(ct) is ad.Zero: + return ad.Zero(vector), w_mu, w_sigma, clen, seed + else: + dv = raw_mv_prob_normal(ct[0], w_mu, w_sigma, clen, seed, shape=shape, + transpose=not transpose, outdim_parallel=not outdim_parallel)[0] + return dv, w_mu, w_sigma, clen, seed else: - prim = _mv_prob_uniform_p + assert type(w_mu) is not ad.UndefinedPrimal, 'Cannot differentiate through w_mu.' + assert type(w_sigma) is not ad.UndefinedPrimal, 'Cannot differentiate through w_sigma.' + assert type(clen) is not ad.UndefinedPrimal, 'Cannot differentiate through clen.' + assert type(seed) is not ad.UndefinedPrimal, 'Cannot differentiate through seed.' 
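Each transpose rule above re-invokes the operator with transpose and outdim_parallel flipped together, which, per the pairing noted in the removed docstrings, regenerates the same matrix sample; the cotangent with respect to vector is then exactly M.T @ ct. A numerical check of that claim, under the same assumed public API:

import jax
import jax.numpy as jnp
import brainpy.math as bm

op = lambda v: bm.jitconn.mv_prob_homo(v, 1.0, 0.1, seed=7, shape=(50, 40))
_, vjp = jax.vjp(op, jnp.ones(40))
(g,) = vjp(jnp.ones(50))                      # g should equal M.T @ ones(50)
v = jnp.arange(40, dtype=jnp.float32)
assert jnp.allclose(jnp.dot(jnp.ones(50), op(v)), jnp.dot(g, v), rtol=1e-4)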
- return prim(vector, - w_low, - w_high, - conn_len, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) +def _reverse(shape): + return shape[::-1] -def _define_mv_prob_uniform_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_mv_prob_uniform_jvp_vector, - _mv_prob_uniform_jvp_wlow, - _mv_prob_uniform_jvp_whigh, - None, - None) - prim.def_transpose_rule(_mv_prob_uniform_transpose) - return prim - - -# outdim_parallel = True -_mv_prob_uniform_outdim_parallel_p = _define_mv_prob_uniform_prim( - cpu_kernel=_mv_prob_uniform_outdim_parallel_cpu, - gpu_kernel=_mv_prob_uniform_outdim_parallel_gpu -) - -# outdim_parallel = False -_mv_prob_uniform_p = _define_mv_prob_uniform_prim( - cpu_kernel=_mv_prob_uniform_cpu, - gpu_kernel=_mv_prob_uniform_gpu -) - - -@ti.kernel -def _mv_prob_normal_cpu( - vector: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_col in range(num_col): - col_v = vector[i_col] - key = lfsr88_key(seed0 + i_col) - key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) - while i_row < num_row: - key, raw_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += col_v * raw_v + +if ti is not None: + from brainpy._src.math.tifunc import (lfsr88_key, lfsr88_random_integers, lfsr88_uniform, lfsr88_normal) + + + @ti.kernel + def _mv_prob_homo_cpu( + vector: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + v = vector[i_col] * weight0 + while i_row < num_row: + out[i_row] += v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_homo_outdim_parallel_cpu( + vector: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. 
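+      # accumulate this row's randomly sampled inputs; the shared scalar
+      # weight is applied once, after the loop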
+ key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: + r += vector[i_col] + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] = r * weight0 + + + @ti.kernel + def _mv_prob_homo_gpu( + vector: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 + index = i & 31 + col_v = vector[i_col] + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) i_row += inc - - -@ti.kernel -def _mv_prob_normal_outdim_parallel_cpu( - vector: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - - for i_row in range(num_row): - r = 0. - key = lfsr88_key(seed0 + i_row) - key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) - while i_col < num_col: - key, raw_v = lfsr88_normal(key, w_mu0, w_sigma0) - r += vector[i_col] * raw_v + while i_row < end: + out[i_row] += weight0 * col_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_homo_outdim_parallel_gpu( + vector: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + weight0 = weight[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 + i_thread = i & 31 + i_col = step * i_thread - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. 
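+      # 32 logical threads cooperate on each output row: thread i_thread scans
+      # its own step-sized stripe of columns and accumulates a partial sum in r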
+ key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) i_col += inc - out[i_row] = r + while i_col < end_col: + r += vector[i_col] + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] += weight0 * r # TODO: warp-level reduction -@ti.kernel -def _mv_prob_normal_gpu( - vector: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_col * 32): - i_col = i >> 5 - index = i & 31 - col_v = vector[i_col] - i_row = step * index - 1 - end = ti.min(i_row + step, num_row) - key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc - while i_row < end: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - out[i_row] += row_v * col_v - key, inc = lfsr88_random_integers(key, 1, clen0) - i_row += inc + def _mv_prob_homo_jvp_vector(v_dot, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_homo(v_dot, weight, clen, seed, shape=shape, transpose=transpose, + outdim_parallel=outdim_parallel) -@ti.kernel -def _mv_prob_normal_outdim_parallel_gpu( - vector: ti.types.ndarray(ndim=1), - w_mu: ti.types.ndarray(ndim=1), - w_sigma: ti.types.ndarray(ndim=1), - clen: ti.types.ndarray(ndim=1), - seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1) -): - num_row = out.shape[0] - num_col = vector.shape[0] - w_mu0 = w_mu[0] - w_sigma0 = w_sigma[0] - clen0 = clen[0] - seed0 = seed[0] - step = ti.u32(ti.max((num_row + 1) >> 5, 1)) - - for i in range(num_row * 32): - i_row = i >> 5 - i_thread = i & 31 - i_col = step * i_thread - 1 - end_col = ti.min(i_col + step, num_col) - r = 0. 
- key = lfsr88_key(seed0 + i) - key, inc = lfsr88_random_integers(key, 1, clen0) - i_col += inc - while i_col < end_col: - key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) - r += vector[i_col] * row_v + def _mv_prob_homo_jvp_weight(w_dot, vector, weight, clen, seed, *, outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_homo(vector, w_dot, clen, seed, shape=shape, transpose=transpose, + outdim_parallel=outdim_parallel) + + + def _define_mv_prob_homo_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_mv_prob_homo_jvp_vector, _mv_prob_homo_jvp_weight, None, None) + prim.def_transpose_rule(_mv_prob_homo_transpose) + return prim + + + # outdim_parallel = True + _mv_prob_homo_outdim_parallel_p = _define_mv_prob_homo_prim(cpu_kernel=_mv_prob_homo_outdim_parallel_cpu, + gpu_kernel=_mv_prob_homo_outdim_parallel_gpu) + + # outdim_parallel = False + _mv_prob_homo_p = _define_mv_prob_homo_prim(cpu_kernel=_mv_prob_homo_cpu, + gpu_kernel=_mv_prob_homo_gpu) + + + @ti.kernel + def _mv_prob_uniform_cpu( + vector: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + col_v = vector[i_col] + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, raw_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += col_v * raw_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_uniform_outdim_parallel_cpu( + vector: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. 
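+      # one LFSR88 stream per output row: walk the sampled columns and draw a
+      # fresh uniform weight between w_min0 and w_max0 for every hit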
+ key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: + key, raw_v = lfsr88_uniform(key, w_min0, w_max0) + r += vector[i_col] * raw_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] = r + + + @ti.kernel + def _mv_prob_uniform_gpu( + vector: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 + index = i & 31 + col_v = vector[i_col] + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + out[i_row] += row_v * col_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_uniform_outdim_parallel_gpu( + vector: ti.types.ndarray(ndim=1), + w_min: ti.types.ndarray(ndim=1), + w_max: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_min0 = w_min[0] + w_max0 = w_max[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 + i_thread = i & 31 + i_col = step * i_thread - 1 + end_col = ti.min(i_col + step, num_col) + r = 0. + key = lfsr88_key(seed0 + i) key, inc = lfsr88_random_integers(key, 1, clen0) i_col += inc - out[i_row] += r # TODO: warp-level reduction + while i_col < end_col: + key, row_v = lfsr88_uniform(key, w_min0, w_max0) + r += vector[i_col] * row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] += r # TODO: warp-level reduction -def _mv_prob_normal_jvp_vector(v_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(v_dot, w_mu, w_sigma, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel) + def _mv_prob_uniform_jvp_vector(v_dot, vector, w_low, w_high, clen, seed, *, + outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(v_dot, w_low, w_high, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) -def _mv_prob_normal_jvp_w_mu(w_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(vector, w_dot, w_sigma, clen, seed, shape=shape, - transpose=transpose, outdim_parallel=outdim_parallel) + def _mv_prob_uniform_jvp_wlow(w_dot, vector, w_low, w_high, clen, seed, *, + outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(vector, w_dot, w_high, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) -def _mv_prob_normal_jvp_w_sigma(w_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): - shape = _reverse(shape) if transpose else shape - return raw_mv_prob_normal(vector, w_mu, w_dot, clen, seed, shape=shape, - 
transpose=transpose, outdim_parallel=outdim_parallel) + def _mv_prob_uniform_jvp_whigh(w_dot, vector, w_low, w_high, clen, seed, *, + outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_uniform(vector, w_low, w_dot, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) -def _mv_prob_normal_transpose( - ct, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel -): - shape = _reverse(shape) if transpose else shape - if ad.is_undefined_primal(vector): - if type(ct) is ad.Zero: - return ad.Zero(vector), w_mu, w_sigma, clen, seed - else: - dv = raw_mv_prob_normal(ct[0], w_mu, w_sigma, clen, seed, shape=shape, - transpose=not transpose, outdim_parallel=not outdim_parallel)[0] - return dv, w_mu, w_sigma, clen, seed - else: - assert type(w_mu) is not ad.UndefinedPrimal, 'Cannot differentiate through w_mu.' - assert type(w_sigma) is not ad.UndefinedPrimal, 'Cannot differentiate through w_sigma.' - assert type(clen) is not ad.UndefinedPrimal, 'Cannot differentiate through clen.' - assert type(seed) is not ad.UndefinedPrimal, 'Cannot differentiate through seed.' + def _define_mv_prob_uniform_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_mv_prob_uniform_jvp_vector, + _mv_prob_uniform_jvp_wlow, + _mv_prob_uniform_jvp_whigh, + None, + None) + prim.def_transpose_rule(_mv_prob_uniform_transpose) + return prim -def raw_mv_prob_normal( - vector: jax.Array, - w_mu: jax.Array, - w_sigma: jax.Array, - conn_len: jax.Array, - seed: jax.Array, - *, - shape: Tuple[int, int], - transpose: bool = False, - outdim_parallel: bool = True, -) -> jax.Array: - mat_shape, out_shape = _non_event_checking(vector, conn_len, seed, shape, outdim_parallel, transpose, w_mu, w_sigma) + # outdim_parallel = True + _mv_prob_uniform_outdim_parallel_p = _define_mv_prob_uniform_prim( + cpu_kernel=_mv_prob_uniform_outdim_parallel_cpu, + gpu_kernel=_mv_prob_uniform_outdim_parallel_gpu + ) - if outdim_parallel: - prim = _mv_prob_normal_outdim_parallel_p - else: - prim = _mv_prob_normal_p + # outdim_parallel = False + _mv_prob_uniform_p = _define_mv_prob_uniform_prim( + cpu_kernel=_mv_prob_uniform_cpu, + gpu_kernel=_mv_prob_uniform_gpu + ) - return prim(vector, - w_mu, - w_sigma, - conn_len, - seed, - outs=[jax.ShapeDtypeStruct(shape=out_shape, dtype=vector.dtype)], - shape=mat_shape, - transpose=transpose, - outdim_parallel=outdim_parallel) + @ti.kernel + def _mv_prob_normal_cpu( + vector: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_col in range(num_col): + col_v = vector[i_col] + key = lfsr88_key(seed0 + i_col) + key, i_row = lfsr88_random_integers(key, 0, clen0 - 1) + while i_row < num_row: + key, raw_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += col_v * raw_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_normal_outdim_parallel_cpu( + vector: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_mu0 = w_mu[0] 
+ w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + + for i_row in range(num_row): + r = 0. + key = lfsr88_key(seed0 + i_row) + key, i_col = lfsr88_random_integers(key, 0, clen0 - 1) + while i_col < num_col: + key, raw_v = lfsr88_normal(key, w_mu0, w_sigma0) + r += vector[i_col] * raw_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] = r + + + @ti.kernel + def _mv_prob_normal_gpu( + vector: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.uint32(ti.max((num_row + 1) >> 5, 1)) + + for i in range(num_col * 32): + i_col = i >> 5 + index = i & 31 + col_v = vector[i_col] + i_row = step * index - 1 + end = ti.min(i_row + step, num_row) + key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + while i_row < end: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + out[i_row] += row_v * col_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_row += inc + + + @ti.kernel + def _mv_prob_normal_outdim_parallel_gpu( + vector: ti.types.ndarray(ndim=1), + w_mu: ti.types.ndarray(ndim=1), + w_sigma: ti.types.ndarray(ndim=1), + clen: ti.types.ndarray(ndim=1), + seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1) + ): + num_row = out.shape[0] + num_col = vector.shape[0] + w_mu0 = w_mu[0] + w_sigma0 = w_sigma[0] + clen0 = clen[0] + seed0 = seed[0] + step = ti.u32(ti.max((num_col + 1) >> 5, 1)) + + for i in range(num_row * 32): + i_row = i >> 5 + i_thread = i & 31 + i_col = step * i_thread - 1 + end_col = ti.min(i_col + step, num_col) + r = 0.
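+ # each of the 32 lanes assigned to row i_row accumulates a partial sum over its own column stripe, then adds it into out[i_row]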
+ key = lfsr88_key(seed0 + i) + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + while i_col < end_col: + key, row_v = lfsr88_normal(key, w_mu0, w_sigma0) + r += vector[i_col] * row_v + key, inc = lfsr88_random_integers(key, 1, clen0) + i_col += inc + out[i_row] += r # TODO: warp-level reduction + + + def _mv_prob_normal_jvp_vector(v_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(v_dot, w_mu, w_sigma, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) + + + def _mv_prob_normal_jvp_w_mu(w_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(vector, w_dot, w_sigma, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) -def _define_mv_prob_normal_prim(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_mv_prob_normal_jvp_vector, - _mv_prob_normal_jvp_w_mu, - _mv_prob_normal_jvp_w_sigma, - None, - None) - prim.def_transpose_rule(_mv_prob_normal_transpose) - return prim - - -# outdim_parallel = True -_mv_prob_normal_outdim_parallel_p = _define_mv_prob_normal_prim( - cpu_kernel=_mv_prob_normal_outdim_parallel_cpu, - gpu_kernel=_mv_prob_normal_outdim_parallel_gpu -) - -# outdim_parallel = False -_mv_prob_normal_p = _define_mv_prob_normal_prim( - cpu_kernel=_mv_prob_normal_cpu, - gpu_kernel=_mv_prob_normal_gpu -) + + def _mv_prob_normal_jvp_w_sigma(w_dot, vector, w_mu, w_sigma, clen, seed, *, outs, shape, transpose, outdim_parallel): + shape = _reverse(shape) if transpose else shape + return raw_mv_prob_normal(vector, w_mu, w_dot, clen, seed, shape=shape, + transpose=transpose, outdim_parallel=outdim_parallel) + + + def _define_mv_prob_normal_prim(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_mv_prob_normal_jvp_vector, + _mv_prob_normal_jvp_w_mu, + _mv_prob_normal_jvp_w_sigma, + None, + None) + prim.def_transpose_rule(_mv_prob_normal_transpose) + return prim + + + # outdim_parallel = True + _mv_prob_normal_outdim_parallel_p = _define_mv_prob_normal_prim( + cpu_kernel=_mv_prob_normal_outdim_parallel_cpu, + gpu_kernel=_mv_prob_normal_outdim_parallel_gpu + ) + + # outdim_parallel = False + _mv_prob_normal_p = _define_mv_prob_normal_prim( + cpu_kernel=_mv_prob_normal_cpu, + gpu_kernel=_mv_prob_normal_gpu + ) diff --git a/brainpy/_src/math/jitconn/tests/test_event_matvec.py b/brainpy/_src/math/jitconn/tests/test_event_matvec.py index b10d55d21..d8e086540 100644 --- a/brainpy/_src/math/jitconn/tests/test_event_matvec.py +++ b/brainpy/_src/math/jitconn/tests/test_event_matvec.py @@ -4,8 +4,14 @@ import jax import jax.numpy as jnp from absl.testing import parameterized +import pytest import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + shapes = [(100, 200), (10, 1000), (2, 1000), (1000, 10), (1000, 2)] shapes = [(100, 200), (2, 1000), (1000, 2)] diff --git a/brainpy/_src/math/jitconn/tests/test_matvec.py b/brainpy/_src/math/jitconn/tests/test_matvec.py index 2e6e406cf..8a0ae444d 100644 --- a/brainpy/_src/math/jitconn/tests/test_matvec.py +++ b/brainpy/_src/math/jitconn/tests/test_matvec.py @@ -4,8 +4,13 @@ import jax import jax.numpy as jnp from absl.testing 
import parameterized +import pytest import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) shapes = [(100, 200), (10, 1000), (2, 1000), (1000, 10), (1000, 2)] shapes = [(100, 200), (2, 1000), (1000, 2)] diff --git a/brainpy/_src/math/object_transform/naming.py b/brainpy/_src/math/object_transform/naming.py index 1c8ca6ef9..6326929c4 100644 --- a/brainpy/_src/math/object_transform/naming.py +++ b/brainpy/_src/math/object_transform/naming.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- - +import gc import warnings from brainpy import errors @@ -11,6 +11,7 @@ _name2id = dict() _typed_names = {} +_fun2stack = dict() def check_name_uniqueness(name, obj): @@ -49,9 +50,6 @@ def clear_name_cache(ignore_warn=False): warnings.warn(f'All named models and their ids are cleared.', UserWarning) -_fun2stack = dict() - - def cache_stack(func, stack): _fun2stack[func] = stack @@ -59,6 +57,7 @@ def cache_stack(func, stack): def clear_stack_cache(): for k in tuple(_fun2stack.keys()): del _fun2stack[k] + gc.collect() def get_stack_cache(func): diff --git a/brainpy/_src/math/op_register/__init__.py b/brainpy/_src/math/op_register/__init__.py index 01f77dbca..ed687eea5 100644 --- a/brainpy/_src/math/op_register/__init__.py +++ b/brainpy/_src/math/op_register/__init__.py @@ -1,7 +1,6 @@ - -from .numba_approach import (CustomOpByNumba, - register_op_with_numba, - compile_cpu_signature_with_numba) -from .taichi_aot_based import clean_caches, check_kernels_count -from .base import XLACustomOp -from .utils import register_general_batching +from .numba_approach import (CustomOpByNumba, + register_op_with_numba, + compile_cpu_signature_with_numba) +from .base import XLACustomOp +from .utils import register_general_batching +from .taichi_aot_based import clean_caches, check_kernels_count diff --git a/brainpy/_src/math/op_register/base.py b/brainpy/_src/math/op_register/base.py index 1824ac911..ca070a197 100644 --- a/brainpy/_src/math/op_register/base.py +++ b/brainpy/_src/math/op_register/base.py @@ -4,8 +4,8 @@ import jax import numpy as np from jax.interpreters import xla, batching, ad, mlir -from numba.core.dispatcher import Dispatcher +from brainpy._src.dependency_check import import_numba from brainpy._src.math.ndarray import Array from brainpy._src.math.object_transform.base import BrainPyObject @@ -20,6 +20,8 @@ from .utils import register_general_batching from brainpy._src.math.op_register.ad_support import defjvp +numba = import_numba(error_if_not_found=False) + __all__ = [ 'XLACustomOp', ] @@ -104,24 +106,30 @@ def __init__( self.primitive.def_impl(partial(xla.apply_primitive, self.primitive)) # cpu function + cpu_checked = False if cpu_kernel is None: - pass - elif isinstance(cpu_kernel, Dispatcher): # numba - register_numba_cpu_translation_rule(self.primitive, cpu_kernel) - elif hasattr(cpu_kernel, '_is_wrapped_kernel') and cpu_kernel._is_wrapped_kernel: # taichi + cpu_checked = True + if numba is not None: # numba + from numba.core.dispatcher import Dispatcher + if isinstance(cpu_kernel, Dispatcher): + register_numba_cpu_translation_rule(self.primitive, cpu_kernel) + cpu_checked = True + if hasattr(cpu_kernel, '_is_wrapped_kernel') and cpu_kernel._is_wrapped_kernel: # taichi register_taichi_cpu_translation_rule(self.primitive, cpu_kernel) - else: + cpu_checked = True
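+ # cpu_checked stays False when no backend accepted cpu_kernel; the error below reports the invalid argument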
+ if not cpu_checked: + raise ValueError(f'"cpu_kernel" must be a numba jitted function or a taichi kernel function. ' f'But we got {cpu_kernel}') # gpu function + gpu_checked = False if gpu_kernel is None: - pass - elif hasattr(gpu_kernel, '_is_wrapped_kernel') and gpu_kernel._is_wrapped_kernel: # taichi + gpu_checked = True + if hasattr(gpu_kernel, '_is_wrapped_kernel') and gpu_kernel._is_wrapped_kernel: # taichi register_taichi_gpu_translation_rule(self.primitive, gpu_kernel) - else: - raise ValueError(f'"cpu_kernel" must be a taichi kernel function. ' - f'But we got {gpu_kernel}') + gpu_checked = True + if not gpu_checked: + raise ValueError(f'"gpu_kernel" must be a taichi kernel function. But we got {gpu_kernel}') # batching rule if batching_translation is None: diff --git a/brainpy/_src/math/op_register/numba_approach/__init__.py b/brainpy/_src/math/op_register/numba_approach/__init__.py index cc2ce5b4c..5bbd04e0c 100644 --- a/brainpy/_src/math/op_register/numba_approach/__init__.py +++ b/brainpy/_src/math/op_register/numba_approach/__init__.py @@ -1,20 +1,22 @@ # -*- coding: utf-8 -*- -import warnings from functools import partial from typing import Callable from typing import Union, Sequence -import numba import jax from jax.interpreters import xla, batching, ad from jax.tree_util import tree_map -from numba.core.dispatcher import Dispatcher +from brainpy._src.dependency_check import import_numba from brainpy._src.math.ndarray import Array from brainpy._src.math.object_transform.base import BrainPyObject +from brainpy.errors import PackageMissingError from .cpu_translation import _cpu_translation, compile_cpu_signature_with_numba +numba = import_numba(error_if_not_found=False) + + __all__ = [ 'CustomOpByNumba', 'register_op_with_numba', @@ -137,6 +139,9 @@ def register_op_with_numba( f'For more information, please refer to the documentation: ' f'https://brainpy.readthedocs.io/en/latest/tutorial_advanced/operator_custom_with_taichi.html.') + if numba is None: + raise PackageMissingError.by_purpose('numba', 'custom op with numba') + if out_shapes is None: raise RuntimeError('out_shapes cannot be None. It can be a `ShapedArray` or ' 'a sequence of `ShapedArray`.
If it is a function, it takes as input the argument ' @@ -146,6 +151,7 @@ def register_op_with_numba( prim.multiple_results = multiple_results # user defined function + from numba.core.dispatcher import Dispatcher if not isinstance(cpu_func, Dispatcher): cpu_func = numba.jit(fastmath=True, nopython=True)(cpu_func) @@ -196,5 +202,3 @@ def abs_eval_rule(*input_shapes, **info): ad.primitive_transposes[prim] = transpose_translation return prim - - diff --git a/brainpy/_src/math/op_register/numba_approach/cpu_translation.py b/brainpy/_src/math/op_register/numba_approach/cpu_translation.py index 13974b5b2..4b06effdf 100644 --- a/brainpy/_src/math/op_register/numba_approach/cpu_translation.py +++ b/brainpy/_src/math/op_register/numba_approach/cpu_translation.py @@ -1,146 +1,152 @@ -# -*- coding: utf-8 -*- - -import ctypes - -from jax import dtypes, numpy as jnp -from jax.core import ShapedArray -from jax.lib import xla_client -from numba import types, carray, cfunc - -__all__ = [ - 'compile_cpu_signature_with_numba' -] - -ctypes.pythonapi.PyCapsule_New.argtypes = [ - ctypes.c_void_p, # void* pointer - ctypes.c_char_p, # const char *name - ctypes.c_void_p, # PyCapsule_Destructor destructor -] -ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object - - -def _cpu_translation(func, abs_eval_fn, multiple_results, c, *inputs, **info): - target_name, inputs, input_shapes, xla_output_shapes = \ - compile_cpu_signature_with_numba(c, func, abs_eval_fn, multiple_results, inputs, info) - return xla_client.ops.CustomCallWithLayout( - c, - target_name, - operands=inputs, - operand_shapes_with_layout=input_shapes, - shape_with_layout=xla_output_shapes, - ) - - -def _cpu_signature( - func, - input_dtypes, - input_shapes, - output_dtypes, - output_shapes, - multiple_results: bool, - debug: bool = False -): - code_scope = dict( - func_to_call=func, - input_shapes=input_shapes, - input_dtypes=input_dtypes, - output_shapes=output_shapes, - output_dtypes=output_dtypes, - carray=carray, - ) - - # inputs - if len(input_shapes) > 1: - args_in = [ - f'carray(input_ptrs[{i}], input_shapes[{i}], dtype=input_dtypes[{i}]),' - for i in range(len(input_shapes)) - ] - args_in = '(\n ' + "\n ".join(args_in) + '\n )' - else: - args_in = 'carray(input_ptrs[0], input_shapes[0], dtype=input_dtypes[0])' - - # outputs - if multiple_results: - args_out = [ - f'carray(output_ptrs[{i}], output_shapes[{i}], dtype=output_dtypes[{i}]),' - for i in range(len(output_shapes)) - ] - args_out = '(\n ' + "\n ".join(args_out) + '\n )' - else: - args_out = 'carray(output_ptrs, output_shapes[0], dtype=output_dtypes[0])' - - # function body - code_string = ''' -def xla_cpu_custom_call_target(output_ptrs, input_ptrs): - args_out = {args_out} - args_in = {args_in} - func_to_call(args_out, args_in) - '''.format(args_in=args_in, - args_out=args_out) - if debug: print(code_string) - exec(compile(code_string.strip(), '', 'exec'), code_scope) - - new_f = code_scope['xla_cpu_custom_call_target'] - if multiple_results: - xla_c_rule = cfunc(types.void(types.CPointer(types.voidptr), - types.CPointer(types.voidptr)))(new_f) - else: - xla_c_rule = cfunc(types.void(types.voidptr, types.CPointer(types.voidptr)))(new_f) - target_name = xla_c_rule.native_name.encode("ascii") - capsule = ctypes.pythonapi.PyCapsule_New( - xla_c_rule.address, # A CFFI pointer to a function - b"xla._CUSTOM_CALL_TARGET", # A binary string - None # PyCapsule object run at destruction - ) - xla_client.register_custom_call_target(target_name, capsule, "cpu") - return target_name - - -def 
compile_cpu_signature_with_numba( - c, - func, - abs_eval_fn, - multiple_results, - inputs: tuple, - description: dict = None, -): - input_layouts = [c.get_shape(arg) for arg in inputs] - info_inputs = [] - if description is None: description = dict() - for v in description.values(): - if isinstance(v, (int, float)): - input_layouts.append(xla_client.Shape.array_shape(dtypes.canonicalize_dtype(type(v)), (), ())) - info_inputs.append(xla_client.ops.ConstantLiteral(c, v)) - elif isinstance(v, (tuple, list)): - v = jnp.asarray(v) - input_layouts.append(xla_client.Shape.array_shape(v.dtype, v.shape, tuple(range(len(v.shape) - 1, -1, -1)))) - info_inputs.append(xla_client.ops.Constant(c, v)) - else: - raise TypeError - input_layouts = tuple(input_layouts) - input_dtypes = tuple(shape.element_type() for shape in input_layouts) - input_dimensions = tuple(shape.dimensions() for shape in input_layouts) - output_abstract_arrays = abs_eval_fn(*tuple(ShapedArray(shape.dimensions(), shape.element_type()) - for shape in input_layouts[:len(inputs)]), - **description) - if isinstance(output_abstract_arrays, ShapedArray): - output_abstract_arrays = (output_abstract_arrays,) - assert not multiple_results - else: - assert multiple_results - output_shapes = tuple(array.shape for array in output_abstract_arrays) - output_dtypes = tuple(array.dtype for array in output_abstract_arrays) - output_layouts = map(lambda shape: range(len(shape) - 1, -1, -1), output_shapes) - target_name = _cpu_signature(func, - input_dtypes, - input_dimensions, - output_dtypes, - output_shapes, - multiple_results, - debug=False) - output_layouts = [xla_client.Shape.array_shape(*arg) - for arg in zip(output_dtypes, output_shapes, output_layouts)] - output_layouts = (xla_client.Shape.tuple_shape(output_layouts) - if multiple_results else - output_layouts[0]) - return target_name, tuple(inputs) + tuple(info_inputs), input_layouts, output_layouts +# -*- coding: utf-8 -*- + +import ctypes + +from jax import dtypes, numpy as jnp +from jax.core import ShapedArray +from jax.lib import xla_client + +from brainpy._src.dependency_check import import_numba + +numba = import_numba(error_if_not_found=False) +ctypes.pythonapi.PyCapsule_New.argtypes = [ + ctypes.c_void_p, # void* pointer + ctypes.c_char_p, # const char *name + ctypes.c_void_p, # PyCapsule_Destructor destructor +] +ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + +__all__ = [ + '_cpu_translation', + 'compile_cpu_signature_with_numba', +] + +if numba is not None: + from numba import types, carray, cfunc + + +def _cpu_translation(func, abs_eval_fn, multiple_results, c, *inputs, **info): + target_name, inputs, input_shapes, xla_output_shapes = \ + compile_cpu_signature_with_numba(c, func, abs_eval_fn, multiple_results, inputs, info) + return xla_client.ops.CustomCallWithLayout( + c, + target_name, + operands=inputs, + operand_shapes_with_layout=input_shapes, + shape_with_layout=xla_output_shapes, + ) + + +def _cpu_signature( + func, + input_dtypes, + input_shapes, + output_dtypes, + output_shapes, + multiple_results: bool, + debug: bool = False +): + code_scope = dict( + func_to_call=func, + input_shapes=input_shapes, + input_dtypes=input_dtypes, + output_shapes=output_shapes, + output_dtypes=output_dtypes, + carray=carray, + ) + + # inputs + if len(input_shapes) > 1: + args_in = [ + f'carray(input_ptrs[{i}], input_shapes[{i}], dtype=input_dtypes[{i}]),' + for i in range(len(input_shapes)) + ] + args_in = '(\n ' + "\n ".join(args_in) + '\n )' + else: + args_in = 
'carray(input_ptrs[0], input_shapes[0], dtype=input_dtypes[0])' + + # outputs + if multiple_results: + args_out = [ + f'carray(output_ptrs[{i}], output_shapes[{i}], dtype=output_dtypes[{i}]),' + for i in range(len(output_shapes)) + ] + args_out = '(\n ' + "\n ".join(args_out) + '\n )' + else: + args_out = 'carray(output_ptrs, output_shapes[0], dtype=output_dtypes[0])' + + # function body + code_string = ''' +def xla_cpu_custom_call_target(output_ptrs, input_ptrs): + args_out = {args_out} + args_in = {args_in} + func_to_call(args_out, args_in) + '''.format(args_in=args_in, + args_out=args_out) + if debug: print(code_string) + exec(compile(code_string.strip(), '', 'exec'), code_scope) + + new_f = code_scope['xla_cpu_custom_call_target'] + if multiple_results: + xla_c_rule = cfunc(types.void(types.CPointer(types.voidptr), + types.CPointer(types.voidptr)))(new_f) + else: + xla_c_rule = cfunc(types.void(types.voidptr, types.CPointer(types.voidptr)))(new_f) + target_name = xla_c_rule.native_name.encode("ascii") + capsule = ctypes.pythonapi.PyCapsule_New( + xla_c_rule.address, # A CFFI pointer to a function + b"xla._CUSTOM_CALL_TARGET", # A binary string + None # PyCapsule object run at destruction + ) + xla_client.register_custom_call_target(target_name, capsule, "cpu") + return target_name + + +def compile_cpu_signature_with_numba( + c, + func, + abs_eval_fn, + multiple_results, + inputs: tuple, + description: dict = None, +): + input_layouts = [c.get_shape(arg) for arg in inputs] + info_inputs = [] + if description is None: description = dict() + for v in description.values(): + if isinstance(v, (int, float)): + input_layouts.append(xla_client.Shape.array_shape(dtypes.canonicalize_dtype(type(v)), (), ())) + info_inputs.append(xla_client.ops.ConstantLiteral(c, v)) + elif isinstance(v, (tuple, list)): + v = jnp.asarray(v) + input_layouts.append(xla_client.Shape.array_shape(v.dtype, v.shape, tuple(range(len(v.shape) - 1, -1, -1)))) + info_inputs.append(xla_client.ops.Constant(c, v)) + else: + raise TypeError + input_layouts = tuple(input_layouts) + input_dtypes = tuple(shape.element_type() for shape in input_layouts) + input_dimensions = tuple(shape.dimensions() for shape in input_layouts) + output_abstract_arrays = abs_eval_fn(*tuple(ShapedArray(shape.dimensions(), shape.element_type()) + for shape in input_layouts[:len(inputs)]), + **description) + if isinstance(output_abstract_arrays, ShapedArray): + output_abstract_arrays = (output_abstract_arrays,) + assert not multiple_results + else: + assert multiple_results + output_shapes = tuple(array.shape for array in output_abstract_arrays) + output_dtypes = tuple(array.dtype for array in output_abstract_arrays) + output_layouts = map(lambda shape: range(len(shape) - 1, -1, -1), output_shapes) + target_name = _cpu_signature(func, + input_dtypes, + input_dimensions, + output_dtypes, + output_shapes, + multiple_results, + debug=False) + output_layouts = [xla_client.Shape.array_shape(*arg) + for arg in zip(output_dtypes, output_shapes, output_layouts)] + output_layouts = (xla_client.Shape.tuple_shape(output_layouts) + if multiple_results else + output_layouts[0]) + return target_name, tuple(inputs) + tuple(info_inputs), input_layouts, output_layouts diff --git a/brainpy/_src/math/op_register/numba_based.py b/brainpy/_src/math/op_register/numba_based.py index fb76aed24..f461f4277 100644 --- a/brainpy/_src/math/op_register/numba_based.py +++ b/brainpy/_src/math/op_register/numba_based.py @@ -6,17 +6,20 @@ from jax.interpreters import xla, mlir from 
jax.lib import xla_client from jaxlib.hlo_helpers import custom_call -from numba import types, carray, cfunc +from brainpy._src.dependency_check import import_numba +from brainpy.errors import PackageMissingError from .utils import _shape_to_layout +numba = import_numba(error_if_not_found=False) +if numba is not None: + from numba import types, carray, cfunc __all__ = [ 'register_numba_xla_cpu_translation_rule', 'register_numba_mlir_cpu_translation_rule', ] - # [void* pointer, # const char *name, # PyCapsule_Destructor destructor] @@ -104,6 +107,9 @@ def _numba_xla_cpu_translation_rule(kernel, debug: bool, c, *ins, **kwargs): def register_numba_xla_cpu_translation_rule(primitive, cpu_kernel, debug=False): + if numba is None: + raise PackageMissingError.by_purpose("numba", 'register numba xla cpu translation rule') + # do not support after jax >= 0.4.24 xla.backend_specific_translations['cpu'][primitive] = partial(_numba_xla_cpu_translation_rule, cpu_kernel, @@ -168,5 +174,8 @@ def numba_cpu_custom_call_target(output_ptrs, input_ptrs): def register_numba_mlir_cpu_translation_rule(primitive, cpu_kernel, debug=False): + if numba is None: + raise PackageMissingError.by_purpose("numba", 'register numba mlir cpu translation rule') + rule = partial(_numba_mlir_cpu_translation_rule, cpu_kernel, debug) mlir.register_lowering(primitive, rule, platform='cpu') diff --git a/brainpy/_src/math/op_register/tests/test_ad_support.py b/brainpy/_src/math/op_register/tests/test_ad_support.py index 24f010a12..2c9f09724 100644 --- a/brainpy/_src/math/op_register/tests/test_ad_support.py +++ b/brainpy/_src/math/op_register/tests/test_ad_support.py @@ -1,13 +1,18 @@ +import pytest from typing import Tuple import jax -import numba from jax import core from jax import numpy as jnp from jax.interpreters import ad import brainpy as bp import brainpy.math as bm +from brainpy._src.dependency_check import import_numba + +numba = import_numba(error_if_not_found=False) +if numba is None: + pytest.skip('no numba', allow_module_level=True) bm.set_platform('cpu') diff --git a/brainpy/_src/math/op_register/tests/test_numba_based.py b/brainpy/_src/math/op_register/tests/test_numba_based.py index 968155ef9..dc093f624 100644 --- a/brainpy/_src/math/op_register/tests/test_numba_based.py +++ b/brainpy/_src/math/op_register/tests/test_numba_based.py @@ -1,6 +1,11 @@ +import pytest import jax.core import brainpy.math as bm -import numba + +from brainpy._src.dependency_check import import_numba +numba = import_numba(error_if_not_found=False) +if numba is None: + pytest.skip('no numba', allow_module_level=True) bm.set_platform('cpu') diff --git a/brainpy/_src/math/op_register/tests/test_taichi_based.py b/brainpy/_src/math/op_register/tests/test_taichi_based.py index 03023754c..4db38fbcb 100644 --- a/brainpy/_src/math/op_register/tests/test_taichi_based.py +++ b/brainpy/_src/math/op_register/tests/test_taichi_based.py @@ -1,9 +1,14 @@ +import pytest import jax import jax.numpy as jnp -import taichi as ti import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi +ti = import_taichi(error_if_not_found=False) +if ti is None: + pytest.skip('no taichi', allow_module_level=True) + bm.set_platform('cpu') diff --git a/brainpy/_src/math/op_register/tests/test_taichi_clean_cache.py b/brainpy/_src/math/op_register/tests/test_taichi_clean_cache.py index 1bebcdafe..51c964b29 100644 --- a/brainpy/_src/math/op_register/tests/test_taichi_clean_cache.py +++ b/brainpy/_src/math/op_register/tests/test_taichi_clean_cache.py @@ -1,54
+1,58 @@ -import brainpy.math as bm -import jax -import jax.numpy as jnp -import platform -import pytest -import taichi - -if not platform.platform().startswith('Windows'): - pytest.skip(allow_module_level=True) - -@taichi.func -def get_weight(weight: taichi.types.ndarray(ndim=1)) -> taichi.f32: - return weight[0] - - -@taichi.func -def update_output(out: taichi.types.ndarray(ndim=1), index: taichi.i32, weight_val: taichi.f32): - out[index] += weight_val - -@taichi.kernel -def event_ell_cpu(indices: taichi.types.ndarray(ndim=2), - vector: taichi.types.ndarray(ndim=1), - weight: taichi.types.ndarray(ndim=1), - out: taichi.types.ndarray(ndim=1)): - weight_val = get_weight(weight) - num_rows, num_cols = indices.shape - taichi.loop_config(serialize=True) - for i in range(num_rows): - if vector[i]: - for j in range(num_cols): - update_output(out, indices[i, j], weight_val) - -prim = bm.XLACustomOp(cpu_kernel=event_ell_cpu) - -def test_taichi_clean_cache(): - s = 1000 - indices = bm.random.randint(0, s, (s, 1000)) - vector = bm.random.rand(s) < 0.1 - weight = bm.array([1.0]) - - out = prim(indices, vector, weight, outs=[jax.ShapeDtypeStruct((s,), dtype=jnp.float32)]) - - out = prim(indices, vector, weight, outs=[jax.ShapeDtypeStruct((s,), dtype=jnp.float32)]) - - print(out) - bm.clear_buffer_memory() - - print('kernels: ', bm.check_kernels_count()) - - bm.clean_caches() - - print('kernels: ', bm.check_kernels_count()) - +import brainpy.math as bm +import jax +import jax.numpy as jnp +import platform +import pytest + +from brainpy._src.dependency_check import import_taichi +ti = import_taichi(error_if_not_found=False) +if ti is None: + pytest.skip('no taichi', allow_module_level=True) + +if not platform.platform().startswith('Windows'): + pytest.skip(allow_module_level=True) + +@ti.func +def get_weight(weight: ti.types.ndarray(ndim=1)) -> ti.f32: + return weight[0] + + +@ti.func +def update_output(out: ti.types.ndarray(ndim=1), index: ti.i32, weight_val: ti.f32): + out[index] += weight_val + +@ti.kernel +def event_ell_cpu(indices: ti.types.ndarray(ndim=2), + vector: ti.types.ndarray(ndim=1), + weight: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + weight_val = get_weight(weight) + num_rows, num_cols = indices.shape + ti.loop_config(serialize=True) + for i in range(num_rows): + if vector[i]: + for j in range(num_cols): + update_output(out, indices[i, j], weight_val) + +prim = bm.XLACustomOp(cpu_kernel=event_ell_cpu) + +def test_taichi_clean_cache(): + s = 1000 + indices = bm.random.randint(0, s, (s, 1000)) + vector = bm.random.rand(s) < 0.1 + weight = bm.array([1.0]) + + out = prim(indices, vector, weight, outs=[jax.ShapeDtypeStruct((s,), dtype=jnp.float32)]) + + out = prim(indices, vector, weight, outs=[jax.ShapeDtypeStruct((s,), dtype=jnp.float32)]) + + print(out) + bm.clear_buffer_memory() + + print('kernels: ', bm.check_kernels_count()) + + bm.clean_caches() + + print('kernels: ', bm.check_kernels_count()) + # test_taichi_clean_cache() \ No newline at end of file diff --git a/brainpy/_src/math/sparse/__init__.py b/brainpy/_src/math/sparse/__init__.py index d45f2c80b..d53533247 100644 --- a/brainpy/_src/math/sparse/__init__.py +++ b/brainpy/_src/math/sparse/__init__.py @@ -1,8 +1,7 @@ - -from ._coo_mv import * +# from ._coo_mv import * +# from ._bsr_mv import * from ._csr_mv import * from ._utils import * -from ._bsr_mv import * from ._bsr_mm import * from ._jax_prim import * diff --git a/brainpy/_src/math/sparse/_bsr_mm.py b/brainpy/_src/math/sparse/_bsr_mm.py index 
453ab387d..19800749d 100644 --- a/brainpy/_src/math/sparse/_bsr_mm.py +++ b/brainpy/_src/math/sparse/_bsr_mm.py @@ -1,22 +1,23 @@ # -*- coding: utf-8 -*- from functools import partial -from typing import Union, Tuple +from typing import Tuple import jax.lax -import numba import numpy as np from jax import numpy as jnp from jax.core import Primitive, ShapedArray from jax.interpreters import ad, xla from jax.lib import xla_client +from brainpy._src.dependency_check import import_brainpylib_gpu_ops, import_numba from brainpy._src.math.interoperability import as_jax -from brainpy._src.dependency_check import import_brainpylib_gpu_ops from brainpy._src.math.op_register import (compile_cpu_signature_with_numba, register_general_batching) from brainpy.errors import GPUOperatorNotFound +numba = import_numba(error_if_not_found=False) + __all__ = [ 'bcsrmm', ] @@ -264,52 +265,53 @@ def bcsrmm( raise ValueError -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _bcsrmm_cutlass_imp_transpose(outs, ins): # dense(m, k) @ bcsr(n, k) -> dense(n, m) - res_val = outs[0] - # B_data: (num_block, block_size_k, block_size_n) - A_data, B_data, B_indices, B_inptr, m, k, n, block_size_k, block_size_n = ins - block_size_k = block_size_k[()] - block_size_n = block_size_n[()] - n_block = n // block_size_n - - for ni in numba.prange(n_block): - C_tmp = np.zeros((block_size_n, m), dtype=A_data.dtype) - start, end = B_inptr[ni], B_inptr[ni + 1] - ns = ni * block_size_n - ne = ns + block_size_n - for i in range(start, end): - ki = B_indices[i, 0] - ks = ki * block_size_k - ke = ki + block_size_k - bi = B_indices[i, 1] - C_tmp += np.matmul(B_data[bi], A_data[:, ks: ke].T) - res_val[ns: ne] = C_tmp - return res_val - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _bcsrmm_cutlass_imp2(outs, ins): # dense(m, k) @ bcsr(k, n) -> dense(n, m) - res_val = outs[0] - # B_data: (num_block, block_size_n, block_size_k) - A_data, B_data, B_indices, B_inptr, m, k, n, block_size_k, block_size_n = ins - block_size_k = block_size_k[()] - block_size_n = block_size_n[()] - n_block = n // block_size_n - - for ni in numba.prange(n_block): - C_tmp = np.zeros((block_size_n, m), dtype=A_data.dtype) - start, end = B_inptr[ni], B_inptr[ni + 1] - ns = ni * block_size_n - ne = ns + block_size_n - for i in range(start, end): - ki = B_indices[i, 0] - ks = ki * block_size_k - ke = ki + block_size_k - bi = B_indices[i, 1] - C_tmp += np.matmul(B_data[bi], A_data[:, ks: ke].T) - res_val[ns: ne] = C_tmp - return res_val +if numba is not None: + @numba.njit(fastmath=True, parallel=True, nogil=True) + def _bcsrmm_cutlass_imp_transpose(outs, ins): # dense(m, k) @ bcsr(n, k) -> dense(n, m) + res_val = outs[0] + # B_data: (num_block, block_size_k, block_size_n) + A_data, B_data, B_indices, B_inptr, m, k, n, block_size_k, block_size_n = ins + block_size_k = block_size_k[()] + block_size_n = block_size_n[()] + n_block = n // block_size_n + + for ni in numba.prange(n_block): + C_tmp = np.zeros((block_size_n, m), dtype=A_data.dtype) + start, end = B_inptr[ni], B_inptr[ni + 1] + ns = ni * block_size_n + ne = ns + block_size_n + for i in range(start, end): + ki = B_indices[i, 0] + ks = ki * block_size_k + ke = ks + block_size_k + bi = B_indices[i, 1] + C_tmp += np.matmul(B_data[bi], A_data[:, ks: ke].T) + res_val[ns: ne] = C_tmp + return res_val + + + @numba.njit(fastmath=True, parallel=True, nogil=True) + def _bcsrmm_cutlass_imp2(outs, ins): # dense(m, k) @ bcsr(k, n) -> dense(n, m) + res_val = outs[0] + # B_data: (num_block, block_size_n,
block_size_k) + A_data, B_data, B_indices, B_inptr, m, k, n, block_size_k, block_size_n = ins + block_size_k = block_size_k[()] + block_size_n = block_size_n[()] + n_block = n // block_size_n + + for ni in numba.prange(n_block): + C_tmp = np.zeros((block_size_n, m), dtype=A_data.dtype) + start, end = B_inptr[ni], B_inptr[ni + 1] + ns = ni * block_size_n + ne = ns + block_size_n + for i in range(start, end): + ki = B_indices[i, 0] + ks = ki * block_size_k + ke = ks + block_size_k + bi = B_indices[i, 1] + C_tmp += np.matmul(B_data[bi], A_data[:, ks: ke].T) + res_val[ns: ne] = C_tmp + return res_val def _bcsrmm_cutlass_abstract( diff --git a/brainpy/_src/math/sparse/_csr_mv.py b/brainpy/_src/math/sparse/_csr_mv.py index 377597579..42969f435 100644 --- a/brainpy/_src/math/sparse/_csr_mv.py +++ b/brainpy/_src/math/sparse/_csr_mv.py @@ -1,28 +1,22 @@ # -*- coding: utf-8 -*- -from functools import partial from typing import Union, Tuple import jax -import numba -import numpy as np -from jax import core, dtypes from jax import numpy as jnp -from jax.interpreters import ad, mlir, xla -from jax.lib import xla_client -from jaxlib import gpu_sparse +from jax.experimental.sparse import csr +from jax.interpreters import ad -from brainpy._src.dependency_check import import_brainpylib_gpu_ops, import_taichi +import brainpy.math as bm +from brainpy._src.dependency_check import import_taichi from brainpy._src.math.interoperability import as_jax from brainpy._src.math.ndarray import Array -from brainpy._src.math.op_register import (compile_cpu_signature_with_numba, - register_general_batching, - XLACustomOp) +from brainpy._src.math.op_register import (register_general_batching, XLACustomOp) from brainpy._src.math.sparse._utils import csr_to_coo -from brainpy.errors import GPUOperatorNotFound +from brainpy.errors import PackageMissingError -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ 'csrmv', @@ -37,7 +31,6 @@ def csrmv( *, shape: Tuple[int, int], transpose: bool = False, - method: str = None, ): """Product of CSR sparse matrix and a dense vector using cuSPARSE algorithm. @@ -70,495 +63,6 @@ def csrmv( - ``vector``: - ``adaptive``: - Returns - ------- - y : ndarry - The array of shape ``(shape[1] if transpose else shape[0],)`` representing - the matrix vector product. - """ - if method is None: - return csrmv_taichi(data, indices, indptr, vector, shape=shape, transpose=transpose) - else: - return csrmv_brainpylib(data, indices, indptr, vector, shape=shape, transpose=transpose, method=method) - - -### BRAINPYLIB ### - -def csrmv_brainpylib( - data: Union[float, jnp.ndarray, Array], - indices: Union[jnp.ndarray, Array], - indptr: Union[jnp.ndarray, Array], - vector: Union[jnp.ndarray, Array], - *, - shape: Tuple[int, int], - transpose: bool = False, - method: str = 'cusparse', -): - """Product of CSR sparse matrix and a dense vector using cuSPARSE algorithm. - - This function supports JAX transformations, including `jit()`, `grad()`, - `vmap()` and `pmap()`. - - Parameters - ---------- - data: ndarray, float - An array of shape ``(nse,)``. - indices: ndarray - An array of shape ``(nse,)``. - indptr: ndarray - An array of shape ``(shape[0] + 1,)`` and dtype ``indices.dtype``. - vector: ndarray - An array of shape ``(shape[0] if transpose else shape[1],)`` - and dtype ``data.dtype``. - shape: tuple of int - A length-2 tuple representing the matrix shape. - transpose: bool - A boolean specifying whether to transpose the sparse matrix - before computing.
- method: str - The method used to compute Matrix-Vector Multiplication. The candidate methods are: - - - ``cusparse``: using cuSPARSE library. - - ``scalar``: - - ``vector``: - - ``adaptive``: - - Returns - ------- - y : ndarry - The array of shape ``(shape[1] if transpose else shape[0],)`` representing - the matrix vector product. - """ - - data = jnp.atleast_1d(as_jax(data)) - indices = as_jax(indices) - indptr = as_jax(indptr) - vector = as_jax(vector) - - if vector.dtype == jnp.bool_: - vector = as_jax(vector, dtype=data.dtype) - - if method == 'cusparse': - if jax.default_backend() == 'gpu': - if data.shape[0] == 1: - data = jnp.ones(indices.shape, dtype=data.dtype) * data - if indices.dtype in [jnp.uint32, jnp.uint64]: - indices = jnp.asarray(indices, dtype=dtypes.canonicalize_dtype(jnp.int64)) - if indptr.dtype in [jnp.uint32, jnp.uint64]: - indptr = jnp.asarray(indptr, dtype=dtypes.canonicalize_dtype(jnp.int64)) - return _csrmv_cusparse_p.bind(data, - indices, - indptr, - vector, - shape=shape, - transpose=transpose) - - elif method == 'adaptive': - return _csrmv_adaptive_p.bind(data, indices, indptr, vector, shape=shape, transpose=transpose) - - elif method == 'scalar': - return _csrmv_scalar_p.bind(data, indices, indptr, vector, shape=shape, transpose=transpose) - - elif method == 'vector': - return _csrmv_vector_p.bind(data, indices, indptr, vector, shape=shape, transpose=transpose) - - else: - raise ValueError(f'Only support methods: cusparse, scalar, vector, and adaptive. But we got {method}.') - - -def _csrmv_abstract(data, indices, indptr, vector, *, shape, transpose): - if data.dtype not in [jnp.float32, jnp.float64]: - raise TypeError(f'Only support float32 and float64. But we got {data.dtype}.') - if data.dtype != vector.dtype: - raise TypeError('The types of data and vector should be the same. ' - f'But we got {data.dtype} != {vector.dtype}.') - assert data.ndim == indices.ndim == indptr.ndim == vector.ndim == 1 - if not jnp.issubdtype(indices.dtype, jnp.integer): - raise ValueError('indices should be a 1D vector with integer type.') - if not jnp.issubdtype(indptr.dtype, jnp.integer): - raise ValueError('indptr should be a 1D vector with integer type.') - out_shape = shape[1] if transpose else shape[0] - return core.ShapedArray((out_shape,), data.dtype) - - -@numba.njit(fastmath=True) -def _csr_matvec_transpose_numba_imp(outs, ins): - res_val = outs - res_val.fill(0) - values, col_indices, row_ptr, vector, shape, _ = ins - # (csr mat).T @ vec - - if values.shape[0] == 1: - values = values[0] - for row_i in range(shape[0]): - v = vector[row_i] - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - res_val[col_indices[j]] += values * v - else: - for row_i in range(shape[0]): - v = vector[row_i] - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - res_val[col_indices[j]] += v * values[j] - - -@numba.njit(fastmath=True, parallel=True, nogil=True) -def _csr_matvec_numba_imp(outs, ins): - res_val = outs - res_val.fill(0) - values, col_indices, row_ptr, vector, shape, _ = ins - # csr mat @ vec - if values.shape[0] == 1: - values = values[0] - for row_i in numba.prange(shape[0]): - r = 0. - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - r += values * vector[col_indices[j]] - res_val[row_i] = r - else: - for row_i in numba.prange(shape[0]): - r = 0. 
- for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - r += values[j] * vector[col_indices[j]] - res_val[row_i] = r - - -def _csrmv_cpu_translation(c, data, indices, indptr, vector, *, shape, transpose): - inputs = (data, indices, indptr, vector) - description = dict(shape=shape, transpose=transpose) - if transpose: - target_name, inputs, input_layouts, output_layouts = compile_cpu_signature_with_numba( - c, - _csr_matvec_transpose_numba_imp, - _csrmv_abstract, - multiple_results=False, - inputs=inputs, - description=description - ) - else: - target_name, inputs, input_layouts, output_layouts = compile_cpu_signature_with_numba( - c, - _csr_matvec_numba_imp, - _csrmv_abstract, - multiple_results=False, - inputs=inputs, - description=description - ) - return xla_client.ops.CustomCallWithLayout( - c, - target_name, - operands=inputs, - operand_shapes_with_layout=input_layouts, - shape_with_layout=output_layouts, - ) - - -def _csrmv_cusparse_gpu_lowering(ctx, data, indices, indptr, vector, *, shape, transpose): - data_aval, indices_aval, _, v_aval = ctx.avals_in - dtype = data_aval.dtype - if dtype not in [np.float32, np.float64, np.complex64, np.complex128]: - raise TypeError(f"cusparse_csr_matvec cusparse/hipsparse lowering not available for dtype={dtype}. " - "Falling back to default implementation.") - return [gpu_sparse.cuda_csr_matvec(data, indices, indptr, vector, - shape=shape, - transpose=transpose, - data_dtype=dtype, - x_dtype=v_aval.dtype, - index_dtype=indices_aval.dtype)] - - -def _csrmv_jvp_mat(csr_prim, data_dot, data, indices, indptr, v, *, shape, transpose): - return csr_prim.bind(data_dot, indices, indptr, v, shape=shape, transpose=transpose) - - -def _csrmv_jvp_vec(prim, v_dot, data, indices, indptr, v, *, shape, transpose): - return prim.bind(data, indices, indptr, v_dot, shape=shape, transpose=transpose) - - -def _csrmv_cusparse_transpose(ct, data, indices, indptr, vector, *, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - - if ad.is_undefined_primal(vector): - if type(ct) is ad.Zero: - return data, indices, indptr, ad.Zero(vector) - else: - ct_vector = _csrmv_cusparse_p.bind(data, indices, indptr, ct, shape=shape, transpose=not transpose) - return data, indices, indptr, ct_vector - - else: - if type(ct) is ad.Zero: - ct_data = ad.Zero(data) - else: - if data.aval.shape[0] == 1: # scalar - ct_data = _csrmv_cusparse_p.bind(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose) - ct_data = jnp.inner(ct, ct_data) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_data = vector[row] * ct[col] if transpose else vector[col] * ct[row] - return ct_data, indices, indptr, vector - - -_csrmv_cusparse_p = core.Primitive('cusparse_csr_matvec') -_csrmv_cusparse_p.def_abstract_eval(_csrmv_abstract) -_csrmv_cusparse_p.def_impl(partial(xla.apply_primitive, _csrmv_cusparse_p)) -# xla.backend_specific_translations['cpu'][_csrmv_cusparse_p] = _csrmv_cpu_translation -ad.defjvp(_csrmv_cusparse_p, - partial(_csrmv_jvp_mat, _csrmv_cusparse_p), - None, - None, - partial(_csrmv_jvp_vec, _csrmv_cusparse_p)) -ad.primitive_transposes[_csrmv_cusparse_p] = _csrmv_cusparse_transpose -register_general_batching(_csrmv_cusparse_p) -mlir.register_lowering(_csrmv_cusparse_p, _csrmv_cusparse_gpu_lowering, platform='cuda') - - -def _csr_matvec_scalar_gpu_translation(c, data, indices, indptr, vector, *, shape, transpose): - gpu_ops = 
import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(_csrmv_scalar_p.name) - if transpose: - raise NotImplementedError - - data_shape = c.get_shape(data) - if data_shape.element_type() == np.float32: - ftype = b'_float' - elif data_shape.element_type() == np.float64: - ftype = b'_double' - else: - raise ValueError - indices_shape = c.get_shape(indices) - if indices_shape.element_type() == np.int32: - itype = b'_int' - elif indices_shape.element_type() == np.int64: - itype = b'_long' - else: - raise ValueError - data_name = b'homo' if data_shape.dimensions() == (1,) else b'heter' - opaque = gpu_ops.build_double_size_descriptor(shape[0], shape[1]) - return xla_client.ops.CustomCallWithLayout( - c, - b'csrmv_' + data_name + b'_scalar' + ftype + itype, - operands=(data, indices, indptr, vector), - operand_shapes_with_layout=(c.get_shape(data), - c.get_shape(indices), - c.get_shape(indptr), - c.get_shape(vector)), - shape_with_layout=xla_client.Shape.array_shape(data_shape.element_type(), (shape[0],), (0,)), - opaque=opaque, - ) - - -def _csrmv_scalar_transpose(ct, data, indices, indptr, vector, *, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - - if ad.is_undefined_primal(vector): - ct_vector = _csrmv_scalar_p.bind(data, indices, indptr, ct, shape=shape, transpose=not transpose) - return data, indices, indptr, (ad.Zero(vector) if type(ct) is ad.Zero else ct_vector) - - else: - if type(ct) is ad.Zero: - ct_data = ad.Zero(data) - else: - if data.aval.shape[0] == 1: # scalar - ct_data = _csrmv_scalar_p.bind(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose) - ct_data = jnp.inner(ct, ct_data) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_data = vector[row] * ct[col] if transpose else vector[col] * ct[row] - return ct_data, indices, indptr, vector - - -_csrmv_scalar_p = core.Primitive('csr_matvec_scalar') -_csrmv_scalar_p.def_abstract_eval(_csrmv_abstract) -_csrmv_scalar_p.def_impl(partial(xla.apply_primitive, _csrmv_scalar_p)) -# xla.backend_specific_translations['cpu'][_csrmv_scalar_p] = _csrmv_cpu_translation -# xla.backend_specific_translations['gpu'][_csrmv_scalar_p] = _csr_matvec_scalar_gpu_translation -ad.defjvp(_csrmv_scalar_p, - partial(_csrmv_jvp_mat, _csrmv_scalar_p), - None, - None, - partial(_csrmv_jvp_vec, _csrmv_scalar_p), ) -ad.primitive_transposes[_csrmv_scalar_p] = _csrmv_scalar_transpose -register_general_batching(_csrmv_scalar_p) - - -def _csr_matvec_vector_gpu_translation(c, data, indices, indptr, vector, *, shape, transpose): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(_csrmv_vector_p.name) - if transpose: - raise NotImplementedError - - data_shape = c.get_shape(data) - if data_shape.element_type() == np.float32: - ftype = b'_float' - elif data_shape.element_type() == np.float64: - ftype = b'_double' - else: - raise ValueError - indices_shape = c.get_shape(indices) - if indices_shape.element_type() == np.int32: - itype = b'_int' - elif indices_shape.element_type() == np.int64: - itype = b'_long' - else: - raise ValueError - data_name = b'homo' if data_shape.dimensions() == (1,) else b'heter' - opaque = gpu_ops.build_double_size_descriptor(shape[0], shape[1]) - return xla_client.ops.CustomCallWithLayout( - c, - b'csrmv_' + data_name + b'_vector' + ftype + itype, - operands=(data, indices, indptr, vector), - 
operand_shapes_with_layout=(c.get_shape(data), - c.get_shape(indices), - c.get_shape(indptr), - c.get_shape(vector)), - shape_with_layout=xla_client.Shape.array_shape(data_shape.element_type(), (shape[0],), (0,)), - opaque=opaque, - ) - - -def _csrmv_vector_transpose(ct, data, indices, indptr, vector, *, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - - if ad.is_undefined_primal(vector): - ct_vector = _csrmv_vector_p.bind(data, indices, indptr, ct, shape=shape, transpose=not transpose) - return data, indices, indptr, (ad.Zero(vector) if type(ct) is ad.Zero else ct_vector) - - else: - if type(ct) is ad.Zero: - ct_data = ad.Zero(data) - else: - if data.aval.shape[0] == 1: # scalar - ct_data = _csrmv_vector_p.bind(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose) - ct_data = jnp.inner(ct, ct_data) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_data = vector[row] * ct[col] if transpose else vector[col] * ct[row] - return ct_data, indices, indptr, vector - - -_csrmv_vector_p = core.Primitive('csr_matvec_vector') -_csrmv_vector_p.def_abstract_eval(_csrmv_abstract) -_csrmv_vector_p.def_impl(partial(xla.apply_primitive, _csrmv_vector_p)) -# xla.backend_specific_translations['cpu'][_csrmv_vector_p] = _csrmv_cpu_translation -# xla.backend_specific_translations['gpu'][_csrmv_vector_p] = _csr_matvec_vector_gpu_translation -ad.defjvp(_csrmv_vector_p, - partial(_csrmv_jvp_mat, _csrmv_vector_p), - None, - None, - partial(_csrmv_jvp_vec, _csrmv_vector_p), ) -ad.primitive_transposes[_csrmv_vector_p] = _csrmv_vector_transpose -register_general_batching(_csrmv_vector_p) - - -def _csr_matvec_adaptive_gpu_translation(c, data, indices, indptr, row_blocks, vector, *, shape, transpose): - gpu_ops = import_brainpylib_gpu_ops() - if gpu_ops is None: - raise GPUOperatorNotFound(_csrmv_adaptive_p.name) - if transpose: - raise NotImplementedError - - data_shape = c.get_shape(data) - if data_shape.element_type() == np.float32: - ftype = b'_float' - elif data_shape.element_type() == np.float64: - ftype = b'_double' - else: - raise ValueError - indices_shape = c.get_shape(indices) - if indices_shape.element_type() == np.int32: - itype = b'_int' - elif indices_shape.element_type() == np.int64: - itype = b'_long' - else: - raise ValueError - data_name = b'homo' if data_shape.dimensions() == (1,) else b'heter' - opaque = gpu_ops.build_double_size_descriptor(shape[0], shape[1]) - return xla_client.ops.CustomCallWithLayout( - c, - b'csrmv_' + data_name + b'_vector' + ftype + itype, - operands=(data, indices, indptr, row_blocks, vector), - operand_shapes_with_layout=(c.get_shape(data), - c.get_shape(indices), - c.get_shape(indptr), - c.get_shape(row_blocks), - c.get_shape(vector)), - shape_with_layout=xla_client.Shape.array_shape(data_shape.element_type(), (shape[0],), (0,)), - opaque=opaque, - ) - - -def _csrmv_adaptive_transpose(ct, data, indices, indptr, vector, *, shape, transpose): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - - if ad.is_undefined_primal(vector): - ct_vector = _csrmv_adaptive_p.bind(data, indices, indptr, ct, shape=shape, transpose=not transpose) - return data, indices, indptr, (ad.Zero(vector) if type(ct) is ad.Zero else ct_vector) - - else: - if type(ct) is ad.Zero: - ct_data = ad.Zero(data) - else: - if data.aval.shape[0] == 1: # scalar - 
ct_data = _csrmv_adaptive_p.bind(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose) - ct_data = jnp.inner(ct, ct_data) - else: # heterogeneous values - row, col = csr_to_coo(indices, indptr) - ct_data = vector[row] * ct[col] if transpose else vector[col] * ct[row] - return ct_data, indices, indptr, vector - - -_csrmv_adaptive_p = core.Primitive('csr_matvec_adaptive') -_csrmv_adaptive_p.def_abstract_eval(_csrmv_abstract) -_csrmv_adaptive_p.def_impl(partial(xla.apply_primitive, _csrmv_adaptive_p)) -# xla.backend_specific_translations['cpu'][_csrmv_adaptive_p] = _csrmv_cpu_translation -# xla.backend_specific_translations['gpu'][_csrmv_adaptive_p] = _csr_matvec_adaptive_gpu_translation -ad.defjvp(_csrmv_adaptive_p, - partial(_csrmv_jvp_mat, _csrmv_adaptive_p), - None, - None, - partial(_csrmv_jvp_vec, _csrmv_adaptive_p), ) -ad.primitive_transposes[_csrmv_adaptive_p] = _csrmv_adaptive_transpose -register_general_batching(_csrmv_adaptive_p) - - -### TAICHI ### - -def csrmv_taichi( - data: Union[float, jnp.ndarray, Array], - indices: Union[jnp.ndarray, Array], - indptr: Union[jnp.ndarray, Array], - vector: Union[jnp.ndarray, Array], - *, - shape: Tuple[int, int], - transpose: bool = False, -) -> jax.Array: - """Product of CSR sparse matrix and a dense vector using cuSPARSE algorithm. - - This function supports JAX transformations, including `jit()`, `grad()`, - `vmap()` and `pmap()`. - - Parameters - ---------- - data: ndarray, float - An array of shape ``(nse,)``. - indices: ndarray - An array of shape ``(nse,)``. - indptr: ndarray - An array of shape ``(shape[0] + 1,)`` and dtype ``indices.dtype``. - vector: ndarray - An array of shape ``(shape[0] if transpose else shape[1],)`` - and dtype ``data.dtype``. - shape: tuple of int - A length-2 tuple representing the matrix shape. - transpose: bool - A boolean specifying whether to transpose the sparse matrix - before computing. - Returns ------- y : ndarry @@ -593,171 +97,6 @@ def csrmv_taichi( return raw_csrmv_taichi(data, indices, indptr, vector, shape=shape, transpose=transpose)[0] -# ------------- -# CPU operators -# ------------- - - -@ti.kernel -def _sparse_csr_matvec_transpose_homo_cpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - ti.loop_config(serialize=True) - for row_i in range(row_ptr.shape[0] - 1): - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - out[col_indices[j]] += value * vector[row_i] - - -@ti.kernel -def _sparse_csr_matvec_transpose_heter_cpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - ti.loop_config(serialize=True) - for row_i in range(row_ptr.shape[0] - 1): - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - out[col_indices[j]] += vector[row_i] * values[j] - - -@ti.kernel -def _sparse_csr_matvec_homo_cpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - # ti.loop_config(serialize=True) - for row_i in range(row_ptr.shape[0] - 1): - r = 0. 
- for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - r += vector[col_indices[j]] - out[row_i] = r * value - - -@ti.kernel -def _sparse_csr_matvec_heter_cpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - # ti.loop_config(serialize=True) - for row_i in range(row_ptr.shape[0] - 1): - r = 0. - for j in range(row_ptr[row_i], row_ptr[row_i + 1]): - r += values[j] * vector[col_indices[j]] - out[row_i] = r - - -# ------------- -# GPU operators -# ------------- - - -@ti.kernel -def _sparse_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((row_ptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - j = row_ptr[row_i] + index - end_index = row_ptr[row_i + 1] - while j < end_index: - out[col_indices[j]] += value * vector[row_i] - j += 32 - - -@ti.kernel -def _sparse_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - value = values[0] - for i in range((row_ptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. - j = row_ptr[row_i] + index - end_index = row_ptr[row_i + 1] - while j < end_index: - r += vector[col_indices[j]] - j += 32 - out[row_i] += value * r - - -@ti.kernel -def _sparse_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((row_ptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - j = row_ptr[row_i] + index - end_index = row_ptr[row_i + 1] - while j < end_index: - out[col_indices[j]] += values[j] * vector[row_i] - j += 32 - - -@ti.kernel -def _sparse_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1), - col_indices: ti.types.ndarray(ndim=1), - row_ptr: ti.types.ndarray(ndim=1), - vector: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range((row_ptr.shape[0] - 1) * 32): - row_i = i >> 5 - index = i & 31 - r = 0. 
- j = row_ptr[row_i] + index - end_index = row_ptr[row_i + 1] - while j < end_index: - r += values[j] * vector[col_indices[j]] - j += 32 - out[row_i] += r # TODO: warp-level primitive - - -def _sparse_csr_matvec_jvp_values(val_dot, values, col_indices, row_ptr, vector, *, outs, transpose, shape): - return raw_csrmv_taichi(val_dot, col_indices, row_ptr, vector, shape=shape, transpose=transpose) - - -def _sparse_csr_matvec_jvp_vector(vec_dot, values, col_indices, row_ptr, vector, *, outs, transpose, shape): - return raw_csrmv_taichi(values, col_indices, row_ptr, vec_dot, shape=shape, transpose=transpose) - - -def _sparse_csr_matvec_transpose( - ct, data, indices, indptr, vector, *, outs, transpose, shape, -): - if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): - raise ValueError("Cannot transpose with respect to sparse indices.") - if ad.is_undefined_primal(vector): - ct_vector = raw_csrmv_taichi(data, indices, indptr, ct[0], shape=shape, transpose=not transpose)[0] - return data, indices, indptr, (ad.Zero(vector) if type(ct[0]) is ad.Zero else ct_vector) - - else: - if type(ct[0]) is ad.Zero: - ct_data = ad.Zero(data) - else: - if data.aval.shape[0] == 1: # scalar - ct_data = raw_csrmv_taichi(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose)[0] - ct_data = jnp.inner(ct[0], ct_data) - else: - row, col = csr_to_coo(indices, indptr) - ct_data = vector[row] * ct[0][col] if transpose else vector[col] * ct[0][row] - - return ct_data, indices, indptr, vector - - def raw_csrmv_taichi( data: Union[float, jnp.ndarray, Array], indices: Union[jnp.ndarray, Array], @@ -767,17 +106,22 @@ def raw_csrmv_taichi( shape: Tuple[int, int], transpose: bool = False, ): + if ti is None: + raise PackageMissingError.by_purpose('taichi', purpose='customized operators') out_shape = shape[1] if transpose else shape[0] - if transpose: - if data.shape[0] == 1: - prim = _csr_matvec_transpose_homo_p + if data.shape[0] != 1: + if bm.get_platform() == 'gpu': + return [_csr_matvec_cusparse_p.bind(data, indices, indptr, vector, shape=shape, transpose=transpose)] else: - prim = _csr_matvec_transpose_heter_p + if transpose: + prim = _csr_matvec_transpose_heter_p + else: + prim = _csr_matvec_heter_p else: - if data.shape[0] == 1: - prim = _csr_matvec_homo_p + if transpose: + prim = _csr_matvec_transpose_homo_p else: - prim = _csr_matvec_heter_p + prim = _csr_matvec_homo_p return prim(data, indices, @@ -788,25 +132,193 @@ def raw_csrmv_taichi( shape=shape) -def _define_op(cpu_kernel, gpu_kernel): - prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) - prim.defjvp(_sparse_csr_matvec_jvp_values, None, None, _sparse_csr_matvec_jvp_vector) - prim.def_transpose_rule(_sparse_csr_matvec_transpose) - return prim +if ti is not None: + + # ------------- + # CPU operators + # ------------- + @ti.kernel + def _sparse_csr_matvec_transpose_homo_cpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + ti.loop_config(serialize=True) + for row_i in range(row_ptr.shape[0] - 1): + for j in range(row_ptr[row_i], row_ptr[row_i + 1]): + out[col_indices[j]] += value * vector[row_i] + + + @ti.kernel + def _sparse_csr_matvec_transpose_heter_cpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + 
ti.loop_config(serialize=True) + for row_i in range(row_ptr.shape[0] - 1): + for j in range(row_ptr[row_i], row_ptr[row_i + 1]): + out[col_indices[j]] += vector[row_i] * values[j] + + + @ti.kernel + def _sparse_csr_matvec_homo_cpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + # ti.loop_config(serialize=True) + for row_i in range(row_ptr.shape[0] - 1): + r = 0. + for j in range(row_ptr[row_i], row_ptr[row_i + 1]): + r += vector[col_indices[j]] + out[row_i] = r * value + + + @ti.kernel + def _sparse_csr_matvec_heter_cpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + # ti.loop_config(serialize=True) + for row_i in range(row_ptr.shape[0] - 1): + r = 0. + for j in range(row_ptr[row_i], row_ptr[row_i + 1]): + r += values[j] * vector[col_indices[j]] + out[row_i] = r + + + # ------------- + # GPU operators + # ------------- + + @ti.kernel + def _sparse_csr_matvec_transpose_homo_gpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((row_ptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + j = row_ptr[row_i] + index + end_index = row_ptr[row_i + 1] + while j < end_index: + out[col_indices[j]] += value * vector[row_i] + j += 32 + + + @ti.kernel + def _sparse_csr_matvec_homo_gpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + value = values[0] + for i in range((row_ptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. + j = row_ptr[row_i] + index + end_index = row_ptr[row_i + 1] + while j < end_index: + r += vector[col_indices[j]] + j += 32 + out[row_i] += value * r + + + @ti.kernel + def _sparse_csr_matvec_transpose_heter_gpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((row_ptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + j = row_ptr[row_i] + index + end_index = row_ptr[row_i + 1] + while j < end_index: + out[col_indices[j]] += values[j] * vector[row_i] + j += 32 + + + @ti.kernel + def _sparse_csr_matvec_heter_gpu(values: ti.types.ndarray(ndim=1), + col_indices: ti.types.ndarray(ndim=1), + row_ptr: ti.types.ndarray(ndim=1), + vector: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range((row_ptr.shape[0] - 1) * 32): + row_i = i >> 5 + index = i & 31 + r = 0. 
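+      # lane ``index`` is one of 32 threads assigned to this row; it walks the
+      # row's nonzeros with a stride of 32 (one warp per row)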
+ j = row_ptr[row_i] + index + end_index = row_ptr[row_i + 1] + while j < end_index: + r += values[j] * vector[col_indices[j]] + j += 32 + out[row_i] += r # TODO: warp-level primitive + + + def _sparse_csr_matvec_jvp_values(val_dot, values, col_indices, row_ptr, vector, *, outs, transpose, shape): + return raw_csrmv_taichi(val_dot, col_indices, row_ptr, vector, shape=shape, transpose=transpose) + + + def _sparse_csr_matvec_jvp_vector(vec_dot, values, col_indices, row_ptr, vector, *, outs, transpose, shape): + return raw_csrmv_taichi(values, col_indices, row_ptr, vec_dot, shape=shape, transpose=transpose) + + + def _sparse_csr_matvec_transpose( + ct, data, indices, indptr, vector, *, outs, transpose, shape, + ): + if ad.is_undefined_primal(indices) or ad.is_undefined_primal(indptr): + raise ValueError("Cannot transpose with respect to sparse indices.") + if ad.is_undefined_primal(vector): + ct_vector = raw_csrmv_taichi(data, indices, indptr, ct[0], shape=shape, transpose=not transpose)[0] + return data, indices, indptr, (ad.Zero(vector) if type(ct[0]) is ad.Zero else ct_vector) + + else: + if type(ct[0]) is ad.Zero: + ct_data = ad.Zero(data) + else: + if data.aval.shape[0] == 1: # scalar + ct_data = raw_csrmv_taichi(jnp.ones(1), indices, indptr, vector, shape=shape, transpose=transpose)[0] + ct_data = jnp.inner(ct[0], ct_data) + else: + row, col = csr_to_coo(indices, indptr) + ct_data = vector[row] * ct[0][col] if transpose else vector[col] * ct[0][row] + + return ct_data, indices, indptr, vector + + + def _define_op(cpu_kernel, gpu_kernel): + prim = XLACustomOp(cpu_kernel=cpu_kernel, gpu_kernel=gpu_kernel) + prim.defjvp(_sparse_csr_matvec_jvp_values, None, None, _sparse_csr_matvec_jvp_vector) + prim.def_transpose_rule(_sparse_csr_matvec_transpose) + return prim + + # transpose homo + _csr_matvec_transpose_homo_p = _define_op(cpu_kernel=_sparse_csr_matvec_transpose_homo_cpu, + gpu_kernel=_sparse_csr_matvec_transpose_homo_gpu) -# transpose homo -_csr_matvec_transpose_homo_p = _define_op(cpu_kernel=_sparse_csr_matvec_transpose_homo_cpu, - gpu_kernel=_sparse_csr_matvec_transpose_homo_gpu) + # no transpose homo + _csr_matvec_homo_p = _define_op(cpu_kernel=_sparse_csr_matvec_homo_cpu, + gpu_kernel=_sparse_csr_matvec_homo_gpu) -# no transpose homo -_csr_matvec_homo_p = _define_op(cpu_kernel=_sparse_csr_matvec_homo_cpu, - gpu_kernel=_sparse_csr_matvec_homo_gpu) + # transpose heter + _csr_matvec_transpose_heter_p = _define_op(cpu_kernel=_sparse_csr_matvec_transpose_heter_cpu, + gpu_kernel=_sparse_csr_matvec_transpose_heter_gpu) -# transpose heter -_csr_matvec_transpose_heter_p = _define_op(cpu_kernel=_sparse_csr_matvec_transpose_heter_cpu, - gpu_kernel=_sparse_csr_matvec_transpose_heter_gpu) + # no transpose heter + _csr_matvec_heter_p = _define_op(cpu_kernel=_sparse_csr_matvec_heter_cpu, + gpu_kernel=_sparse_csr_matvec_heter_gpu) -# no transpose heter -_csr_matvec_heter_p = _define_op(cpu_kernel=_sparse_csr_matvec_heter_cpu, - gpu_kernel=_sparse_csr_matvec_heter_gpu) + # heter cusparse + _csr_matvec_cusparse_p = csr.csr_matvec_p + register_general_batching(_csr_matvec_cusparse_p) diff --git a/brainpy/_src/math/sparse/_utils.py b/brainpy/_src/math/sparse/_utils.py index a1dc9190e..f5b74e5eb 100644 --- a/brainpy/_src/math/sparse/_utils.py +++ b/brainpy/_src/math/sparse/_utils.py @@ -3,9 +3,8 @@ import warnings from typing import Tuple -import jax import numpy as np -from jax import core, numpy as jnp, dtypes +from jax import core, numpy as jnp from jax.interpreters import mlir, ad from jaxlib import 
gpu_sparse diff --git a/brainpy/_src/math/sparse/tests/test_csrmv.py b/brainpy/_src/math/sparse/tests/test_csrmv.py index 2c75f0901..ec448e658 100644 --- a/brainpy/_src/math/sparse/tests/test_csrmv.py +++ b/brainpy/_src/math/sparse/tests/test_csrmv.py @@ -5,10 +5,14 @@ import jax from absl.testing import parameterized +import pytest import brainpy as bp import brainpy.math as bm -# bm.set_platform('gpu') +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) seed = 1234 diff --git a/brainpy/_src/math/sparse/tests/test_csrmv_old.py b/brainpy/_src/math/sparse/tests/test_csrmv_old.py deleted file mode 100644 index b73217496..000000000 --- a/brainpy/_src/math/sparse/tests/test_csrmv_old.py +++ /dev/null @@ -1,352 +0,0 @@ -# -*- coding: utf-8 -*- - -from functools import partial - -import jax -import pytest -from absl.testing import parameterized -import platform -import brainpy as bp -import brainpy.math as bm - -pytest.skip('Old implementation.', allow_module_level=True) - -is_manual_test = False -# if platform.system() == 'Windows' and not is_manual_test: -# pytest.skip('brainpy.math package may need manual tests.', allow_module_level=True) - -cusparse_csr_matvec = partial(bm.sparse.csrmv, method='cusparse') -scalar_csr_matvec = partial(bm.sparse.csrmv, method='scalar') -vector_csr_matvec = partial(bm.sparse.csrmv, method='vector') - - -class Test_cusparse_csrmv(parameterized.TestCase): - def __init__(self, *args, platform='cpu', **kwargs): - super(Test_cusparse_csrmv, self).__init__(*args, **kwargs) - - print() - bm.set_platform(platform) - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)], - homo_data=[-1., 0., 1.] - ) - def test_homo(self, transpose, shape, homo_data): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - - heter_data = bm.ones(indices.shape).value * homo_data - - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - r1 = cusparse_csr_matvec(homo_data, indices, indptr, vector, shape=shape, transpose=transpose) - r2 = cusparse_csr_matvec(heter_data, indices, indptr, vector, shape=shape, transpose=transpose) - self.assertTrue(bm.allclose(r1, r2)) - - dense = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape) - r3 = (vector @ dense) if transpose else (dense @ vector) - self.assertTrue(bm.allclose(r1, r3)) - - bm.clear_buffer_memory() - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)], - v=[-1., 0., 1.] 
- ) - def test_homo_vmap(self, transpose, shape, v): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - - heter_data = bm.ones((10, indices.shape[0])).value * v - homo_data = bm.ones(10).value * v - dense_data = jax.vmap(lambda a: bm.sparse.csr_to_dense(a, indices, indptr, shape=shape))(heter_data) - - f1 = partial(cusparse_csr_matvec, indices=indices, indptr=indptr, vector=vector, - shape=shape, transpose=transpose) - f2 = lambda a: (a.T @ vector) if transpose else (a @ vector) - - r1 = jax.vmap(f1)(homo_data) - r2 = jax.vmap(f1)(heter_data) - self.assertTrue(bm.allclose(r1, r2)) - - r3 = jax.vmap(f2)(dense_data) - self.assertTrue(bm.allclose(r1, r3)) - - bm.clear_buffer_memory() - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)], - homo_data=[-1., 0., 1.] - ) - def test_homo_grad(self, transpose, shape, homo_data): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - dense = bm.sparse.csr_to_dense(bm.ones(indices.shape).value, - indices, - indptr, - shape=shape) - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - - csr_f1 = jax.grad(lambda a: cusparse_csr_matvec(a, indices, indptr, vector, - shape=shape, transpose=transpose).sum(), - argnums=0) - dense_f1 = jax.grad(lambda a: ((vector @ (dense * a)).sum() - if transpose else - ((dense * a) @ vector).sum()), - argnums=0) - - r1 = csr_f1(homo_data) - r2 = dense_f1(homo_data) - self.assertTrue(bm.allclose(r1, r2)) - - csr_f2 = jax.grad(lambda v: cusparse_csr_matvec(homo_data, indices, indptr, v, - shape=shape, transpose=transpose).sum()) - dense_data = dense * homo_data - dense_f2 = jax.grad(lambda v: ((v @ dense_data).sum() if transpose else (dense_data @ v).sum())) - - r3 = csr_f2(vector) - r4 = dense_f2(vector) - self.assertTrue(bm.allclose(r3, r4)) - - csr_f3 = jax.grad(lambda a, v: cusparse_csr_matvec(a, indices, indptr, v, - shape=shape, transpose=transpose).sum(), - argnums=(0, 1)) - dense_f3 = jax.grad(lambda a, v: ((v @ (dense * a)).sum() - if transpose else - ((dense * a) @ v).sum()), - argnums=(0, 1)) - - r5 = csr_f3(homo_data, vector) - r6 = dense_f3(homo_data, vector) - self.assertTrue(bm.allclose(r5[0], r6[0])) - self.assertTrue(bm.allclose(r5[1], r6[1])) - - bm.clear_buffer_memory() - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)], - ) - def test_heter(self, transpose, shape): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - - heter_data = rng.random(indices.shape) - heter_data = bm.as_jax(heter_data) - - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - r1 = cusparse_csr_matvec(heter_data, indices, indptr, vector, - shape=shape, transpose=transpose) - dense = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape) - r2 = (vector @ dense) if transpose else (dense @ vector) - self.assertTrue(bm.allclose(r1, r2)) - - bm.clear_buffer_memory() - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)] - ) - 
def test_heter_vmap(self, transpose, shape): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - - heter_data = rng.random((10, indices.shape[0])) - heter_data = bm.as_jax(heter_data) - dense_data = jax.vmap(lambda a: bm.sparse.csr_to_dense(a, indices, indptr, - shape=shape))(heter_data) - - f1 = partial(cusparse_csr_matvec, indices=indices, indptr=indptr, vector=vector, - shape=shape, transpose=transpose) - f2 = lambda a: (a.T @ vector) if transpose else (a @ vector) - - r1 = jax.vmap(f1)(heter_data) - r2 = jax.vmap(f2)(dense_data) - self.assertTrue(bm.allclose(r1, r2)) - - bm.clear_buffer_memory() - - @parameterized.product( - transpose=[True, False], - shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)] - ) - def test_heter_grad(self, transpose, shape): - rng = bm.random.RandomState() - conn = bp.conn.FixedProb(0.1) - - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - heter_data = rng.random(indices.shape) - heter_data = bm.as_jax(heter_data) - dense_data = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape) - vector = rng.random(shape[0] if transpose else shape[1]) - vector = bm.as_jax(vector) - - csr_f1 = jax.grad(lambda a: cusparse_csr_matvec(a, indices, indptr, vector, - shape=shape, - transpose=transpose).sum(), - argnums=0) - dense_f1 = jax.grad(lambda a: ((vector @ a).sum() if transpose else (a @ vector).sum()), - argnums=0) - - r1 = csr_f1(heter_data) - r2 = dense_f1(dense_data) - rows, cols = bm.sparse.csr_to_coo(indices, indptr) - r2 = r2[rows, cols] - self.assertTrue(bm.allclose(r1, r2)) - - csr_f2 = jax.grad(lambda v: cusparse_csr_matvec(heter_data, indices, indptr, v, - shape=shape, - transpose=transpose).sum(), - argnums=0) - dense_f2 = jax.grad(lambda v: ((v @ dense_data).sum() if transpose else (dense_data @ v).sum()), - argnums=0) - r3 = csr_f2(vector) - r4 = dense_f2(vector) - self.assertTrue(bm.allclose(r3, r4)) - - bm.clear_buffer_memory() - - -class Test_csrmv(parameterized.TestCase): - def __init__(self, *args, platform='cpu', **kwargs): - super(Test_csrmv, self).__init__(*args, **kwargs) - - print() - bm.set_platform(platform) - - @parameterized.product( - homo_data=[-1., 0., 0.1, 1.], - shape=[(100, 200), (10, 1000), (2, 2000)], - ) - def test_homo(self, shape, homo_data): - conn = bp.conn.FixedProb(0.1) - - # matrix - indices, indptr = conn(*shape).require('pre2post') - indices = bm.as_jax(indices) - indptr = bm.as_jax(indptr) - # vector - rng = bm.random.RandomState(123) - vector = rng.random(shape[1]) - vector = bm.as_jax(vector) - - # csrmv - r1 = scalar_csr_matvec(homo_data, indices, indptr, vector, shape=shape) - r2 = cusparse_csr_matvec(homo_data, indices, indptr, vector, shape=shape) - r3 = vector_csr_matvec(homo_data, indices, indptr, vector, shape=shape) - self.assertTrue(bm.allclose(r1, r2)) - self.assertTrue(bm.allclose(r1, r3)) - - heter_data = bm.ones(indices.shape).to_jax() * homo_data - r4 = scalar_csr_matvec(heter_data, indices, indptr, vector, shape=shape) - r5 = cusparse_csr_matvec(heter_data, indices, indptr, vector, shape=shape) - r6 = vector_csr_matvec(heter_data, indices, indptr, vector, shape=shape) - self.assertTrue(bm.allclose(r1, r4)) - self.assertTrue(bm.allclose(r1, r5)) - self.assertTrue(bm.allclose(r1, r6)) - - dense = 
bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape)
-    rdense = dense @ vector
-    self.assertTrue(bm.allclose(r1, rdense))
-
-    bm.clear_buffer_memory()
-
-  @parameterized.product(
-    shape=[(100, 200), (200, 100), (10, 1000), (2, 2000)]
-  )
-  def test_heter(self, shape):
-    rng = bm.random.RandomState()
-    conn = bp.conn.FixedProb(0.1)
-
-    indices, indptr = conn(*shape).require('pre2post')
-    indices = bm.as_jax(indices)
-    indptr = bm.as_jax(indptr)
-    heter_data = bm.as_jax(rng.random(indices.shape))
-    vector = bm.as_jax(rng.random(shape[1]))
-
-    r1 = scalar_csr_matvec(heter_data, indices, indptr, vector, shape=shape)
-    r2 = cusparse_csr_matvec(heter_data, indices, indptr, vector, shape=shape)
-    r3 = vector_csr_matvec(heter_data, indices, indptr, vector, shape=shape)
-
-    dense = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape)
-    r4 = dense @ vector
-    self.assertTrue(bm.allclose(r1, r2))
-    self.assertTrue(bm.allclose(r1, r3))
-    self.assertTrue(bm.allclose(r1, r4))
-
-    bm.clear_buffer_memory()
-
-  @parameterized.product(
-    shape=[(200, 200), (200, 100), (10, 1000), (2, 2000)]
-  )
-  def test_heter_grad(self, shape):
-    rng = bm.random.RandomState()
-    conn = bp.conn.FixedProb(0.1)
-
-    indices, indptr = conn(*shape).require('pre2post')
-    heter_data = rng.random(indices.shape)
-    dense_data = bm.sparse.csr_to_dense(heter_data, indices, indptr, shape=shape)
-    vector = rng.random(shape[1])
-
-    csr_f1 = jax.grad(lambda a: cusparse_csr_matvec(a, indices, indptr, vector, shape=shape).sum())
-    csr_f2 = jax.grad(lambda a: scalar_csr_matvec(a, indices, indptr, vector, shape=shape).sum())
-    csr_f3 = jax.grad(lambda a: vector_csr_matvec(a, indices, indptr, vector, shape=shape).sum())
-    dense_f1 = jax.grad(lambda a: (a @ vector).sum())
-
-    r1 = csr_f1(heter_data)
-    r2 = csr_f2(heter_data)
-    r3 = csr_f3(heter_data)
-
-    d1 = dense_f1(dense_data)
-    rows, cols = bm.sparse.csr_to_coo(indices, indptr)
-    d1 = d1[rows, cols]
-    self.assertTrue(bm.allclose(r1, r2))
-    self.assertTrue(bm.allclose(r1, r3))
-    self.assertTrue(bm.allclose(r1, d1))
-
-    # csr_f4 = jax.grad(lambda v: cusparse_csr_matvec(heter_data, indices, indptr, v, shape=shape).sum())
-    # csr_f5 = jax.grad(lambda v: scalar_csr_matvec(heter_data, indices, indptr, v, shape=shape).sum())
-    # csr_f6 = jax.grad(lambda v: vector_csr_matvec(heter_data, indices, indptr, v, shape=shape).sum())
-    # dense_f2 = jax.grad(lambda v: (dense_data @ v).sum())
-    # r4 = csr_f4(vector)
-    # r5 = csr_f5(vector)
-    # r6 = csr_f6(vector)
-    # d2 = dense_f2(vector)
-    # self.assertTrue(bm.allclose(r4, r5))
-    # self.assertTrue(bm.allclose(r4, r6))
-    # self.assertTrue(bm.allclose(r4, d2))
-
-    bm.clear_buffer_memory()
-
-
diff --git a/brainpy/_src/math/surrogate/_one_input_new.py b/brainpy/_src/math/surrogate/_one_input_new.py
index 64c7280d0..bfffd88f5 100644
--- a/brainpy/_src/math/surrogate/_one_input_new.py
+++ b/brainpy/_src/math/surrogate/_one_input_new.py
@@ -90,7 +90,30 @@ def _as_jax(x):
 class Surrogate(object):
-  """The base surrograte gradient function."""
+  """The base surrogate gradient function.
+
+  To customize a surrogate gradient function, you can inherit this class and
+  implement the `surrogate_fun` and `surrogate_grad` methods.
+
+  Examples
+  --------
+
+  >>> import brainpy as bp
+  >>> import brainpy.math as bm
+  >>> import jax.numpy as jnp
+
+  >>> class MySurrogate(bm.Surrogate):
+  ...   def __init__(self, alpha=1.):
+  ...     super().__init__()
+  ...     self.alpha = alpha
+  ...
+  ...   def surrogate_fun(self, x):
+  ...
return jnp.sin(x) * self.alpha + ... + ... def surrogate_grad(self, x): + ... return jnp.cos(x) * self.alpha + + """ def __call__(self, x): x = _as_jax(x) @@ -123,7 +146,7 @@ def __init__(self, alpha: float = 4.): self.alpha = alpha def surrogate_fun(self, x): - return sci.special.expit(x) + return sci.special.expit(self.alpha * x) def surrogate_grad(self, x): sgax = sci.special.expit(x * self.alpha) diff --git a/brainpy/_src/math/tests/test_tifunc.py b/brainpy/_src/math/tests/test_tifunc.py index 6823ebabd..db6e7debc 100644 --- a/brainpy/_src/math/tests/test_tifunc.py +++ b/brainpy/_src/math/tests/test_tifunc.py @@ -1,122 +1,124 @@ -# -*- coding: utf-8 -*- - -import jax -import jax.numpy as jnp -import pytest - -pytestmark = pytest.mark.skip(reason="Skipped due to MacOS limitation, manual execution required for testing.") -import brainpy.math as bm -import taichi as ti -import matplotlib.pyplot as plt -import os - - -bm.set_platform('cpu') - - -def test_taichi_random(): - @ti.kernel - def test_taichi_lfsr88(seed: ti.types.ndarray(ndim=1, dtype=ti.u32), - out: ti.types.ndarray(ndim=1, dtype=ti.f32)): - key = bm.tifunc.lfsr88_key(seed[0]) - for i in range(out.shape[0]): - key, result = bm.tifunc.lfsr88_rand(key) - out[i] = result - - @ti.kernel - def test_taichi_lcg_rand(seed: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - for i in range(out.shape[0]): - out[i] = bm.tifunc.taichi_lcg_rand(seed) - - @ti.kernel - def test_taichi_uniform_int_distribution(seed: ti.types.ndarray(ndim=1), - low_high: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - key = bm.tifunc.lfsr88_key(seed[0]) - low = low_high[0] - high = low_high[1] - for i in range(out.shape[0]): - key, out[i] = bm.tifunc.lfsr88_randint(key, low, high) - - @ti.kernel - def test_taichi_uniform_real_distribution(seed: ti.types.ndarray(ndim=1), - low_high: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - key = bm.tifunc.lfsr88_key(seed[0]) - low = low_high[0] - high = low_high[1] - for i in range(out.shape[0]): - key, out[i] = bm.tifunc.lfsr88_uniform(key, low, high) - - @ti.kernel - def test_taichi_normal_distribution(seed: ti.types.ndarray(ndim=1), - mu_sigma: ti.types.ndarray(ndim=1), - out: ti.types.ndarray(ndim=1)): - key = bm.tifunc.lfsr88_key(seed[0]) - mu = mu_sigma[0] - sigma = mu_sigma[1] - - for i in range(out.shape[0]): - key, out[i] = bm.tifunc.lfsr88_normal(key, mu, sigma) - - n = 100000 - seed = jnp.array([1234, ], dtype=jnp.uint32) - low_high = jnp.array([0, 10]) - mu_sigma = jnp.array([0, 1]) - - prim_lfsr88 = bm.XLACustomOp(cpu_kernel=test_taichi_lfsr88, - gpu_kernel=test_taichi_lfsr88) - - - prim_lcg_rand = bm.XLACustomOp(cpu_kernel=test_taichi_lcg_rand, - gpu_kernel=test_taichi_lcg_rand) - prim_uniform_int_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_uniform_int_distribution, - gpu_kernel=test_taichi_uniform_int_distribution) - prim_uniform_real_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_uniform_real_distribution, - gpu_kernel=test_taichi_uniform_real_distribution) - prim_normal_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_normal_distribution, - gpu_kernel=test_taichi_normal_distribution) - - file_path = os.path.dirname(os.path.abspath(__file__)) - - out = prim_lfsr88(seed, outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) - # show the distribution of out - plt.hist(out, bins=100) - plt.title("LFSR88 random number generator") - plt.savefig(file_path + "/lfsr88.png") - plt.close() - - out = prim_lcg_rand(seed, - outs=[jax.ShapeDtypeStruct((n,), 
jnp.float32)]) - # show the distribution of out - plt.hist(out, bins=100) - plt.title("LCG random number generator") - plt.savefig(file_path + "/lcg_rand.png") - plt.close() - - out = prim_uniform_int_distribution(seed, low_high, - outs=[jax.ShapeDtypeStruct((n,), jnp.int32)]) - # show the distribution of out - plt.hist(out, bins=10) - plt.title("Uniform int distribution (0, 10)") - plt.savefig(file_path + "/uniform_int_distribution.png") - plt.close() - - out = prim_uniform_real_distribution(seed, low_high, - outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) - # show the distribution of out - plt.hist(out, bins=100) - plt.title("Uniform real distribution (0, 10)") - plt.savefig(file_path + "/uniform_real_distribution.png") - plt.close() - - out = prim_normal_distribution(seed, mu_sigma, - outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) - # show the distribution of out - plt.title("Normal distribution mu=0, sigma=1") - plt.hist(out, bins=100) - plt.savefig(file_path + "/normal_distribution.png") - - -# TODO; test default types +# -*- coding: utf-8 -*- + +import jax +import jax.numpy as jnp +import pytest + +pytestmark = pytest.mark.skip(reason="Skipped due to MacOS limitation, manual execution required for testing.") +import brainpy.math as bm +import matplotlib.pyplot as plt +import os + +from brainpy._src.dependency_check import import_taichi + +ti = import_taichi(error_if_not_found=False) +if ti is None: + pytest.skip('no taichi', allow_module_level=True) + +bm.set_platform('cpu') + + +def test_taichi_random(): + @ti.kernel + def test_taichi_lfsr88(seed: ti.types.ndarray(ndim=1, dtype=ti.u32), + out: ti.types.ndarray(ndim=1, dtype=ti.f32)): + key = bm.tifunc.lfsr88_key(seed[0]) + for i in range(out.shape[0]): + key, result = bm.tifunc.lfsr88_rand(key) + out[i] = result + + @ti.kernel + def test_taichi_lcg_rand(seed: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + for i in range(out.shape[0]): + out[i] = bm.tifunc.taichi_lcg_rand(seed) + + @ti.kernel + def test_taichi_uniform_int_distribution(seed: ti.types.ndarray(ndim=1), + low_high: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + key = bm.tifunc.lfsr88_key(seed[0]) + low = low_high[0] + high = low_high[1] + for i in range(out.shape[0]): + key, out[i] = bm.tifunc.lfsr88_randint(key, low, high) + + @ti.kernel + def test_taichi_uniform_real_distribution(seed: ti.types.ndarray(ndim=1), + low_high: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + key = bm.tifunc.lfsr88_key(seed[0]) + low = low_high[0] + high = low_high[1] + for i in range(out.shape[0]): + key, out[i] = bm.tifunc.lfsr88_uniform(key, low, high) + + @ti.kernel + def test_taichi_normal_distribution(seed: ti.types.ndarray(ndim=1), + mu_sigma: ti.types.ndarray(ndim=1), + out: ti.types.ndarray(ndim=1)): + key = bm.tifunc.lfsr88_key(seed[0]) + mu = mu_sigma[0] + sigma = mu_sigma[1] + + for i in range(out.shape[0]): + key, out[i] = bm.tifunc.lfsr88_normal(key, mu, sigma) + + n = 100000 + seed = jnp.array([1234, ], dtype=jnp.uint32) + low_high = jnp.array([0, 10]) + mu_sigma = jnp.array([0, 1]) + + prim_lfsr88 = bm.XLACustomOp(cpu_kernel=test_taichi_lfsr88, + gpu_kernel=test_taichi_lfsr88) + + prim_lcg_rand = bm.XLACustomOp(cpu_kernel=test_taichi_lcg_rand, + gpu_kernel=test_taichi_lcg_rand) + prim_uniform_int_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_uniform_int_distribution, + gpu_kernel=test_taichi_uniform_int_distribution) + prim_uniform_real_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_uniform_real_distribution, + 
gpu_kernel=test_taichi_uniform_real_distribution) + prim_normal_distribution = bm.XLACustomOp(cpu_kernel=test_taichi_normal_distribution, + gpu_kernel=test_taichi_normal_distribution) + + file_path = os.path.dirname(os.path.abspath(__file__)) + + out = prim_lfsr88(seed, outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) + # show the distribution of out + plt.hist(out, bins=100) + plt.title("LFSR88 random number generator") + plt.savefig(file_path + "/lfsr88.png") + plt.close() + + out = prim_lcg_rand(seed, + outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) + # show the distribution of out + plt.hist(out, bins=100) + plt.title("LCG random number generator") + plt.savefig(file_path + "/lcg_rand.png") + plt.close() + + out = prim_uniform_int_distribution(seed, low_high, + outs=[jax.ShapeDtypeStruct((n,), jnp.int32)]) + # show the distribution of out + plt.hist(out, bins=10) + plt.title("Uniform int distribution (0, 10)") + plt.savefig(file_path + "/uniform_int_distribution.png") + plt.close() + + out = prim_uniform_real_distribution(seed, low_high, + outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) + # show the distribution of out + plt.hist(out, bins=100) + plt.title("Uniform real distribution (0, 10)") + plt.savefig(file_path + "/uniform_real_distribution.png") + plt.close() + + out = prim_normal_distribution(seed, mu_sigma, + outs=[jax.ShapeDtypeStruct((n,), jnp.float32)]) + # show the distribution of out + plt.title("Normal distribution mu=0, sigma=1") + plt.hist(out, bins=100) + plt.savefig(file_path + "/normal_distribution.png") + +# TODO; test default types diff --git a/brainpy/_src/math/tifunc.py b/brainpy/_src/math/tifunc.py index a9ee39f4a..9cfd39e1a 100644 --- a/brainpy/_src/math/tifunc.py +++ b/brainpy/_src/math/tifunc.py @@ -1,7 +1,7 @@ -from brainpy._src.dependency_check import import_taichi +from brainpy._src.dependency_check import import_taichi, raise_taichi_not_found from . import defaults -ti = import_taichi() +ti = import_taichi(error_if_not_found=False) __all__ = [ # taichi function for other utilities @@ -16,349 +16,330 @@ 'lfsr113_random_integers', 'lfsr113_randint', 'lfsr113_uniform', 'lfsr113_rand', ] +if ti is not None: -@ti.func -def _lcg_rand(state: ti.types.ndarray(ndim=1)): - # LCG constants - state[0] = ti.u32(1664525) * state[0] + ti.u32(1013904223) - return state[0] + ############################################# + # Random Number Generator: LFSR88 algorithm # + ############################################# + @ti.func + def lfsr88_key(seed: ti.u32) -> ti.types.vector(4, ti.u32): + """Initialize the random key of LFSR88 algorithm (Combined LFSR random number generator by L'Ecuyer). -@ti.func -def taichi_lcg_rand(seed: ti.types.ndarray(ndim=1)): - """ - Generate a random number using the Taichi LCG algorithm. + This key is used in LFSR88 based random number generator functions, like ``lfsr88_rand()``. - Parameters: - seed (ti.types.ndarray): The seed value for the random number generator. + Source: + https://github.com/cmcqueen/simplerandom/blob/main/c/lecuyer/lfsr88.c - Returns: - float: A random number between 0 and 1. - """ + /**** VERY IMPORTANT **** : + The initial seeds s1, s2, s3 MUST be larger than + 1, 7, and 15 respectively. + */ - return float(_lcg_rand(seed)) / ti.u32(2 ** 32 - 1) + Args: + seed: int. The seed value for the random number generator. + Returns: + ti.math.uvec4: The random key for the LFSR88 random number generator. 
+ """ + return ti.math.uvec4(ti.u32(seed + 1), ti.u32(seed + 7), ti.u32(seed + 15), ti.u32(0)) -############################################# -# Random Number Generator: LFSR88 algorithm # -############################################# + @ti.func + def lfsr88_next_key(key: ti.types.vector(4, ti.u32)) -> ti.types.vector(4, ti.u32): + """Next random key of LFSR88 algorithm (Combined LFSR random number generator by L'Ecuyer). -@ti.func -def lfsr88_key(seed: ti.u32) -> ti.types.vector(4, ti.u32): - """Initialize the random key of LFSR88 algorithm (Combined LFSR random number generator by L'Ecuyer). + Args: + key: The state value for the random number generator. - This key is used in LFSR88 based random number generator functions, like ``lfsr88_rand()``. + Returns: + ti.math.uvec4: The next random key. + """ + b = ti.u32(((key[0] << 13) ^ key[0]) >> 19) + s1 = ((key[0] & ti.u32(4294967294)) << 12) ^ b + b = ((key[1] << 2) ^ key[1]) >> 25 + s2 = ((key[1] & ti.u32(4294967288)) << 4) ^ b + b = ((key[2] << 3) ^ key[2]) >> 11 + s3 = ((key[2] & ti.u32(4294967280)) << 17) ^ b + return ti.math.uvec4(s1, s2, s3, b) - Source: - https://github.com/cmcqueen/simplerandom/blob/main/c/lecuyer/lfsr88.c - /**** VERY IMPORTANT **** : - The initial seeds s1, s2, s3 MUST be larger than - 1, 7, and 15 respectively. - */ + @ti.func + def lfsr88_normal(key: ti.types.vector(4, ti.u32), mu, sigma, epsilon=1e-10): + """ + Generate a random number of the normal distribution ``N(mu, sigma)`` using the LFSR88 algorithm. - Args: - seed: int. The seed value for the random number generator. + Args: + key: The state value for the random number generator. + mu: The mean of the normal distribution. + sigma: The standard deviation of the normal distribution. + epsilon: The epsilon value to avoid log(0). + """ - Returns: - ti.math.uvec4: The random key for the LFSR88 random number generator. - """ - return ti.math.uvec4(ti.u32(seed + 1), ti.u32(seed + 7), ti.u32(seed + 15), ti.u32(0)) + key, r = lfsr88_randn(key, epsilon) + return key, mu + sigma * r -@ti.func -def lfsr88_next_key(key: ti.types.vector(4, ti.u32)) -> ti.types.vector(4, ti.u32): - """Next random key of LFSR88 algorithm (Combined LFSR random number generator by L'Ecuyer). + @ti.func + def lfsr88_randn(key: ti.types.vector(4, ti.u32), epsilon=1e-10): + """ + Generate a random number with the standard normal distribution using the LFSR88 algorithm. - Args: - key: The state value for the random number generator. + Args: + key: The state value for the random number generator. + epsilon: The epsilon value to avoid log(0). - Returns: - ti.math.uvec4: The next random key. - """ - b = ti.u32(((key[0] << 13) ^ key[0]) >> 19) - s1 = ((key[0] & ti.u32(4294967294)) << 12) ^ b - b = ((key[1] << 2) ^ key[1]) >> 25 - s2 = ((key[1] & ti.u32(4294967288)) << 4) ^ b - b = ((key[2] << 3) ^ key[2]) >> 11 - s3 = ((key[2] & ti.u32(4294967280)) << 17) ^ b - return ti.math.uvec4(s1, s2, s3, b) + References: + Box–Muller transform. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform + Marsaglia polar method. https://en.wikipedia.org/wiki/Marsaglia_polar_method + """ -@ti.func -def lfsr88_normal(key: ti.types.vector(4, ti.u32), mu, sigma, epsilon=1e-10): - """ - Generate a random number of the normal distribution ``N(mu, sigma)`` using the LFSR88 algorithm. + key, u1 = lfsr88_rand(key) + key, u2 = lfsr88_rand(key) - Args: - key: The state value for the random number generator. - mu: The mean of the normal distribution. - sigma: The standard deviation of the normal distribution. 
- epsilon: The epsilon value to avoid log(0). - """ + # Ensure state1 is not zero to avoid log(0) + u1 = ti.cast(ti.max(u1, epsilon), defaults.ti_float) - key, r = lfsr88_randn(key, epsilon) - return key, mu + sigma * r + # Normalize the uniform samples + mag = ti.cast(ti.sqrt(-2.0 * ti.log(u1)), defaults.ti_float) + # Box-Muller transform + # z1 = mag * ti.cos(2 * ti.math.pi * u2) + z2 = ti.cast(mag * ti.sin(2 * ti.math.pi * u2), defaults.ti_float) -@ti.func -def lfsr88_randn(key: ti.types.vector(4, ti.u32), epsilon=1e-10): - """ - Generate a random number with the standard normal distribution using the LFSR88 algorithm. + return key, z2 - Args: - key: The state value for the random number generator. - epsilon: The epsilon value to avoid log(0). - References: - Box–Muller transform. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform - Marsaglia polar method. https://en.wikipedia.org/wiki/Marsaglia_polar_method + @ti.func + def lfsr88_random_integers(key: ti.types.vector(4, ti.u32), low, high): + """ + Generates a uniformly distributed random integer between `low` and `high` (inclusive) using the LFSR88 algorithm. - """ + Parameters: + key: The state value used for random number generation. + low: The lower bound of the range. + high: The upper bound of the range. + """ + key = lfsr88_next_key(key) + return key, ti.cast((key[0] ^ key[1] ^ key[2]) % (high + 1 - low) + low, defaults.ti_int) - key, u1 = lfsr88_rand(key) - key, u2 = lfsr88_rand(key) - # Ensure state1 is not zero to avoid log(0) - u1 = ti.cast(ti.max(u1, epsilon), defaults.ti_float) + @ti.func + def lfsr88_randint(key: ti.types.vector(4, ti.u32), dtype=ti.u32): + key = lfsr88_next_key(key) + return key, dtype(key[0] ^ key[1] ^ key[2]) - # Normalize the uniform samples - mag = ti.cast(ti.sqrt(-2.0 * ti.log(u1)), defaults.ti_float) - # Box-Muller transform - # z1 = mag * ti.cos(2 * ti.math.pi * u2) - z2 = ti.cast(mag * ti.sin(2 * ti.math.pi * u2), defaults.ti_float) + @ti.func + def lfsr88_uniform(key: ti.types.vector(4, ti.u32), low, high): + """ + Generates a uniformly distributed random float between `low` and `high` (inclusive) using the LFSR88 algorithm. - return key, z2 + Args: + key: The state value used for random number generation. + low: The lower bound of the range. + high: The upper bound of the range. + """ + key = lfsr88_next_key(key) + r = (key[0] ^ key[1] ^ key[2]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) + return key, ti.cast(r * (high - low) + low, defaults.ti_float) -@ti.func -def lfsr88_random_integers(key: ti.types.vector(4, ti.u32), low, high): - """ - Generates a uniformly distributed random integer between `low` and `high` (inclusive) using the LFSR88 algorithm. + @ti.func + def lfsr88_rand(key: ti.types.vector(4, ti.u32)): + """ + Generates a uniformly distributed random float between 0 and 1 using the LFSR88 algorithm. - Parameters: - key: The state value used for random number generation. - low: The lower bound of the range. - high: The upper bound of the range. - """ - key = lfsr88_next_key(key) - return key, ti.cast((key[0] ^ key[1] ^ key[2]) % (high + 1 - low) + low, defaults.ti_int) + Args: + key: The state value used for random number generation. 
+ """ + key = lfsr88_next_key(key) + return key, (key[0] ^ key[1] ^ key[2]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) -@ti.func -def lfsr88_randint(key: ti.types.vector(4, ti.u32), dtype=ti.u32): - key = lfsr88_next_key(key) - return key, dtype(key[0] ^ key[1] ^ key[2]) + ############################################## + # Random Number Generator: LFSR113 algorithm # + ############################################## + @ti.func + def lfsr113_key(seed: ti.u32) -> ti.types.vector(4, ti.u32): + """Initialize the random key of LFSR113 algorithm (Combined LFSR random number generator by L'Ecuyer). -@ti.func -def lfsr88_uniform(key: ti.types.vector(4, ti.u32), low, high): - """ - Generates a uniformly distributed random float between `low` and `high` (inclusive) using the LFSR88 algorithm. + This key is used in LFSR113 based random number generator functions, like ``lfsr113_rand()``. - Args: - key: The state value used for random number generation. - low: The lower bound of the range. - high: The upper bound of the range. - """ - key = lfsr88_next_key(key) - r = (key[0] ^ key[1] ^ key[2]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) - return key, ti.cast(r * (high - low) + low, defaults.ti_float) + Source: + https://github.com/cmcqueen/simplerandom/blob/main/c/lecuyer/lfsr113.c + /**** VERY IMPORTANT **** : + The initial seeds s1, s2, s3, s4 MUST be larger than + 1, 7, 15, and 127 respectively. + */ -@ti.func -def lfsr88_rand(key: ti.types.vector(4, ti.u32)): - """ - Generates a uniformly distributed random float between 0 and 1 using the LFSR88 algorithm. + Args: + seed: int. The seed value for the random number generator. - Args: - key: The state value used for random number generation. - """ - key = lfsr88_next_key(key) - return key, (key[0] ^ key[1] ^ key[2]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) + Returns: + ti.math.uvec4: The random key for the LFSR113 random number generator. + """ + return ti.math.uvec4(ti.u32(seed + 1), ti.u32(seed + 7), ti.u32(seed + 15), ti.u32(seed + 127)) -############################################## -# Random Number Generator: LFSR113 algorithm # -############################################## + @ti.func + def lfsr113_next_key(key: ti.types.vector(4, ti.u32)) -> ti.types.vector(4, ti.u32): + """Next random key of LFSR113 algorithm (Combined LFSR random number generator by L'Ecuyer). + Args: + key: The state value for the random number generator. -@ti.func -def lfsr113_key(seed: ti.u32) -> ti.types.vector(4, ti.u32): - """Initialize the random key of LFSR113 algorithm (Combined LFSR random number generator by L'Ecuyer). + Returns: + ti.math.uvec4: The next random key. + """ + z1 = key[0] + z2 = key[1] + z3 = key[2] + z4 = key[3] + b = ((z1 << 6) ^ z1) >> 13 + z1 = ti.u32(((z1 & ti.u64(4294967294)) << 18) ^ b) + b = ((z2 << 2) ^ z2) >> 27 + z2 = ti.u32(((z2 & ti.u64(4294967288)) << 2) ^ b) + b = ((z3 << 13) ^ z3) >> 21 + z3 = ti.u32(((z3 & ti.u64(4294967280)) << 7) ^ b) + b = ((z4 << 3) ^ z4) >> 12 + z4 = ti.u32(((z4 & ti.u64(4294967168)) << 13) ^ b) + return ti.math.uvec4(z1, z2, z3, z4) - This key is used in LFSR113 based random number generator functions, like ``lfsr113_rand()``. - Source: - https://github.com/cmcqueen/simplerandom/blob/main/c/lecuyer/lfsr113.c + @ti.func + def lfsr113_normal(key: ti.types.vector(4, ti.u32), mu, sigma, epsilon=1e-10): + """ + Generate a random number of the normal distribution ``N(mu, sigma)`` using the LFSR113 algorithm. 
- /**** VERY IMPORTANT **** : - The initial seeds s1, s2, s3, s4 MUST be larger than - 1, 7, 15, and 127 respectively. - */ + Args: + key: The state value for the random number generator. + mu: The mean of the normal distribution. + sigma: The standard deviation of the normal distribution. + epsilon: The epsilon value to avoid log(0). + """ - Args: - seed: int. The seed value for the random number generator. + key, r = lfsr113_randn(key, epsilon) + return key, ti.cast(mu + sigma * r, defaults.ti_float) - Returns: - ti.math.uvec4: The random key for the LFSR113 random number generator. - """ - return ti.math.uvec4(ti.u32(seed + 1), ti.u32(seed + 7), ti.u32(seed + 15), ti.u32(seed + 127)) + @ti.func + def lfsr113_randn(key: ti.types.vector(4, ti.u32), epsilon=1e-10): + """ + Generate a random number with standard normal distribution using the LFSR113 algorithm. -@ti.func -def lfsr113_next_key(key: ti.types.vector(4, ti.u32)) -> ti.types.vector(4, ti.u32): - """Next random key of LFSR113 algorithm (Combined LFSR random number generator by L'Ecuyer). + Args: + key: The state value for the random number generator. + epsilon: The epsilon value to avoid log(0). - Args: - key: The state value for the random number generator. + References: + Box–Muller transform. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform + Marsaglia polar method. https://en.wikipedia.org/wiki/Marsaglia_polar_method - Returns: - ti.math.uvec4: The next random key. - """ - z1 = key[0] - z2 = key[1] - z3 = key[2] - z4 = key[3] - b = ((z1 << 6) ^ z1) >> 13 - z1 = ti.u32(((z1 & ti.u64(4294967294)) << 18) ^ b) - b = ((z2 << 2) ^ z2) >> 27 - z2 = ti.u32(((z2 & ti.u64(4294967288)) << 2) ^ b) - b = ((z3 << 13) ^ z3) >> 21 - z3 = ti.u32(((z3 & ti.u64(4294967280)) << 7) ^ b) - b = ((z4 << 3) ^ z4) >> 12 - z4 = ti.u32(((z4 & ti.u64(4294967168)) << 13) ^ b) - return ti.math.uvec4(z1, z2, z3, z4) + """ + key, u1 = lfsr113_rand(key) + key, u2 = lfsr113_rand(key) -@ti.func -def lfsr113_normal(key: ti.types.vector(4, ti.u32), mu, sigma, epsilon=1e-10): - """ - Generate a random number of the normal distribution ``N(mu, sigma)`` using the LFSR113 algorithm. + # Ensure state1 is not zero to avoid log(0) + u1 = ti.cast(ti.max(u1, epsilon), defaults.ti_float) - Args: - key: The state value for the random number generator. - mu: The mean of the normal distribution. - sigma: The standard deviation of the normal distribution. - epsilon: The epsilon value to avoid log(0). - """ + # Normalize the uniform samples + mag = ti.cast(ti.sqrt(-2.0 * ti.log(u1)), defaults.ti_float) - key, r = lfsr113_randn(key, epsilon) - return key, ti.cast(mu + sigma * r, defaults.ti_float) + # Box-Muller transform + # z1 = mag * ti.cos(2 * ti.math.pi * u2) + z2 = ti.cast(mag * ti.sin(2 * ti.math.pi * u2), defaults.ti_float) + return key, z2 -@ti.func -def lfsr113_randn(key: ti.types.vector(4, ti.u32), epsilon=1e-10): - """ - Generate a random number with standard normal distribution using the LFSR113 algorithm. - Args: - key: The state value for the random number generator. - epsilon: The epsilon value to avoid log(0). + @ti.func + def lfsr113_random_integers(key: ti.types.vector(4, ti.u32), low, high): + """ + Generates a uniformly distributed random integer between `low` and `high` (inclusive) using the LFSR113 algorithm. - References: - Box–Muller transform. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform - Marsaglia polar method. 
https://en.wikipedia.org/wiki/Marsaglia_polar_method + Parameters: + key: The state value used for random number generation. + low: The lower bound of the range. + high: The upper bound of the range. + """ + key = lfsr113_next_key(key) + return key, ti.cast((key[0] ^ key[1] ^ key[2] ^ key[3]) % (high + 1 - low) + low, defaults.ti_int) - """ - key, u1 = lfsr113_rand(key) - key, u2 = lfsr113_rand(key) + @ti.func + def lfsr113_randint(key: ti.types.vector(4, ti.u32)): + key = lfsr113_next_key(key) + return key, ti.cast(key[0] ^ key[1] ^ key[2] ^ key[3], defaults.ti_int) - # Ensure state1 is not zero to avoid log(0) - u1 = ti.cast(ti.max(u1, epsilon), defaults.ti_float) - # Normalize the uniform samples - mag = ti.cast(ti.sqrt(-2.0 * ti.log(u1)), defaults.ti_float) + @ti.func + def lfsr113_uniform(key: ti.types.vector(4, ti.u32), low, high): + """ + Generates a uniformly distributed random float between `low` and `high` (inclusive) using the LFSR113 algorithm. - # Box-Muller transform - # z1 = mag * ti.cos(2 * ti.math.pi * u2) - z2 = ti.cast(mag * ti.sin(2 * ti.math.pi * u2), defaults.ti_float) + Args: + key: The state value used for random number generation. + low: The lower bound of the range. + high: The upper bound of the range. + """ + key = lfsr88_next_key(key) + r = (key[0] ^ key[1] ^ key[2] ^ key[3]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) + return key, ti.cast(r * (high - low) + low, defaults.ti_float) + + + @ti.func + def lfsr113_rand(key: ti.types.vector(4, ti.u32)): + """ + Generates a uniformly distributed random float between 0 and 1 using the LFSR113 algorithm. - return key, z2 + Args: + key: The state value used for random number generation. + """ + key = lfsr113_next_key(key) + return key, (key[0] ^ key[1] ^ key[2] ^ key[3]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) -@ti.func -def lfsr113_random_integers(key: ti.types.vector(4, ti.u32), low, high): - """ - Generates a uniformly distributed random integer between `low` and `high` (inclusive) using the LFSR113 algorithm. + ########################### + # Reductions: warp reduce # + ########################### - Parameters: - key: The state value used for random number generation. - low: The lower bound of the range. - high: The upper bound of the range. - """ - key = lfsr113_next_key(key) - return key, ti.cast((key[0] ^ key[1] ^ key[2] ^ key[3]) % (high + 1 - low) + low, defaults.ti_int) + @ti.func + def warp_reduce_sum_all(val): + """ + Warp reduce sum. + Args: + val (float): The value to be reduced. -@ti.func -def lfsr113_randint(key: ti.types.vector(4, ti.u32)): - key = lfsr113_next_key(key) - return key, ti.cast(key[0] ^ key[1] ^ key[2] ^ key[3], defaults.ti_int) + Returns: + float: The reduced value. + """ + for i in ti.static(range(1, 32)): + val += ti.static(ti.simt.warp.shfl_xor(val, i)) + return val -@ti.func -def lfsr113_uniform(key: ti.types.vector(4, ti.u32), low, high): - """ - Generates a uniformly distributed random float between `low` and `high` (inclusive) using the LFSR113 algorithm. + @ti.func + def warp_reduce_sum(val): + """ + Warp reduce sum. - Args: - key: The state value used for random number generation. - low: The lower bound of the range. - high: The upper bound of the range. 
- """ - key = lfsr88_next_key(key) - r = (key[0] ^ key[1] ^ key[2] ^ key[3]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) - return key, ti.cast(r * (high - low) + low, defaults.ti_float) - - -@ti.func -def lfsr113_rand(key: ti.types.vector(4, ti.u32)): - """ - Generates a uniformly distributed random float between 0 and 1 using the LFSR113 algorithm. + Args: + val (float): The value to be reduced. - Args: - key: The state value used for random number generation. - """ - key = lfsr113_next_key(key) - return key, (key[0] ^ key[1] ^ key[2] ^ key[3]) * ti.cast(2.3283064365386963e-10, defaults.ti_float) + Returns: + float: The reduced value. + """ + for offset in ti.static((16, 8, 4, 2, 1)): + val += ti.simt.warp.shfl_down_f32(ti.u32(0xFFFFFFFF), val, offset) + return val -########################### -# Reductions: warp reduce # -########################### - - -@ti.func -def warp_reduce_sum_all(val): - """ - Warp reduce sum. - - Args: - val (float): The value to be reduced. - - Returns: - float: The reduced value. - """ - for i in ti.static(range(1, 32)): - val += ti.static(ti.simt.warp.shfl_xor(val, i)) - return val - - -@ti.func -def warp_reduce_sum(val): - """ - Warp reduce sum. - - Args: - val (float): The value to be reduced. - - Returns: - float: The reduced value. - """ - for offset in ti.static((16, 8, 4, 2, 1)): - val += ti.simt.warp.shfl_down_f32(ti.u32(0xFFFFFFFF), val, offset) - return val +else: + for func in __all__: + globals()[func] = raise_taichi_not_found \ No newline at end of file diff --git a/brainpy/_src/tests/test_dyn_runner.py b/brainpy/_src/tests/test_dyn_runner.py index dd6865e64..6f2411ee8 100644 --- a/brainpy/_src/tests/test_dyn_runner.py +++ b/brainpy/_src/tests/test_dyn_runner.py @@ -1,134 +1,133 @@ -# -*- coding: utf-8 -*- - - -import unittest -import brainpy as bp -import brainpy.math as bm - - -class TestDSRunner(unittest.TestCase): - def test1(self): - class ExampleDS(bp.DynamicalSystem): - def __init__(self): - super(ExampleDS, self).__init__() - self.i = bm.Variable(bm.zeros(1)) - - def update(self): - self.i += 1 - - ds = ExampleDS() - runner = bp.DSRunner(ds, dt=1., monitors=['i'], progress_bar=False) - runner.run(100.) - - def test_t_and_dt(self): - class ExampleDS(bp.DynamicalSystem): - def __init__(self): - super(ExampleDS, self).__init__() - self.i = bm.Variable(bm.zeros(1)) - - def update(self): - self.i += 1 * bp.share['dt'] - - runner = bp.DSRunner(ExampleDS(), dt=1., monitors=['i'], progress_bar=False) - runner.run(100.) - - def test_DSView(self): - class EINet(bp.Network): - def __init__(self, scale=1.0, method='exp_auto'): - super(EINet, self).__init__() - - # network size - num_exc = int(800 * scale) - num_inh = int(200 * scale) - - # neurons - pars = dict(V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5.) - self.E = bp.neurons.LIF(num_exc, **pars, method=method) - self.I = bp.neurons.LIF(num_inh, **pars, method=method) - self.E.V[:] = bm.random.randn(num_exc) * 2 - 55. - self.I.V[:] = bm.random.randn(num_inh) * 2 - 55. 
- - # synapses - we = 0.6 / scale # excitatory synaptic weight (voltage) - wi = 6.7 / scale # inhibitory synaptic weight - self.E2E = bp.synapses.Exponential(self.E, self.E[:100], bp.conn.FixedProb(0.02), - output=bp.synouts.COBA(E=0.), g_max=we, - tau=5., method=method) - self.E2I = bp.synapses.Exponential(self.E, self.I[:100], bp.conn.FixedProb(0.02), - output=bp.synouts.COBA(E=0.), g_max=we, - tau=5., method=method) - self.I2E = bp.synapses.Exponential(self.I, self.E[:100], bp.conn.FixedProb(0.02), - output=bp.synouts.COBA(E=-80.), g_max=wi, - tau=10., method=method) - self.I2I = bp.synapses.Exponential(self.I, self.I[:100], bp.conn.FixedProb(0.02), - output=bp.synouts.COBA(E=-80.), g_max=wi, - tau=10., method=method) - - bm.random.seed() - - net = EINet(scale=1., method='exp_auto') - # with JIT - runner = bp.DSRunner(net, monitors={'E.spike': net.E.spike}, - inputs=[(net.E.input, 20.), (net.I.input, 20.)]).run(1.) - - # without JIT - runner = bp.DSRunner(net, monitors={'E.spike': net.E.spike}, - inputs=[(net.E.input, 20.), (net.I.input, 20.)], jit=False).run(0.2) - - - -class TestMemoryEfficient(unittest.TestCase): - pass - - - - - - -# class TestMonitor(TestCase): -# def test_1d_array(self): -# try1 = TryGroup(monitors=['a']) -# try1.a = np.ones(1) -# try1.run(100.) -# -# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 1 -# assert np.allclose(np.arange(2, 1002).reshape((-1, 1)), try1.mon.a) -# -# def test_2d_array(): -# set(dt=0.1) -# try1 = TryGroup(monitors=['a']) -# try1.a = np.ones((2, 2)) -# try1.run(100.) -# -# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 4 -# series = np.arange(2, 1002).reshape((-1, 1)) -# series = np.repeat(series, 4, axis=1) -# assert np.allclose(series, try1.mon.a) -# -# def test_monitor_with_every(): -# set(dt=0.1) -# -# # try1: 2d array -# try1 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) -# try1.run(100.) -# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 4 -# series = np.arange(2, 1002, 1. / 0.1).reshape((-1, 1)) -# series = np.repeat(series, 4, axis=1) -# assert np.allclose(series, try1.mon.a) -# -# # try2: 1d array -# try2 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) -# try2.a = np.array([1., 1.]) -# try2.run(100.) -# assert np.ndim(try2.mon.a) == 2 and np.shape(try2.mon.a)[1] == 2 -# series = np.arange(2, 1002, 1. / 0.1).reshape((-1, 1)) -# series = np.repeat(series, 2, axis=1) -# assert np.allclose(series, try2.mon.a) -# -# # try2: scalar -# try3 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) -# try3.a = 1. -# try3.run(100.) -# assert np.ndim(try3.mon.a) == 2 and np.shape(try3.mon.a)[1] == 1 -# series = np.arange(2, 1002, 1. / 0.1).reshape((-1, 1)) -# assert np.allclose(series, try3.mon.a) +# -*- coding: utf-8 -*- + +import pytest +import unittest +import brainpy as bp +import brainpy.math as bm + +from brainpy._src.dependency_check import import_taichi + +if import_taichi(error_if_not_found=False) is None: + pytest.skip('no taichi', allow_module_level=True) + + +class TestDSRunner(unittest.TestCase): + def test1(self): + class ExampleDS(bp.DynamicalSystem): + def __init__(self): + super(ExampleDS, self).__init__() + self.i = bm.Variable(bm.zeros(1)) + + def update(self): + self.i += 1 + + ds = ExampleDS() + runner = bp.DSRunner(ds, dt=1., monitors=['i'], progress_bar=False) + runner.run(100.) 
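The monitored variable can be read back from the runner once `run()` returns. A minimal sketch, assuming the `ExampleDS` class from `test1` above and that `runner.mon` keys each monitored variable by the name passed in `monitors=['i']` (the exact key layout is an assumption here):

import brainpy as bp

runner = bp.DSRunner(ExampleDS(), dt=1., monitors=['i'], progress_bar=False)
runner.run(10.)
# one recorded row per step for the one-element variable 'i'
print(runner.mon['i'].shape)  # expected: (10, 1)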
+ + def test_t_and_dt(self): + class ExampleDS(bp.DynamicalSystem): + def __init__(self): + super(ExampleDS, self).__init__() + self.i = bm.Variable(bm.zeros(1)) + + def update(self): + self.i += 1 * bp.share['dt'] + + runner = bp.DSRunner(ExampleDS(), dt=1., monitors=['i'], progress_bar=False) + runner.run(100.) + + def test_DSView(self): + class EINet(bp.Network): + def __init__(self, scale=1.0, method='exp_auto'): + super(EINet, self).__init__() + + # network size + num_exc = int(800 * scale) + num_inh = int(200 * scale) + + # neurons + pars = dict(V_rest=-60., V_th=-50., V_reset=-60., tau=20., tau_ref=5.) + self.E = bp.neurons.LIF(num_exc, **pars, method=method) + self.I = bp.neurons.LIF(num_inh, **pars, method=method) + self.E.V[:] = bm.random.randn(num_exc) * 2 - 55. + self.I.V[:] = bm.random.randn(num_inh) * 2 - 55. + + # synapses + we = 0.6 / scale # excitatory synaptic weight (voltage) + wi = 6.7 / scale # inhibitory synaptic weight + self.E2E = bp.synapses.Exponential(self.E, self.E[:100], bp.conn.FixedProb(0.02), + output=bp.synouts.COBA(E=0.), g_max=we, + tau=5., method=method) + self.E2I = bp.synapses.Exponential(self.E, self.I[:100], bp.conn.FixedProb(0.02), + output=bp.synouts.COBA(E=0.), g_max=we, + tau=5., method=method) + self.I2E = bp.synapses.Exponential(self.I, self.E[:100], bp.conn.FixedProb(0.02), + output=bp.synouts.COBA(E=-80.), g_max=wi, + tau=10., method=method) + self.I2I = bp.synapses.Exponential(self.I, self.I[:100], bp.conn.FixedProb(0.02), + output=bp.synouts.COBA(E=-80.), g_max=wi, + tau=10., method=method) + + bm.random.seed() + + net = EINet(scale=1., method='exp_auto') + # with JIT + runner = bp.DSRunner(net, monitors={'E.spike': net.E.spike}, + inputs=[(net.E.input, 20.), (net.I.input, 20.)]).run(1.) + + # without JIT + runner = bp.DSRunner(net, monitors={'E.spike': net.E.spike}, + inputs=[(net.E.input, 20.), (net.I.input, 20.)], jit=False).run(0.2) + + +class TestMemoryEfficient(unittest.TestCase): + pass + +# class TestMonitor(TestCase): +# def test_1d_array(self): +# try1 = TryGroup(monitors=['a']) +# try1.a = np.ones(1) +# try1.run(100.) +# +# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 1 +# assert np.allclose(np.arange(2, 1002).reshape((-1, 1)), try1.mon.a) +# +# def test_2d_array(): +# set(dt=0.1) +# try1 = TryGroup(monitors=['a']) +# try1.a = np.ones((2, 2)) +# try1.run(100.) +# +# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 4 +# series = np.arange(2, 1002).reshape((-1, 1)) +# series = np.repeat(series, 4, axis=1) +# assert np.allclose(series, try1.mon.a) +# +# def test_monitor_with_every(): +# set(dt=0.1) +# +# # try1: 2d array +# try1 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) +# try1.run(100.) +# assert np.ndim(try1.mon.a) == 2 and np.shape(try1.mon.a)[1] == 4 +# series = np.arange(2, 1002, 1. / 0.1).reshape((-1, 1)) +# series = np.repeat(series, 4, axis=1) +# assert np.allclose(series, try1.mon.a) +# +# # try2: 1d array +# try2 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) +# try2.a = np.array([1., 1.]) +# try2.run(100.) +# assert np.ndim(try2.mon.a) == 2 and np.shape(try2.mon.a)[1] == 2 +# series = np.arange(2, 1002, 1. / 0.1).reshape((-1, 1)) +# series = np.repeat(series, 2, axis=1) +# assert np.allclose(series, try2.mon.a) +# +# # try2: scalar +# try3 = TryGroup(monitors=Monitor(variables=['a'], every=[1.])) +# try3.a = 1. +# try3.run(100.) +# assert np.ndim(try3.mon.a) == 2 and np.shape(try3.mon.a)[1] == 1 +# series = np.arange(2, 1002, 1. 
/ 0.1).reshape((-1, 1))
+# assert np.allclose(series, try3.mon.a)
diff --git a/brainpy/_src/tools/progress.py b/brainpy/_src/tools/progress.py
new file mode 100644
index 000000000..13b6a1574
--- /dev/null
+++ b/brainpy/_src/tools/progress.py
@@ -0,0 +1,519 @@
+"""Python utilities for progress reporting (adapted from Keras)."""
+
+import binascii
+import codecs
+import importlib
+import marshal
+import os
+import re
+import sys
+import time
+import types as python_types
+
+import numpy as np
+
+
+# isort: off
+
+
+def func_dump(func):
+    """Serializes a user defined function.
+
+    Args:
+        func: the function to serialize.
+
+    Returns:
+        A tuple `(code, defaults, closure)`.
+    """
+    if os.name == "nt":
+        raw_code = marshal.dumps(func.__code__).replace(b"\\", b"/")
+        code = codecs.encode(raw_code, "base64").decode("ascii")
+    else:
+        raw_code = marshal.dumps(func.__code__)
+        code = codecs.encode(raw_code, "base64").decode("ascii")
+    defaults = func.__defaults__
+    if func.__closure__:
+        closure = tuple(c.cell_contents for c in func.__closure__)
+    else:
+        closure = None
+    return code, defaults, closure
+
+
+def func_load(code, defaults=None, closure=None, globs=None):
+    """Deserializes a user defined function.
+
+    Args:
+        code: bytecode of the function.
+        defaults: defaults of the function.
+        closure: closure of the function.
+        globs: dictionary of global objects.
+
+    Returns:
+        A function object.
+    """
+    if isinstance(code, (tuple, list)):  # unpack previous dump
+        code, defaults, closure = code
+        if isinstance(defaults, list):
+            defaults = tuple(defaults)
+
+    def ensure_value_to_cell(value):
+        """Ensures that a value is converted to a python cell object.
+
+        Args:
+            value: Any value that needs to be cast to the cell type
+
+        Returns:
+            A value wrapped as a cell object (see function "func_load")
+        """
+
+        def dummy_fn():
+            value  # just access it so it gets captured in .__closure__
+
+        cell_value = dummy_fn.__closure__[0]
+        if not isinstance(value, type(cell_value)):
+            return cell_value
+        return value
+
+    if closure is not None:
+        closure = tuple(ensure_value_to_cell(_) for _ in closure)
+    try:
+        raw_code = codecs.decode(code.encode("ascii"), "base64")
+    except (UnicodeEncodeError, binascii.Error):
+        raw_code = code.encode("raw_unicode_escape")
+    code = marshal.loads(raw_code)
+    if globs is None:
+        globs = globals()
+    return python_types.FunctionType(
+        code, globs, name=code.co_name, argdefs=defaults, closure=closure
+    )
+
+
+class Progbar:
+    """Displays a progress bar.
+
+    Args:
+        target: Total number of steps expected, None if unknown.
+        width: Progress bar width on screen.
+        verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
+        stateful_metrics: Iterable of string names of metrics that should *not*
+            be averaged over time. Metrics in this list will be displayed as-is.
+            All others will be averaged by the progbar before display.
+        interval: Minimum visual progress update interval (in seconds).
+        unit_name: Display name for step counts (usually "step" or "sample").
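+
+    Example (illustrative usage, added for clarity):
+        `bar = Progbar(target=100)`, then call `bar.update(step, values=[("loss", x)])`
+        once per step; metric values are averaged over steps unless their
+        names are listed in `stateful_metrics`.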
+ """ + + def __init__( + self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name="step", + ): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ( + (hasattr(sys.stdout, "isatty") and sys.stdout.isatty()) + or "ipykernel" in sys.modules + or "posix" in sys.modules + or "PYCHARM_HOSTED" in os.environ + ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + self._time_at_epoch_start = self._start + self._time_at_epoch_end = None + self._time_after_first_step = None + + def update(self, current, values=None, finalize=None): + """Updates the progress bar. + + Args: + current: Index of current step. + values: List of tuples: `(name, value_for_last_step)`. If `name` is + in `stateful_metrics`, `value_for_last_step` will be displayed + as-is. Else, an average of the metric over time will be + displayed. + finalize: Whether this is the last update for the progress bar. If + `None`, uses `current >= self.target`. Defaults to `None`. + """ + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in + # the first epoch, both on_batch_end and on_epoch_end will be + # called, which will cause 'current' and 'self._seen_so_far' to + # have the same value. Force the minimal value to 1 here, + # otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + message = "" + now = time.time() + info = f" - {now - self._start:.0f}s" + if current == self.target: + self._time_at_epoch_end = now + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + message += "\b" * prev_total_width + message += "\r" + else: + message += "\n" + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ("%" + str(numdigits) + "d/%d [") % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += "=" * (prog_width - 1) + if current < self.target: + bar += ">" + else: + bar += "=" + bar += "." 
* (self.width - prog_width) + bar += "]" + else: + bar = "%7d/Unknown" % current + + self._total_width = len(bar) + message += bar + + time_per_unit = self._estimate_step_duration(current, now) + + if self.target is None or finalize: + info += self._format_time(time_per_unit, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = "%d:%02d:%02d" % ( + eta // 3600, + (eta % 3600) // 60, + eta % 60, + ) + elif eta > 60: + eta_format = "%d:%02d" % (eta // 60, eta % 60) + else: + eta_format = "%ds" % eta + + info = f" - ETA: {eta_format}" + + for k in self._values_order: + info += f" - {k}:" + if isinstance(self._values[k], list): + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1]) + ) + if abs(avg) > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + else: + info += f" {self._values[k]}" + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += " " * (prev_total_width - self._total_width) + + if finalize: + info += "\n" + + message += info + print_msg(message, line_break=False) + message = "" + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ("%" + str(numdigits) + "d/%d") % (current, self.target) + info = count + info + for k in self._values_order: + info += f" - {k}:" + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1]) + ) + if avg > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + if self._time_at_epoch_end: + time_per_epoch = ( + self._time_at_epoch_end - self._time_at_epoch_start + ) + avg_time_per_step = time_per_epoch / self.target + self._time_at_epoch_start = now + self._time_at_epoch_end = None + info += " -" + self._format_time(time_per_epoch, "epoch") + info += " -" + self._format_time( + avg_time_per_step, self.unit_name + ) + info += "\n" + message += info + print_msg(message, line_break=False) + message = "" + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) + + def _format_time(self, time_per_unit, unit_name): + """format a given duration to display to the user. + + Given the duration, this function formats it in either milliseconds + or seconds and displays the unit (i.e. ms/step or s/epoch) + Args: + time_per_unit: the duration to display + unit_name: the name of the unit to display + Returns: + a string with the correctly formatted duration and units + """ + formatted = "" + if time_per_unit >= 1 or time_per_unit == 0: + formatted += f" {time_per_unit:.0f}s/{unit_name}" + elif time_per_unit >= 1e-3: + formatted += f" {time_per_unit * 1000.0:.0f}ms/{unit_name}" + else: + formatted += f" {time_per_unit * 1000000.0:.0f}us/{unit_name}" + return formatted + + def _estimate_step_duration(self, current, now): + """Estimate the duration of a single step. + + Given the step number `current` and the corresponding time `now` this + function returns an estimate for how long a single step takes. If this + is called before one step has been completed (i.e. `current == 0`) then + zero is given as an estimate. The duration estimate ignores the duration + of the (assumed to be non-representative) first step for estimates when + more steps are available (i.e. `current>1`). + + Args: + current: Index of current step. + now: The current time. + + Returns: Estimate of the duration of a single step. 
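+
+        Example (an illustrative note, added for clarity): if step 1 finished
+        at t = 2 s and step 5 finishes at t = 4 s, the estimate is
+        (4 - 2) / (5 - 1) = 0.5 s/step, i.e. the warm-up cost of the first
+        step is excluded whenever `current > 1`.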
+ """ + if current: + # there are a few special scenarios here: + # 1) somebody is calling the progress bar without ever supplying + # step 1 + # 2) somebody is calling the progress bar and supplies step one + # multiple times, e.g. as part of a finalizing call + # in these cases, we just fall back to the simple calculation + if self._time_after_first_step is not None and current > 1: + time_per_unit = (now - self._time_after_first_step) / ( + current - 1 + ) + else: + time_per_unit = (now - self._start) / current + + if current == 1: + self._time_after_first_step = now + return time_per_unit + else: + return 0 + + def _update_stateful_metrics(self, stateful_metrics): + self.stateful_metrics = self.stateful_metrics.union(stateful_metrics) + + +def make_batches(size, batch_size): + """Returns a list of batch indices (tuples of indices). + + Args: + size: Integer, total size of the data to slice into batches. + batch_size: Integer, batch size. + + Returns: + A list of tuples of array indices. + """ + num_batches = int(np.ceil(size / float(batch_size))) + return [ + (i * batch_size, min(size, (i + 1) * batch_size)) + for i in range(0, num_batches) + ] + + +def slice_arrays(arrays, start=None, stop=None): + """Slice an array or list of arrays. + + This takes an array-like, or a list of + array-likes, and outputs: + - arrays[start:stop] if `arrays` is an array-like + - [x[start:stop] for x in arrays] if `arrays` is a list + + Can also work on list/array of indices: `slice_arrays(x, indices)` + + Args: + arrays: Single array or list of arrays. + start: can be an integer index (start index) or a list/array of indices + stop: integer (stop index); should be None if `start` was a list. + + Returns: + A slice of the array(s). + + Raises: + ValueError: If the value of start is a list and stop is not None. + """ + if arrays is None: + return [None] + if isinstance(start, list) and stop is not None: + raise ValueError( + "The stop argument has to be None if the value of start " + f"is a list. Received start={start}, stop={stop}" + ) + elif isinstance(arrays, list): + if hasattr(start, "__len__"): + # hdf5 datasets only support list objects as indices + if hasattr(start, "shape"): + start = start.tolist() + return [None if x is None else x[start] for x in arrays] + return [ + None + if x is None + else None + if not hasattr(x, "__getitem__") + else x[start:stop] + for x in arrays + ] + else: + if hasattr(start, "__len__"): + if hasattr(start, "shape"): + start = start.tolist() + return arrays[start] + if hasattr(start, "__getitem__"): + return arrays[start:stop] + return [None] + + +def to_list(x): + """Normalizes a list/tensor into a list. + + If a tensor is passed, we return + a list of size 1 containing the tensor. + + Args: + x: target object to be normalized. + + Returns: + A list. + """ + if isinstance(x, list): + return x + return [x] + + +def to_snake_case(name): + intermediate = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + insecure = re.sub("([a-z])([A-Z])", r"\1_\2", intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. + if insecure[0] != "_": + return insecure + return "private" + insecure + + +def check_for_unexpected_keys(name, input_dict, expected_values): + unknown = set(input_dict.keys()).difference(expected_values) + if unknown: + raise ValueError( + f"Unknown entries in {name} dictionary: {list(unknown)}. 
" + f"Only expected following keys: {expected_values}" + ) + + +def validate_kwargs( + kwargs, allowed_kwargs, error_message="Keyword argument not understood:" +): + """Checks that all keyword arguments are in the set of allowed keys.""" + for kwarg in kwargs: + if kwarg not in allowed_kwargs: + raise TypeError(error_message, kwarg) + + +def default(method): + """Decorates a method to detect overrides in subclasses.""" + method._is_default = True + return method + + +def is_default(method): + """Check if a method is decorated with the `default` wrapper.""" + return getattr(method, "_is_default", False) + + +def populate_dict_with_module_objects(target_dict, modules, obj_filter): + for module in modules: + for name in dir(module): + obj = getattr(module, name) + if obj_filter(obj): + target_dict[name] = obj + + +class LazyLoader(python_types.ModuleType): + """Lazily import a module, mainly to avoid pulling in large dependencies.""" + + def __init__(self, local_name, parent_module_globals, name): + self._local_name = local_name + self._parent_module_globals = parent_module_globals + super().__init__(name) + + def _load(self): + """Load the module and insert it into the parent's globals.""" + # Import the target module and insert it into the parent's namespace + module = importlib.import_module(self.__name__) + self._parent_module_globals[self._local_name] = module + # Update this object's dict so that if someone keeps a reference to the + # LazyLoader, lookups are efficient (__getattr__ is only called on + # lookups that fail). + self.__dict__.update(module.__dict__) + return module + + def __getattr__(self, item): + module = self._load() + return getattr(module, item) + + +def print_msg(message, line_break=True): + """Print the message to absl logging or stdout.""" + if line_break: + sys.stdout.write(message + "\n") + else: + sys.stdout.write(message) + sys.stdout.flush() diff --git a/brainpy/errors.py b/brainpy/errors.py index e59bb326c..453c9c818 100644 --- a/brainpy/errors.py +++ b/brainpy/errors.py @@ -38,7 +38,12 @@ class AnalyzerError(BrainPyError): class PackageMissingError(BrainPyError): """The package missing error. """ - pass + + @classmethod + def by_purpose(cls, name, purpose): + err = (f'"{name}" must be installed when the user wants to use {purpose}. \n' + f'Please install through "pip install {name}".') + return cls(err) class BackendNotInstalled(BrainPyError): @@ -236,9 +241,5 @@ def __init__(self, name): ''') - - class SharedArgError(BrainPyError): pass - - diff --git a/brainpy/math/__init__.py b/brainpy/math/__init__.py index 02f671345..9a64f9f25 100644 --- a/brainpy/math/__init__.py +++ b/brainpy/math/__init__.py @@ -1,103 +1,102 @@ -# -*- coding: utf-8 -*- - -# data structure -from .ndarray import * -from .delayvars import * -from .interoperability import * -from .datatypes import * -from .compat_numpy import * -from .compat_tensorflow import * -from .compat_pytorch import * -from .einops import * - -# functions -from .activations import * -from . import activations - -# operators -from .pre_syn_post import * -from .op_register import * -from . import surrogate, event, sparse, jitconn - -# Variable and Objects for object-oriented JAX transformations -from .oo_transform import * - -# environment settings -from .modes import * -from .environment import * -from .scales import * -from .others import * - -# high-level numpy operations -from . import fft -from . import linalg -from . import random - -# taichi operations -from . import tifunc - -# others -from . 
import sharding - -import jax.numpy as jnp -from jax import config - -del jnp, config - -from brainpy._src.math.surrogate._compt import ( - spike_with_sigmoid_grad as spike_with_sigmoid_grad, - spike_with_linear_grad as spike_with_linear_grad, - spike_with_gaussian_grad as spike_with_gaussian_grad, - spike_with_mg_grad as spike_with_mg_grad, -) - -from brainpy._src.math import defaults -from brainpy._src.deprecations import deprecation_getattr -__deprecations = { - "sparse_matmul": ("brainpy.math.sparse_matmul is deprecated. Use brainpy.math.sparse.seg_matmul instead.", - sparse.seg_matmul), - 'csr_matvec': ("brainpy.math.csr_matvec is deprecated. Use brainpy.math.sparse.csrmv instead.", - sparse.csrmv), - 'event_matvec_prob_conn_homo_weight': ("brainpy.math.event_matvec_prob_conn_homo_weight is deprecated. " - "Use brainpy.math.jitconn.event_mv_prob_homo instead.", - jitconn.event_mv_prob_homo), - 'event_matvec_prob_conn_uniform_weight': ("brainpy.math.event_matvec_prob_conn_uniform_weight is deprecated. " - "Use brainpy.math.jitconn.event_mv_prob_uniform instead.", - jitconn.event_mv_prob_uniform), - 'event_matvec_prob_conn_normal_weight': ("brainpy.math.event_matvec_prob_conn_normal_weight is deprecated. " - "Use brainpy.math.jitconn.event_mv_prob_normal instead.", - jitconn.event_mv_prob_normal), - 'matvec_prob_conn_homo_weight': ("brainpy.math.matvec_prob_conn_homo_weight is deprecated. " - "Use brainpy.math.jitconn.mv_prob_homo instead.", - jitconn.mv_prob_homo), - 'matvec_prob_conn_uniform_weight': ("brainpy.math.matvec_prob_conn_uniform_weight is deprecated. " - "Use brainpy.math.jitconn.mv_prob_uniform instead.", - jitconn.mv_prob_uniform), - 'matvec_prob_conn_normal_weight': ("brainpy.math.matvec_prob_conn_normal_weight is deprecated. " - "Use brainpy.math.jitconn.mv_prob_normal instead.", - jitconn.mv_prob_normal), - 'cusparse_csr_matvec': ("brainpy.math.cusparse_csr_matvec is deprecated. " - "Use brainpy.math.sparse.csrmv instead.", - sparse.csrmv), - 'cusparse_coo_matvec': ("brainpy.math.cusparse_coo_matvec is deprecated. " - "Use brainpy.math.sparse.coomv instead.", - sparse.coomv), - 'coo_to_csr': ("brainpy.math.coo_to_csr is deprecated. " - "Use brainpy.math.sparse.coo_to_csr instead.", - sparse.coo_to_csr), - 'csr_to_coo': ("brainpy.math.csr_to_coo is deprecated. " - "Use brainpy.math.sparse.csr_to_coo instead.", - sparse.csr_to_coo), - 'csr_to_dense': ("brainpy.math.csr_to_dense is deprecated. " - "Use brainpy.math.sparse.csr_to_dense instead.", - sparse.csr_to_dense), - 'event_csr_matvec': ("brainpy.math.event_csr_matvec is deprecated. " - "Use brainpy.math.event.csr_to_dense instead.", - event.csrmv), - 'event_info': ("brainpy.math.event_info is deprecated. " - "Use brainpy.math.event.info instead.", - event.info), -} - -__getattr__ = deprecation_getattr(__name__, __deprecations, redirects=defaults.__all__, redirect_module=defaults) -del deprecation_getattr, defaults +# -*- coding: utf-8 -*- + +# data structure +from .ndarray import * +from .delayvars import * +from .interoperability import * +from .datatypes import * +from .compat_numpy import * +from .compat_tensorflow import * +from .compat_pytorch import * +from .einops import * + +# functions +from .activations import * +from . import activations + +# operators +from .pre_syn_post import * +from .op_register import * +from . 
import surrogate, event, sparse, jitconn + +# Variable and Objects for object-oriented JAX transformations +from .oo_transform import * + +# environment settings +from .modes import * +from .environment import * +from .scales import * +from .others import * + +# high-level numpy operations +from . import fft +from . import linalg +from . import random + +# taichi operations +from . import tifunc + +# others +from . import sharding + +import jax.numpy as jnp +from jax import config + +del jnp, config + +from brainpy._src.math.surrogate._compt import ( + spike_with_sigmoid_grad as spike_with_sigmoid_grad, + spike_with_linear_grad as spike_with_linear_grad, + spike_with_gaussian_grad as spike_with_gaussian_grad, + spike_with_mg_grad as spike_with_mg_grad, +) + +from brainpy._src.math import defaults +from brainpy._src.deprecations import deprecation_getattr +from brainpy._src.dependency_check import import_taichi, import_numba + +import_taichi(error_if_not_found=False) +import_numba(error_if_not_found=False) + +__deprecations = { + "sparse_matmul": ("brainpy.math.sparse_matmul is deprecated. Use brainpy.math.sparse.seg_matmul instead.", + sparse.seg_matmul), + 'csr_matvec': ("brainpy.math.csr_matvec is deprecated. Use brainpy.math.sparse.csrmv instead.", + sparse.csrmv), + 'event_matvec_prob_conn_homo_weight': ("brainpy.math.event_matvec_prob_conn_homo_weight is deprecated. " + "Use brainpy.math.jitconn.event_mv_prob_homo instead.", + jitconn.event_mv_prob_homo), + 'event_matvec_prob_conn_uniform_weight': ("brainpy.math.event_matvec_prob_conn_uniform_weight is deprecated. " + "Use brainpy.math.jitconn.event_mv_prob_uniform instead.", + jitconn.event_mv_prob_uniform), + 'event_matvec_prob_conn_normal_weight': ("brainpy.math.event_matvec_prob_conn_normal_weight is deprecated. " + "Use brainpy.math.jitconn.event_mv_prob_normal instead.", + jitconn.event_mv_prob_normal), + 'matvec_prob_conn_homo_weight': ("brainpy.math.matvec_prob_conn_homo_weight is deprecated. " + "Use brainpy.math.jitconn.mv_prob_homo instead.", + jitconn.mv_prob_homo), + 'matvec_prob_conn_uniform_weight': ("brainpy.math.matvec_prob_conn_uniform_weight is deprecated. " + "Use brainpy.math.jitconn.mv_prob_uniform instead.", + jitconn.mv_prob_uniform), + 'matvec_prob_conn_normal_weight': ("brainpy.math.matvec_prob_conn_normal_weight is deprecated. " + "Use brainpy.math.jitconn.mv_prob_normal instead.", + jitconn.mv_prob_normal), + 'cusparse_csr_matvec': ("brainpy.math.cusparse_csr_matvec is deprecated. " + "Use brainpy.math.sparse.csrmv instead.", + sparse.csrmv), + 'coo_to_csr': ("brainpy.math.coo_to_csr is deprecated. " + "Use brainpy.math.sparse.coo_to_csr instead.", + sparse.coo_to_csr), + 'csr_to_coo': ("brainpy.math.csr_to_coo is deprecated. " + "Use brainpy.math.sparse.csr_to_coo instead.", + sparse.csr_to_coo), + 'csr_to_dense': ("brainpy.math.csr_to_dense is deprecated. " + "Use brainpy.math.sparse.csr_to_dense instead.", + sparse.csr_to_dense), + 'event_csr_matvec': ("brainpy.math.event_csr_matvec is deprecated. 
" + "Use brainpy.math.event.csr_to_dense instead.", + event.csrmv), +} + +__getattr__ = deprecation_getattr(__name__, __deprecations, redirects=defaults.__all__, redirect_module=defaults) +del deprecation_getattr, defaults diff --git a/brainpy/math/event.py b/brainpy/math/event.py index 0a17cae7c..02e98b8f3 100644 --- a/brainpy/math/event.py +++ b/brainpy/math/event.py @@ -1,5 +1,3 @@ - from brainpy._src.math.event import ( csrmv as csrmv, - info as info, ) diff --git a/brainpy/math/jitconn.py b/brainpy/math/jitconn.py index 90a028b7e..a87d27d58 100644 --- a/brainpy/math/jitconn.py +++ b/brainpy/math/jitconn.py @@ -1,10 +1,10 @@ -from brainpy._src.math.jitconn import ( - event_mv_prob_homo as event_mv_prob_homo, - event_mv_prob_uniform as event_mv_prob_uniform, - event_mv_prob_normal as event_mv_prob_normal, - - mv_prob_homo as mv_prob_homo, - mv_prob_uniform as mv_prob_uniform, - mv_prob_normal as mv_prob_normal, -) - +from brainpy._src.math.jitconn import ( + event_mv_prob_homo as event_mv_prob_homo, + event_mv_prob_uniform as event_mv_prob_uniform, + event_mv_prob_normal as event_mv_prob_normal, + + mv_prob_homo as mv_prob_homo, + mv_prob_uniform as mv_prob_uniform, + mv_prob_normal as mv_prob_normal, +) + diff --git a/brainpy/math/op_register.py b/brainpy/math/op_register.py index a48268ef4..c0fcb67ae 100644 --- a/brainpy/math/op_register.py +++ b/brainpy/math/op_register.py @@ -1,14 +1,12 @@ -# -*- coding: utf-8 -*- - - -from brainpy._src.math.op_register import ( - CustomOpByNumba, - compile_cpu_signature_with_numba, - clean_caches, - check_kernels_count, -) - -from brainpy._src.math.op_register.base import XLACustomOp -from brainpy._src.math.op_register.ad_support import defjvp - - +# -*- coding: utf-8 -*- +from brainpy._src.math.op_register import ( + CustomOpByNumba, + compile_cpu_signature_with_numba, + clean_caches, + check_kernels_count, +) + +from brainpy._src.math.op_register.base import XLACustomOp +from brainpy._src.math.op_register.ad_support import defjvp + + diff --git a/brainpy/math/sparse.py b/brainpy/math/sparse.py index 1380a9e9c..aa86679ec 100644 --- a/brainpy/math/sparse.py +++ b/brainpy/math/sparse.py @@ -1,8 +1,9 @@ -from brainpy._src.math.sparse import ( - csrmv, - coomv, +from brainpy._src.math.sparse import ( seg_matmul, +) +from brainpy._src.math.sparse import ( + csrmv, csr_to_dense as csr_to_dense, csr_to_coo as csr_to_coo, diff --git a/brainpy/math/surrogate.py b/brainpy/math/surrogate.py index 0121bddec..bf7897435 100644 --- a/brainpy/math/surrogate.py +++ b/brainpy/math/surrogate.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- - from brainpy._src.math.surrogate._one_input_new import ( + Surrogate, + Sigmoid, sigmoid as sigmoid, diff --git a/brainpy/math/tifunc.py b/brainpy/math/tifunc.py index 63f3cbe45..bea49c220 100644 --- a/brainpy/math/tifunc.py +++ b/brainpy/math/tifunc.py @@ -1,26 +1,25 @@ -# -*- coding: utf-8 -*- - -from brainpy._src.math.tifunc import ( - taichi_lcg_rand, - - # warp reduction primitives - warp_reduce_sum, - - # random number generator - lfsr88_key, - lfsr88_next_key, - lfsr88_normal, - lfsr88_randn, - lfsr88_random_integers, - lfsr88_randint, - lfsr88_uniform, - lfsr88_rand, - lfsr113_key, - lfsr113_next_key, - lfsr113_normal, - lfsr113_randn, - lfsr113_random_integers, - lfsr113_randint, - lfsr113_uniform, - lfsr113_rand -) +# -*- coding: utf-8 -*- + +from brainpy._src.math.tifunc import ( + + # warp reduction primitives + warp_reduce_sum, + + # random number generator + lfsr88_key, + lfsr88_next_key, + lfsr88_normal, + lfsr88_randn, 
+ lfsr88_random_integers, + lfsr88_randint, + lfsr88_uniform, + lfsr88_rand, + lfsr113_key, + lfsr113_next_key, + lfsr113_normal, + lfsr113_randn, + lfsr113_random_integers, + lfsr113_randint, + lfsr113_uniform, + lfsr113_rand +) diff --git a/docs/quickstart/installation.rst b/docs/quickstart/installation.rst index 2e0bb1905..46ce3822f 100644 --- a/docs/quickstart/installation.rst +++ b/docs/quickstart/installation.rst @@ -10,285 +10,71 @@ Installation Linux, and MacOS. It only relies on Python libraries. -Installation with pip ---------------------- +Minimum requirements +-------------------- -You can install ``BrainPy`` from the `pypi `_. -To do so, use: +To install brainpy with minimum requirements (only depends on ``jax``), you can use: .. code-block:: bash - pip install brainpy - -To update the latest BrainPy, you can use - -.. code-block:: bash - - pip install -U brainpy - - -If you want to install the pre-release version (the latest development version) -of BrainPy, you can use: - -.. code-block:: bash - - pip install --pre brainpy - - - -Installation from source ------------------------- - -If you decide not to use ``pip``, you can install ``BrainPy`` from -`GitHub `_, -or `OpenI `_. - -To do so, use: - -.. code-block:: bash - - pip install git+https://github.com/PKU-NIP-Lab/BrainPy + pip install brainpy[cpu_mini] # for CPU # or - pip install git+https://git.openi.org.cn/OpenI/BrainPy + pip install brainpy[cuda_mini] -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html # for GPU (Linux only) -Dependency 1: NumPy --------------------------------- -In order to make BrainPy work normally, users should install -several dependent Python packages. +CPU with all dependencies +------------------------- -The basic function of ``BrainPy`` only relies on `NumPy`_, which is very -easy to install through ``pip`` or ``conda``: +To install a CPU-only version of BrainPy, which might be useful for doing local development on a laptop, you can run .. code-block:: bash - pip install numpy - - # or - - conda install numpy - -Dependency 2: JAX ------------------ - -BrainPy relies on `JAX`_. JAX is a high-performance JIT compiler which enables -users to run Python code on CPU, GPU, and TPU devices. Core functionalities of -BrainPy (>=2.0.0) have been migrated to the JAX backend. - -Linux -^^^^^ - -Currently, JAX supports **Linux** (Ubuntu 16.04 or later) and **macOS** (10.12 or -later) platforms. The provided binary releases of `jax` and `jaxlib` for Linux and macOS -systems are available at + pip install brainpy[cpu] -- for CPU: https://storage.googleapis.com/jax-releases/jax_releases.html -- for GPU: https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -If you want to install a CPU-only version of `jax` and `jaxlib`, you can run +GPU with all dependencies +------------------------- -.. code-block:: bash - - pip install --upgrade "jax[cpu]" - -If you want to install JAX with both CPU and NVidia GPU support, you must first install -`CUDA`_ and `CuDNN`_, if they have already been installed. Next, run +BrainPy supports NVIDIA GPUs that have SM version 5.2 (Maxwell) or newer. +To install a GPU-only version of BrainPy, you can run .. code-block:: bash - # CUDA 12 installation - # Note: wheels only available on linux. - pip install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - # CUDA 11 installation - # Note: wheels only available on linux. 
- pip install --upgrade "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - -In the event of a version mismatch error with JAX, such as encountering an error message like: - -.. code-block:: text + pip install brainpy[cuda12] -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html # for CUDA 12.0 + pip install brainpy[cuda11] -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html # for CUDA 11.0 - CUDA backend failed to initialize: Found CUDA version 12000, but JAX was built against version 12020, which is newer. The copy of CUDA that is installed must be at least as new as the version against which JAX was built. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.) -You will need to employ an alternative installation method that aligns with your environment's CUDA version. This can be achieved using the following commands: -.. code-block:: bash +``brainpylib`` +-------------- - # CUDA 12 installation - pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - # CUDA 11 installation - pip install --upgrade "jax[cuda11_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +``brainpylib`` defines a set of useful operators for building and simulating spiking neural networks. -Alternatively, you can download the preferred release ".whl" file for jaxlib -from the above release links, and install it via ``pip``: +To install the ``brainpylib`` package on CPU devices, you can run .. code-block:: bash - pip install xxx-0.4.15-xxx.whl - - pip install jax==0.4.15 - -.. note:: - - Note that the versions of jaxlib and jax should be consistent. - - For example, if you are using jax==0.4.15, you would better install jax==0.4.15. - + pip install brainpylib -MacOS -^^^^^ -If you are using macOS Intel, we recommend you first to install the Miniconda Intel installer: +To install the ``brainpylib`` package on CUDA 11, you can run -1. Download the package in the link https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.pkg -2. Then click the downloaded package and install it. - - -If you are using the latest M1 macOS version, you'd better to install the Miniconda M1 installer: - - -1. Download the package in the link https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.pkg -2. Then click the downloaded package and install it. - - -Finally, you can install `jax` and `jaxlib` as the same as the Linux platform. .. code-block:: bash - pip install --upgrade "jax[cpu]" - - - -Windows -^^^^^^^ - -For **Windows** users with Python >= 3.9, `jax` and `jaxlib` can be installed -directly from the PyPi channel. - -.. code-block:: bash + pip install brainpylib-cu11x - pip install jax jaxlib +To install the ``brainpylib`` package on CUDA 12, you can run -For **Windows** users with Python <= 3.8, `jax` and `jaxlib` can be installed -from the community supports. Specifically, you can install `jax` and `jaxlib` through: .. code-block:: bash - pip install "jax[cpu]" -f https://whls.blob.core.windows.net/unstable/index.html - -If you are using GPU, you can install GPU-versioned wheels through: - -.. code-block:: bash - - pip install "jax[cuda111]" -f https://whls.blob.core.windows.net/unstable/index.html - -Alternatively, you can manually install you favourite version of `jax` and `jaxlib` by -downloading binary releases of JAX for Windows from -https://whls.blob.core.windows.net/unstable/index.html . -Then install it via ``pip``: - -.. 
code-block:: bash - - pip install xxx-0.4.15-xxx.whl - - pip install jax==0.4.15 - -WSL -^^^ - -Moreover, for Windows 10+ system, we recommend using `Windows Subsystem for Linux (WSL)`_. -The installation guide can be found in -`WSL Installation Guide for Windows 10/11 `_. -Then, you can install JAX in WSL just like the installation step in Linux/MacOs. - - -Dependency 3: brainpylib ------------------------- - -Many customized operators in BrainPy are implemented in ``brainpylib``. -``brainpylib`` can also be installed from pypi according to your devices. -For windows, Linux and MacOS users, ``brainpylib`` supports CPU operators. -You can install CPU-version `brainpylib` by: - -.. code-block:: bash - - # CPU installation - pip install --upgrade brainpylib - -For Nvidia GPU users, ``brainpylib`` only support Linux system and WSL2 subsystem. You can install the CUDA-version by using: - -.. code-block:: bash - - # CUDA 12 installation - pip install --upgrade brainpylib-cu12x - -.. code-block:: bash - - # CUDA 11 installation - pip install --upgrade brainpylib-cu11x - -Dependency 4: taichi ------------------------- -Now BrainPy supports customized operators implemented in `taichi`_. You can install the latest version of `taichi`_ by: - -.. code-block:: bash - - pip install -i https://pypi.taichi.graphics/simple/ taichi-nightly - -.. _taichi: https://www.taichi-lang.org - -And you can try it in the `operator custom with taichi <../tutorial_advanced/operator_custom_with_taichi.html>`_ tutorial page -Attention: customized operators is still in the experimental stage. If you meet any problems, please contact us through the issue page. - -Running BrainPy with docker ------------------------- - -If you want to use BrainPy in docker, you can use the following command to pull the docker image: - -.. code:: bash - - docker pull brainpy/brainpy:latest - -You can then run the docker image by: - -.. code:: bash - - docker run -it --platform linux/amd64 brainpy/brainpy:latest - -Please notice that BrainPy docker image is based on the `ubuntu22.04` image, so it only support CPU version of BrainPy. - - -Running BrainPy online with binder ----------------------------------- - -Click on the following link to launch the Binder environment with the -BrainPy repository: - -|image1| - -Wait for the Binder environment to build. This might take a few moments. - -Once the environment is ready, you'll be redirected to a Jupyter -notebook interface within your web browser. - -.. |image1| image:: https://camo.githubusercontent.com/581c077bdbc6ca6899c86d0acc6145ae85e9d80e6f805a1071793dbe48917982/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667 - :target: https://mybinder.org/v2/gh/brainpy/BrainPy-binder/main - - -.. _NumPy: https://numpy.org/ -.. _Matplotlib: https://matplotlib.org/ -.. _JAX: https://github.com/google/jax -.. _Windows Subsystem for Linux (WSL): https://docs.microsoft.com/en-us/windows/wsl/about -.. _build JAX from source: https://jax.readthedocs.io/en/latest/developer.html -.. _SymPy: https://github.com/sympy/sympy -.. _Numba: https://numba.pydata.org/ -.. _CUDA: https://developer.nvidia.com/cuda-downloads -.. 
_CuDNN: https://developer.nvidia.com/CUDNN + pip install brainpylib-cu12x diff --git a/examples/dynamics_training/integrator_rnn.py b/examples/dynamics_training/integrator_rnn.py index fc36845e6..aeaf0c412 100644 --- a/examples/dynamics_training/integrator_rnn.py +++ b/examples/dynamics_training/integrator_rnn.py @@ -30,7 +30,7 @@ def train_data(): class RNN(bp.DynamicalSystem): def __init__(self, num_in, num_hidden): super(RNN, self).__init__() - self.rnn = bp.layers.RNNCell(num_in, num_hidden, train_state=True) + self.rnn = bp.dyn.RNNCell(num_in, num_hidden, train_state=True) self.out = bp.layers.Dense(num_hidden, 1) def update(self, x): @@ -49,7 +49,7 @@ def loss(predictions, targets, l2_reg=2e-4): # define optimizer -lr = bp.optim.ExponentialDecay(lr=0.025, decay_steps=1, decay_rate=0.99975) +lr = bp.optim.ExponentialDecayLR(lr=0.025, decay_steps=1, decay_rate=0.99975) opt = bp.optim.Adam(lr=lr, eps=1e-1) # create a trainer diff --git a/requirements-dev-raw.txt b/requirements-dev-raw.txt new file mode 100644 index 000000000..99361efa9 --- /dev/null +++ b/requirements-dev-raw.txt @@ -0,0 +1,12 @@ +numpy +jax +jaxlib +matplotlib +msgpack +tqdm +pathos + + +# test requirements +pytest +absl-py diff --git a/requirements-dev.txt b/requirements-dev.txt index 0e475e83d..98398ae2d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ numpy -numba brainpylib jax jaxlib @@ -7,7 +6,9 @@ matplotlib msgpack tqdm pathos -taichi==1.7.0 +taichi +numba + # test requirements pytest diff --git a/requirements.txt b/requirements.txt index 02fdebe83..ab5665e73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ numpy jax tqdm -numba -taichi==1.7.0 diff --git a/setup.py b/setup.py index d7fd45e38..885bbf57b 100644 --- a/setup.py +++ b/setup.py @@ -56,8 +56,8 @@ author='BrainPy Team', author_email='chao.brain@qq.com', packages=packages, - python_requires='>=3.8', - install_requires=['numpy>=1.15', 'jax>=0.4.13', 'tqdm', 'numba', 'taichi==1.7.0'], + python_requires='>=3.9', + install_requires=['numpy>=1.15', 'jax>=0.4.13', 'tqdm'], url='https://github.com/brainpy/BrainPy', project_urls={ "Bug Tracker": "https://github.com/brainpy/BrainPy/issues", @@ -68,11 +68,12 @@ 'https://storage.googleapis.com/jax-releases/jax_cuda_releases.html', ], extras_require={ - 'cpu': ['jaxlib>=0.4.13', 'brainpylib'], - 'cuda': ['jax[cuda]', 'brainpylib-cu12x'], - 'cuda11': ['jax[cuda11_local]', 'brainpylib-cu11x'], - 'cuda12': ['jax[cuda12_local]', 'brainpylib-cu12x'], - 'tpu': ['jax[tpu]'], + 'cpu': ['jaxlib>=0.4.13', 'brainpylib', 'numba', 'taichi==1.7.0'], + 'cuda11': ['jaxlib[cuda11_pip]', 'brainpylib-cu11x', 'numba', 'taichi==1.7.0'], + 'cuda12': ['jaxlib[cuda12_pip]', 'brainpylib-cu12x', 'numba', 'taichi==1.7.0'], + 'tpu': ['jaxlib[tpu]', 'numba',], + 'cpu_mini': ['jaxlib>=0.4.13'], + 'cuda_mini': ['jaxlib[cuda12_pip]'], }, keywords=('computational neuroscience, ' 'brain-inspired computation, ' @@ -89,6 +90,7 @@ 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Topic :: Scientific/Engineering :: Bio-Informatics',
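Taken together, the packaging changes above (optional `taichi`/`numba` extras, the
`import_taichi` / `import_numba` helpers, and `PackageMissingError.by_purpose`) move
dependency checking to call time. A minimal sketch of the intended pattern, assuming a
hypothetical operator-building function (`build_my_kernel` is illustrative; the imports
are the ones added in this diff):

    from brainpy._src.dependency_check import import_taichi
    from brainpy.errors import PackageMissingError

    def build_my_kernel():
        # Returns None instead of raising when taichi is absent.
        ti = import_taichi(error_if_not_found=False)
        if ti is None:
            # Raises: '"taichi" must be installed when the user wants to use
            # customized operators. ...'
            raise PackageMissingError.by_purpose('taichi', purpose='customized operators')
        # ... define and return the taichi kernel here ...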