diff --git a/ding/policy/base_policy.py b/ding/policy/base_policy.py index 2b5f13a541..3ff99c7b43 100644 --- a/ding/policy/base_policy.py +++ b/ding/policy/base_policy.py @@ -25,7 +25,7 @@ def default_config(cls: type) -> EasyDict: Overview: Get the default config of policy. This method is used to create the default config of policy. Returns: - cfg (:obj:`EasyDict`): The default config of corresponding policy. For the derived policy class, \ + - cfg (:obj:`EasyDict`): The default config of corresponding policy. For the derived policy class, \ it will recursively merge the default config of base class and its own default config. .. tip:: @@ -196,16 +196,17 @@ def hook(*ignore): def _create_model(self, cfg: EasyDict, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: """ Overview: - Create neural network model according to input configures and model. If the input model is None, then \ - the model will be created according to ``default_model`` method and ``cfg.model`` field. Otherwise, the \ - model will be set to the ``model`` instance created by outside caller. + Create or validate the neural network model according to input configures and model. If the input model is \ + None, then the model will be created according to ``default_model`` method and ``cfg.model`` field. \ + Otherwise, the model will be verified as an instance of ``torch.nn.Module`` and set to the ``model`` \ + instance created by outside caller. Arguments: - cfg (:obj:`EasyDict`): The final merged config used to initialize policy. - model (:obj:`torch.nn.Module`): The neural network model used to initialize policy. User can refer to \ the default model defined in corresponding policy to customize its own model. Returns: - - model (:obj:`torch.nn.Module`): The created neural network model. Then different modes of policy will \ - add wrappers and plugins to the model, which is used to train, collect and evaluate. + - model (:obj:`torch.nn.Module`): The created neural network model. The different modes of policy will \ + add distinct wrappers and plugins to the model, which is used to train, collect and evaluate. Raises: - RuntimeError: If the input model is not None and is not an instance of ``torch.nn.Module``. """ diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index a4dd5dc4bf..2e253370b8 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -438,7 +438,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For DDPG, it contains obs, next_obs, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 6f121e3018..d1f6fdbb49 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -403,10 +403,10 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, in ``self._forward_learn`` method. Arguments: - transitions (:obj:`List[Dict[str, Any]`): The trajectory data (a list of transition), each element is \ - the same format as the return value of ``self._process_transition`` method. + in the same format as the return value of ``self._process_transition`` method. 
Returns: - - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is the similar format \ - as input transitions, but may contain more data for training, such as nstep reward and target obs. + - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is similar in format \ + to input transitions, but may contain more data for training, such as nstep reward and target obs. """ transitions = get_nstep_return_data(transitions, self._nstep, gamma=self._gamma) return get_train_sample(transitions, self._unroll_len) @@ -415,7 +415,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For DQN, it contains obs, next_obs, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -539,6 +539,7 @@ class DQNSTDIMPolicy(DQNPolicy): """ Overview: Policy class of DQN algorithm, extended by ST-DIM auxiliary objectives. + ST-DIM paper link: https://arxiv.org/abs/1906.08226. Config: == ==================== ======== ============== ======================================== ======================= ID Symbol Type Default Value Description Other(Shape) diff --git a/ding/policy/dt.py b/ding/policy/dt.py index c630c8949d..145b11f97c 100644 --- a/ding/policy/dt.py +++ b/ding/policy/dt.py @@ -6,7 +6,6 @@ from ding.torch_utils import to_device from ding.utils import POLICY_REGISTRY from ding.utils.data import default_decollate -from ding.torch_utils import one_hot from .base_policy import Policy @@ -56,8 +55,20 @@ def default_model(self) -> Tuple[str, List[str]]: def _init_learn(self) -> None: """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. For Decision Transformer, \ + it mainly contains the optimizer, algorithm-specific arguments such as rtg_scale and lr scheduler. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ # rtg_scale: scale of `return to go` # rtg_target: max target of `return to go` @@ -92,14 +103,26 @@ def _init_learn(self) -> None: self.max_env_score = -1.0 - def _forward_learn(self, data: list) -> Dict[str, Any]: + def _forward_learn(self, data: List[torch.Tensor]) -> Dict[str, Any]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the offline dataset and then returns the output \ + result, including various training information such as loss, current learning rate. 
Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs'] + - data (:obj:`List[torch.Tensor]`): The input data used for policy forward, including a series of \ + processed torch.Tensor data, i.e., timesteps, states, actions, returns_to_go, traj_mask. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data types that are not supported, the main reason is that the corresponding model does not \ + support them. You can implement your own model rather than using the default model. For more \ + information, please raise an issue in GitHub repo and we will continue to follow up. + + """ self._learn_model.train() @@ -156,7 +179,18 @@ def _forward_learn(self, data: list) -> Dict[str, Any]: def _init_eval(self) -> None: """ Overview: - Evaluate mode init method. Called by ``self.__init__``, initialize eval_model. + Initialize the eval mode of policy, including related attributes and modules. For Decision Transformer, \ + it contains the eval model and some algorithm-specific parameters such as context_len, max_eval_ep_len, etc. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. tip:: + For the evaluation of complete episodes, we need to maintain some historical information for transformer \ + inference. These variables need to be initialized in ``_init_eval`` and reset in ``_reset_eval`` when \ + necessary. + + .. note:: + If you want to set some special member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ self._eval_model = self._model # init data diff --git a/ding/policy/impala.py b/ding/policy/impala.py index eb19ca9b87..46adeb1204 100644 --- a/ding/policy/impala.py +++ b/ding/policy/impala.py @@ -79,9 +79,9 @@ class IMPALAPolicy(Policy): c_clip_ratio=1.0, # (float) clip ratio of importance sampling. rho_pg_clip_ratio=1.0, - # (str) The gradient clip operation type used in PPO, ['clip_norm', clip_value', 'clip_momentum_norm']. + # (str) The gradient clip operation type used in IMPALA, ['clip_norm', 'clip_value', 'clip_momentum_norm']. grad_clip_type=None, - # (float) The gradient clip target value used in PPO. + # (float) The gradient clip target value used in IMPALA. # If ``grad_clip_type`` is 'clip_norm', then the maximum of gradient will be normalized to this value. clip_value=0.5, # (str) Optimizer used to train the network, ['adam', 'rmsprop']. @@ -120,12 +120,24 @@ def default_model(self) -> Tuple[str, List[str]]: return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Initialize the optimizer, algorithm config and main model. + Initialize the learn mode of policy, including related attributes and modules. For IMPALA, it mainly \ + contains the optimizer, algorithm-specific arguments such as loss weight and gamma, and the main (learn) \ + model. This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + ..
note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ - assert self._cfg.action_space in ["continuous", "discrete"] + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space # Optimizer optim_type = self._cfg.learn.optim @@ -166,8 +178,8 @@ def _data_preprocess_learn(self, data: List[Dict[str, Any]]): Convert list trajectory data to to trajectory data, which is a dict of tensors. Arguments: - data (:obj:`List[Dict[str, Any]]`): List type data, a list of data for training. Each list element is a \ - dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least 'obs', \ - 'next_obs', 'logit', 'action', 'reward', 'done' + dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least \ + 'obs', 'next_obs', 'logit', 'action', 'reward', 'done' Returns: - data (:obj:`dict`): Dict type data. Values are torch.Tensor or np.ndarray or dict/list combinations. \ ReturnsKeys: @@ -208,20 +220,31 @@ def _data_preprocess_learn(self, data: List[Dict[str, Any]]): def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Overview: - Forward computation graph of learn mode(updating policy). + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss and current learning rate. Arguments: - - data (:obj:`List[Dict[str, Any]]`): List type data, a list of data for training. Each list element is a \ - dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least 'obs',\ - 'next_obs', 'logit', 'action', 'reward', 'done' + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For IMPALA, each element in list is a dict containing at least the following keys: ``obs``, \ + ``action``, ``logit``, ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys such \ + as ``weight``. Returns: - - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ - recorded in text log and tensorboard, values are python scalar or a list of scalars. 
- ArgumentsKeys: - - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` - - optional: 'collect_iter', 'replay_unique_id', 'replay_buffer_idx', 'priority', 'staleness', 'use', 'IS' - ReturnsKeys: - - necessary: ``cur_lr``, ``total_loss``, ``policy_loss`,``value_loss``,``entropy_loss`` - - optional: ``priority`` + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data = self._data_preprocess_learn(data) # ==================== @@ -267,8 +290,7 @@ def _reshape_data(self, output: Dict[str, Any], data: Dict[str, Any]) -> Tuple: - data (:obj:`Dict[int, Any]`): Dict type data, input of policy._forward_learn Values are torch.Tensor or \ np.ndarray or dict/list combinations. Keys includes at least ['logit', 'action', 'reward', 'done']. Returns: - - data (:obj:`Tuple[Any]`): Tuple of target_logit, behaviour_logit, actions, \ - values, rewards, weights + - data (:obj:`Tuple[Any]`): Tuple of target_logit, behaviour_logit, actions, values, rewards, weights. ReturnsShapes: - target_logit (:obj:`torch.FloatTensor`): :math:`((T+1), B, Obs_Shape)`, where T is timestep,\ B is batch size and Obs_Shape is the shape of single env observation. @@ -300,10 +322,16 @@ def _reshape_data(self, output: Dict[str, Any], data: Dict[str, Any]) -> Tuple: return target_logit, behaviour_logit, actions, values, rewards, weights def _init_collect(self) -> None: - r""" + """ Overview: - Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model. - Use multinomial_sample to choose action. + Initialize the collect mode of policy, including related attributes and modules. For IMPALA, it contains \ + the collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space @@ -314,18 +342,32 @@ def _init_collect(self) -> None: self._collect_model.reset() - def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Dict[str, Any]]: + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: """ Overview: - Forward computation graph of collect mode(collect training data). + Policy forward function of collect mode (collecting training data by interacting with envs). 
Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. Arguments: - - data (:obj:`Dict[int, Any]`): Dict type data, stacked env data for predicting \ - action, values are torch.Tensor or np.ndarray or dict/list combinations,keys \ - are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Dict[str,Any]]`): Dict of predicting policy_output(logit, action) for each env. - ReturnsKeys - - necessary: ``logit``, ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data (action logit and value) for learn mode defined in ``self._process_transition`` \ + method. The key of the dict is the same as the input data, i.e. environment id. + + .. tip:: + If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \ + related data as extra keyword arguments of this method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -358,7 +400,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For IMPALA, it contains obs, next_obs, action, reward, done, logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -381,12 +423,17 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. return transition def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``, initialize eval_model, - and use argmax_sample to choose action. + Initialize the eval mode of policy, including related attributes and modules. For IMPALA, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. 
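+ 
+     Examples (a minimal illustrative sketch of what this method prepares; the wrapper names below follow \
+     the surrounding code and the previous version of this docstring, not a fixed API contract):
+ 
+     .. code-block:: python
+ 
+         # discrete action space: evaluate with greedy (argmax) action selection
+         self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
+         # continuous action space: evaluate with the deterministic action of the policy head
+         self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
+         self._eval_model.reset()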
""" - assert self._cfg.action_space in ["continuous", "discrete"] + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -398,17 +445,26 @@ def _init_eval(self) -> None: def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: """ Overview: - Forward computation graph of eval mode(evaluate policy performance), at most cases, it is similar to \ - ``self._forward_collect``. + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. ``_forward_eval`` in IMPALA often uses deterministic sample to get \ + actions while ``_forward_collect`` usually uses stochastic sample method for balance exploration and \ + exploitation. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. - ReturnsKeys - - necessary: ``action`` - - optional: ``logit`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e. environment id. + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) diff --git a/ding/policy/pdqn.py b/ding/policy/pdqn.py index 33887bff9e..6b66e263ab 100644 --- a/ding/policy/pdqn.py +++ b/ding/policy/pdqn.py @@ -444,7 +444,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PDQN, it contains obs, next_obs, action, reward, done and logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index e717ea23e2..9ebd7b0f73 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -112,8 +112,21 @@ def default_model(self) -> Tuple[str, List[str]]: def _init_learn(self) -> None: """ Overview: - Learn mode init method. Called by ``self.__init__``. 
Initialize the optimizer, algorithm config and \ - the learn model, execute special network initialization and value running mean and std. + Initialize the learn mode of policy, including related attributes and modules. For PPO, it mainly contains \ + optimizer, algorithm-specific arguments such as loss weight, clip_ratio and recompute_adv. This method \ + also executes some special network initializations and prepares running mean/std monitor for value. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight @@ -172,16 +185,40 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac, approx_kl. Arguments: - - data (:obj:`dict`): Dict type data + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \ + collected training samples for on-policy algorithms like PPO. For each element in list, the key of the \ + dict is the name of data items and the value is the corresponding data. Usually, the value is \ + torch.Tensor or np.ndarray or there dict/list combinations. In the ``_forward_learn`` method, data \ + often need to first be stacked in the batch dimension by some utility functions such as \ + ``default_preprocess_learn``. \ + For PPO, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ + ``reward``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys such as ``weight``. Returns: - - info_dict (:obj:`List[Dict[str, Any]]`): Return a list of information, each element is a Dict type data, \ - a info dict indicated training result, which will be recorded in text log and tensorboard, values are \ - python scalar or a list of scalars. + - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicated training result, each \ + training iteration contains append a information dict into the final list. The list will be precessed \ + and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \ + scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. tip:: + The training procedure of PPO is two for loops. The outer loop trains all the collected training samples \ + with ``epoch_per_collect`` epochs. The inner loop splits all the data into different mini-batch with \ + the length of ``batch_size``. + + .. 
note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to our unittest for PPOPolicy: ``ding.policy.tests.test_ppo``. """ data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False) if self._cuda: @@ -302,13 +339,23 @@ def _forward_learn(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: return return_infos def _init_collect(self) -> None: - r""" + """ Overview: - Collect mode init method. Called by ``self.__init__``. - Init traj and unroll length, collect model. + Initialize the collect mode of policy, including related attributes and modules. For PPO, it contains the \ + collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPO. \ + This design is for the convenience of parallel execution of different policy modes. """ self._unroll_len = self._cfg.collect.unroll_len - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + assert self._cfg.action_space in ["continuous", "discrete", "hybrid"], self._cfg.action_space self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') @@ -364,7 +411,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPO, it contains obs, next_obs, action, reward, done, logit, value. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -447,10 +494,15 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, return get_train_sample(data, self._unroll_len) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For PPO, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. 
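+ 
+     Examples (an illustrative sketch of the eval data contract served by the model initialized here; the \
+     observation shape and the ``eval_mode.forward`` entry point are assumptions made only for this example):
+ 
+     .. code-block:: python
+ 
+         # eval data is a dict keyed by environment id, one observation per env
+         obs = {0: torch.randn(4), 1: torch.randn(4)}
+         output = policy.eval_mode.forward(obs)
+         # the output is keyed by the same env ids and contains at least the action
+         assert set(output.keys()) == {0, 1} and 'action' in output[0]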
""" assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space @@ -526,7 +578,7 @@ def _monitor_vars_learn(self) -> List[str]: class PPOPGPolicy(Policy): """ Overview: - Policy class of on policy version PPO algorithm (pure policy gradient). + Policy class of on policy version PPO algorithm (pure policy gradient without value network). Paper link: https://arxiv.org/abs/1707.06347. """ config = dict( @@ -588,7 +640,25 @@ def default_model(self) -> Tuple[str, List[str]]: return 'pg', ['ding.model.template.pg'] def _init_learn(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the learn mode of policy, including related attributes and modules. For PPOPG, it mainly \ + contains optimizer, algorithm-specific arguments such as loss weight and clip_ratio. This method \ + also executes some special network initializations. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. + """ + assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): @@ -620,7 +690,39 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac, approx_kl. + Arguments: + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \ + collected training samples for on-policy algorithms like PPO. For each element in list, the key of the \ + dict is the name of data items and the value is the corresponding data. Usually, the value is \ + torch.Tensor or np.ndarray or there dict/list combinations. In the ``_forward_learn`` method, data \ + often need to first be stacked in the batch dimension by some utility functions such as \ + ``default_preprocess_learn``. \ + For PPOPG, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ + ``return``, ``logit``, ``done``. Sometimes, it also contains other keys such as ``weight``. + Returns: + - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicated training result, each \ + training iteration contains append a information dict into the final list. The list will be precessed \ + and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \ + scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. tip:: + The training procedure of PPOPG is two for loops. 
The outer loop trains all the collected training samples \ + with ``epoch_per_collect`` epochs. The inner loop splits all the data into different mini-batch with \ + the length of ``batch_size``. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + """ + data = default_preprocess_learn(data) if self._cuda: data = to_device(data, self._device) @@ -664,7 +766,22 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: return return_infos def _init_collect(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For PPOPG, it contains \ + the collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPO. \ + This design is for the convenience of parallel execution of different policy modes. + """ + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space self._unroll_len = self._cfg.collect.unroll_len if self._action_space == 'continuous': @@ -697,9 +814,6 @@ def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: For the data type that not supported, the main reason is that the corresponding model does not support it. \ You can implement you own model rather than use the default model. For more information, please raise an \ issue in GitHub repo and we will continue to follow up. - - .. note:: - For more detailed examples, please refer to our unittest for PPOPGPolicy: ``ding.policy.tests.test_ppo``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -717,7 +831,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPOPG, it contains obs, action, reward, done, logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -767,7 +881,17 @@ def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return get_train_sample(data, self._unroll_len) def _init_eval(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. For PPOPG, it contains the \ + eval model to select optimial action (e.g. 
greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ + assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -832,6 +956,7 @@ class PPOOffPolicy(Policy): """ Overview: Policy class of off-policy version PPO algorithm. Paper link: https://arxiv.org/abs/1707.06347. + This version is more suitable for large-scale distributed training. """ config = dict( # (str) RL policy register name (refer to function "POLICY_REGISTRY"). @@ -915,14 +1040,27 @@ def default_model(self) -> Tuple[str, List[str]]: return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config and the main model. + Initialize the learn mode of policy, including related attributes and modules. For PPOOff, it mainly \ + contains optimizer, algorithm-specific arguments such as loss weight and clip_ratio. This method \ + also executes some special network initializations and prepares running mean/std monitor for value. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight - assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" + assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPOOff" assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space @@ -979,15 +1117,31 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac and approx_kl. Arguments: - - data (:obj:`dict`): Dict type data + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. 
In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For PPOOff, each element in list is a dict containing at least the following keys: ``obs``, ``adv``, \ + ``action``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys such as ``weight`` \ + and ``value_gamma``. Returns: - - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ - recorded in text log and tensorboard, values are python scalar or a list of scalars. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. """ data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=self._nstep_return) if self._cuda: @@ -1002,7 +1156,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._learn_model.train() with torch.no_grad(): - if hasattr(self, "_value_norm") and self._value_norm: + if self._value_norm: unnormalized_return = data['adv'] + data['value'] * self._running_mean_std.std data['return'] = unnormalized_return / self._running_mean_std.std self._running_mean_std.update(unnormalized_return.cpu().numpy()) @@ -1145,8 +1299,18 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: def _init_collect(self) -> None: """ Overview: - Collect mode init method. Called by ``self.__init__``. Initialize unroll length, gamma, gae lambda and \ - collect model. + Initialize the collect mode of policy, including related attributes and modules. For PPOOff, it contains \ + collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPOOff. + This design is for the convenience of parallel execution of different policy modes. """ self._unroll_len = self._cfg.collect.unroll_len assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] @@ -1162,6 +1326,7 @@ def _init_collect(self) -> None: self._gae_lambda = self._cfg.collect.gae_lambda self._nstep = self._cfg.nstep self._nstep_return = self._cfg.nstep_return + self._value_norm = self._cfg.learn.value_norm def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: """ @@ -1206,7 +1371,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. 
timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPO, it contains obs, next_obs, action, reward, done, logit, value. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -1289,10 +1454,15 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, return get_nstep_return_data(data, self._nstep) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For PPOOff, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space diff --git a/ding/policy/qmix.py b/ding/policy/qmix.py index e1f9dd3e2f..ff1d66f7c8 100644 --- a/ding/policy/qmix.py +++ b/ding/policy/qmix.py @@ -205,6 +205,7 @@ def _forward_learn(self, data: List[List[Dict[str, Any]]]) -> Dict[str, Any]: - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ For the data type that not supported, the main reason is that the corresponding model does not support it. \ @@ -389,7 +390,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For QMIX, it contains obs, next_obs, action, prev_state, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, usually including ``agent_obs`` \ diff --git a/ding/policy/r2d2.py b/ding/policy/r2d2.py index 6898ccdbc3..0726c2c820 100644 --- a/ding/policy/r2d2.py +++ b/ding/policy/r2d2.py @@ -68,9 +68,9 @@ class R2D2Policy(Policy): cuda=False, # (bool) Whether the RL algorithm is on-policy or off-policy. on_policy=False, - # (bool) Whether use priority(priority sample, IS weight, update priority) + # (bool) Whether to use priority(priority sample, IS weight, update priority) priority=True, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=True, # (float) Reward's future discount factor, aka. gamma. 
discount_factor=0.997, @@ -84,13 +84,13 @@ class R2D2Policy(Policy): learn_unroll_len=80, # learn_mode config learn=dict( - # (int) How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... + # (int) The number of training updates (iterations) to perform after each data collection by the collector. + # A larger "update_per_collect" value implies a more off-policy approach. + # The whole pipeline process follows this cycle: collect data -> update policy -> collect data -> ... update_per_collect=1, - # (int) How many samples in a training batch. + # (int) The number of samples in a training batch. batch_size=64, - # (float) The step size of gradient descent. + # (float) The step size of gradient descent, determining the rate of learning. learning_rate=0.0001, # (int) Frequence of target network update. # target_update_freq=100, @@ -116,26 +116,26 @@ class R2D2Policy(Policy): # In R2D2 policy, for each collect_env, we want to collect data of length self._traj_len=INF # unless the episode enters the 'done' state. traj_len_inf=True, - # (int) `env_num` is used in hidden state, should equal to that one in env config. - # User should specify this value in user config. + # (int) `env_num` is used in hidden state, should equal to that one in env config (e.g. collector_env_num). + # User should specify this value in user config. `None` is a placeholder. env_num=None, ), # eval_mode config eval=dict( - # (int) `env_num` is used in hidden state, should equal to that one in env config. + # (int) `env_num` is used in hidden state, should equal to that one in env config (e.g. evaluator_env_num). # User should specify this value in user config. env_num=None, ), other=dict( # Epsilon greedy with decay. eps=dict( - # (str) Decay type. Support ['exp', 'linear']. + # (str) Type of decay. Supports either 'exp' (exponential) or 'linear'. type='exp', - # (float) Epsilon start value. + # (float) Initial value of epsilon at the start. start=0.95, - # (float) Epsilon end value. + # (float) Final value of epsilon after decay. end=0.05, - # (int) Decay length(env step). + # (int) The number of environment steps over which epsilon should decay. decay=10000, ), replay_buffer=dict( @@ -529,11 +529,11 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ - saved in replay buffer. For R2D2, it contains obs, action, prev_state, reward, done. + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For R2D2, it contains obs, action, prev_state, reward, and done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. - - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network given the observation \ as input. For R2D2, it contains the action and the prev_state of RNN. - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ except all the elements have been transformed into tensor data. 
Usually, it contains the next obs, \ diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 5ee3cc3eda..5b5dfe55c8 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -443,7 +443,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For discrete SAC, it contains obs, next_obs, logit, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -1015,7 +1015,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For continuous SAC, it contains obs, next_obs, action, reward, done. The logit \ will be also added when ``collector_logit`` is True. Arguments: diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 9a42630702..9b741ec7ba 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -575,15 +575,12 @@ def bdq_nstep_td_error( ) -> torch.Tensor: """ Overview: - Multistep (1 step or n step) td_error for BDQ algorithm, \ - referenced paper Action Branching Architectures for Deep Reinforcement Learning \ - - In fact, the original paper only provides the 1-step TD-error calculation method, \ - and here we extend the calculation method of n-step. - TD-error: - y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d)) - TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2 - Loss = mean(TD-error) + Multistep (1 step or n step) td_error for BDQ algorithm, referenced paper "Action Branching Architectures for \ + Deep Reinforcement Learning", link: https://arxiv.org/pdf/1711.08946. + In fact, the original paper only provides the 1-step TD-error calculation method, and here we extend the \ + calculation method to n-step, i.e., TD-error: + :math:`y_d = \sum_{t=0}^{nstep-1} \gamma^t r_t + \gamma^{nstep} Q_d'(s', \arg\max_{a_d} Q_d(s', a_d))` + :math:`\text{TD-error} = \frac{1}{D} \sum_{d=1}^{D} (y_d - Q_d(s, a_d))^2` Arguments: - data (:obj:`q_nstep_td_data`): The input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): Discount factor diff --git a/ding/torch_utils/network/activation.py b/ding/torch_utils/network/activation.py index acdb3f2707..f507e4bcc0 100644 --- a/ding/torch_utils/network/activation.py +++ b/ding/torch_utils/network/activation.py @@ -101,12 +101,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def build_activation(activation: str, inplace: bool = None) -> nn.Module: - r""" + """ Overview: Return the activation module according to the given type. Arguments: - activation (:obj:`str`): the type of activation module, now supports \ - ['relu', 'glu', 'prelu', 'swish', 'gelu', 'tanh', 'sigmoid', 'softplus', 'elu', 'square', 'identity'] + ['relu', 'glu', 'prelu', 'swish', 'gelu', 'tanh', 'sigmoid', 'softplus', 'elu', 'square', 'identity'] - inplace (:obj:`bool`): can optionally do the operation in-place in relu. Default ``None`` Returns: - act_func (:obj:`nn.module`): the corresponding activation module
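+ Examples:
+     A minimal usage sketch (the input tensor shape is arbitrary and used only for illustration):
+ 
+     .. code-block:: python
+ 
+         act = build_activation('relu', inplace=True)  # returns an nn.Module, here a ReLU
+         y = act(torch.randn(4, 8))
+         assert (y >= 0).all()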