diff --git a/ding/policy/base_policy.py b/ding/policy/base_policy.py index 2b5f13a541..3ff99c7b43 100644 --- a/ding/policy/base_policy.py +++ b/ding/policy/base_policy.py @@ -25,7 +25,7 @@ def default_config(cls: type) -> EasyDict: Overview: Get the default config of policy. This method is used to create the default config of policy. Returns: - cfg (:obj:`EasyDict`): The default config of corresponding policy. For the derived policy class, \ + - cfg (:obj:`EasyDict`): The default config of corresponding policy. For the derived policy class, \ it will recursively merge the default config of base class and its own default config. .. tip:: @@ -196,16 +196,17 @@ def hook(*ignore): def _create_model(self, cfg: EasyDict, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: """ Overview: - Create neural network model according to input configures and model. If the input model is None, then \ - the model will be created according to ``default_model`` method and ``cfg.model`` field. Otherwise, the \ - model will be set to the ``model`` instance created by outside caller. + Create or validate the neural network model according to input configures and model. If the input model is \ + None, then the model will be created according to ``default_model`` method and ``cfg.model`` field. \ + Otherwise, the model will be verified as an instance of ``torch.nn.Module`` and set to the ``model`` \ + instance created by outside caller. Arguments: - cfg (:obj:`EasyDict`): The final merged config used to initialize policy. - model (:obj:`torch.nn.Module`): The neural network model used to initialize policy. User can refer to \ the default model defined in corresponding policy to customize its own model. Returns: - - model (:obj:`torch.nn.Module`): The created neural network model. Then different modes of policy will \ - add wrappers and plugins to the model, which is used to train, collect and evaluate. + - model (:obj:`torch.nn.Module`): The created neural network model. The different modes of policy will \ + add distinct wrappers and plugins to the model, which is used to train, collect and evaluate. Raises: - RuntimeError: If the input model is not None and is not an instance of ``torch.nn.Module``. """ diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index a4dd5dc4bf..2e253370b8 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -438,7 +438,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For DDPG, it contains obs, next_obs, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 6f121e3018..d1f6fdbb49 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -403,10 +403,10 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, in ``self._forward_learn`` method. Arguments: - transitions (:obj:`List[Dict[str, Any]`): The trajectory data (a list of transition), each element is \ - the same format as the return value of ``self._process_transition`` method. + in the same format as the return value of ``self._process_transition`` method. 
Returns: - - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is the similar format \ - as input transitions, but may contain more data for training, such as nstep reward and target obs. + - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is similar in format \ + to input transitions, but may contain more data for training, such as nstep reward and target obs. """ transitions = get_nstep_return_data(transitions, self._nstep, gamma=self._gamma) return get_train_sample(transitions, self._unroll_len) @@ -415,7 +415,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For DQN, it contains obs, next_obs, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -539,6 +539,7 @@ class DQNSTDIMPolicy(DQNPolicy): """ Overview: Policy class of DQN algorithm, extended by ST-DIM auxiliary objectives. + ST-DIM paper link: https://arxiv.org/abs/1906.08226. Config: == ==================== ======== ============== ======================================== ======================= ID Symbol Type Default Value Description Other(Shape) diff --git a/ding/policy/dt.py b/ding/policy/dt.py index c630c8949d..145b11f97c 100644 --- a/ding/policy/dt.py +++ b/ding/policy/dt.py @@ -6,7 +6,6 @@ from ding.torch_utils import to_device from ding.utils import POLICY_REGISTRY from ding.utils.data import default_decollate -from ding.torch_utils import one_hot from .base_policy import Policy @@ -56,8 +55,20 @@ def default_model(self) -> Tuple[str, List[str]]: def _init_learn(self) -> None: """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. For Decision Transformer, \ + it mainly contains the optimizer, algorithm-specific arguments such as rtg_scale and lr scheduler. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ # rtg_scale: scale of `return to go` # rtg_target: max target of `return to go` @@ -92,14 +103,26 @@ def _init_learn(self) -> None: self.max_env_score = -1.0 - def _forward_learn(self, data: list) -> Dict[str, Any]: + def _forward_learn(self, data: List[torch.Tensor]) -> Dict[str, Any]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the offline dataset and then returns the output \ + result, including various training information such as loss, current learning rate. 
Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs'] + - data (:obj:`List[torch.Tensor]`): The input data used for policy forward, including a series of \ + processed torch.Tensor data, i.e., timesteps, states, actions, returns_to_go, traj_mask. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data types that are not supported, the main reason is that the corresponding model does not \ + support them. You can implement your own model rather than using the default model. For more \ + information, please raise an issue in GitHub repo and we will continue to follow up. + + """ self._learn_model.train() @@ -156,7 +179,18 @@ def _forward_learn(self, data: list) -> Dict[str, Any]: def _init_eval(self) -> None: """ Overview: - Evaluate mode init method. Called by ``self.__init__``, initialize eval_model. + Initialize the eval mode of policy, including related attributes and modules. For Decision Transformer, \ + it contains the eval model and some algorithm-specific parameters such as context_len, max_eval_ep_len, etc. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. tip:: + For the evaluation of complete episodes, we need to maintain some historical information for transformer \ + inference. These variables need to be initialized in ``_init_eval`` and reset in ``_reset_eval`` when \ + necessary. + + .. note:: + If you want to set some special member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ self._eval_model = self._model # init data diff --git a/ding/policy/impala.py b/ding/policy/impala.py index eb19ca9b87..46adeb1204 100644 --- a/ding/policy/impala.py +++ b/ding/policy/impala.py @@ -79,9 +79,9 @@ class IMPALAPolicy(Policy): c_clip_ratio=1.0, # (float) clip ratio of importance sampling. rho_pg_clip_ratio=1.0, - # (str) The gradient clip operation type used in PPO, ['clip_norm', clip_value', 'clip_momentum_norm']. + # (str) The gradient clip operation type used in IMPALA, ['clip_norm', 'clip_value', 'clip_momentum_norm']. grad_clip_type=None, - # (float) The gradient clip target value used in PPO. + # (float) The gradient clip target value used in IMPALA. # If ``grad_clip_type`` is 'clip_norm', then the maximum of gradient will be normalized to this value. clip_value=0.5, # (str) Optimizer used to train the network, ['adam', 'rmsprop']. @@ -120,12 +120,24 @@ def default_model(self) -> Tuple[str, List[str]]: return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Initialize the optimizer, algorithm config and main model. + Initialize the learn mode of policy, including related attributes and modules. For IMPALA, it mainly \ + contains the optimizer, algorithm-specific arguments such as loss weight and gamma, and the main (learn) \ + model. This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + ..
note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ - assert self._cfg.action_space in ["continuous", "discrete"] + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space # Optimizer optim_type = self._cfg.learn.optim @@ -166,8 +178,8 @@ def _data_preprocess_learn(self, data: List[Dict[str, Any]]): Convert list trajectory data to to trajectory data, which is a dict of tensors. Arguments: - data (:obj:`List[Dict[str, Any]]`): List type data, a list of data for training. Each list element is a \ - dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least 'obs', \ - 'next_obs', 'logit', 'action', 'reward', 'done' + dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least \ + 'obs', 'next_obs', 'logit', 'action', 'reward', 'done' Returns: - data (:obj:`dict`): Dict type data. Values are torch.Tensor or np.ndarray or dict/list combinations. \ ReturnsKeys: @@ -208,20 +220,31 @@ def _data_preprocess_learn(self, data: List[Dict[str, Any]]): def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Overview: - Forward computation graph of learn mode(updating policy). + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss and current learning rate. Arguments: - - data (:obj:`List[Dict[str, Any]]`): List type data, a list of data for training. Each list element is a \ - dict, whose values are torch.Tensor or np.ndarray or dict/list combinations, keys include at least 'obs',\ - 'next_obs', 'logit', 'action', 'reward', 'done' + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For IMPALA, each element in list is a dict containing at least the following keys: ``obs``, \ + ``action``, ``logit``, ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys such \ + as ``weight``. Returns: - - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ - recorded in text log and tensorboard, values are python scalar or a list of scalars. 
- ArgumentsKeys: - - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` - - optional: 'collect_iter', 'replay_unique_id', 'replay_buffer_idx', 'priority', 'staleness', 'use', 'IS' - ReturnsKeys: - - necessary: ``cur_lr``, ``total_loss``, ``policy_loss`,``value_loss``,``entropy_loss`` - - optional: ``priority`` + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data = self._data_preprocess_learn(data) # ==================== @@ -267,8 +290,7 @@ def _reshape_data(self, output: Dict[str, Any], data: Dict[str, Any]) -> Tuple: - data (:obj:`Dict[int, Any]`): Dict type data, input of policy._forward_learn Values are torch.Tensor or \ np.ndarray or dict/list combinations. Keys includes at least ['logit', 'action', 'reward', 'done']. Returns: - - data (:obj:`Tuple[Any]`): Tuple of target_logit, behaviour_logit, actions, \ - values, rewards, weights + - data (:obj:`Tuple[Any]`): Tuple of target_logit, behaviour_logit, actions, values, rewards, weights. ReturnsShapes: - target_logit (:obj:`torch.FloatTensor`): :math:`((T+1), B, Obs_Shape)`, where T is timestep,\ B is batch size and Obs_Shape is the shape of single env observation. @@ -300,10 +322,16 @@ def _reshape_data(self, output: Dict[str, Any], data: Dict[str, Any]) -> Tuple: return target_logit, behaviour_logit, actions, values, rewards, weights def _init_collect(self) -> None: - r""" + """ Overview: - Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model. - Use multinomial_sample to choose action. + Initialize the collect mode of policy, including related attributes and modules. For IMPALA, it contains \ + the collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space @@ -314,18 +342,32 @@ def _init_collect(self) -> None: self._collect_model.reset() - def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Dict[str, Any]]: + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: """ Overview: - Forward computation graph of collect mode(collect training data). + Policy forward function of collect mode (collecting training data by interacting with envs). 
Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. Arguments: - - data (:obj:`Dict[int, Any]`): Dict type data, stacked env data for predicting \ - action, values are torch.Tensor or np.ndarray or dict/list combinations,keys \ - are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Dict[str,Any]]`): Dict of predicting policy_output(logit, action) for each env. - ReturnsKeys - - necessary: ``logit``, ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data (action logit and value) for learn mode defined in ``self._process_transition`` \ + method. The key of the dict is the same as the input data, i.e. environment id. + + .. tip:: + If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \ + related data as extra keyword arguments of this method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -358,7 +400,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For IMPALA, it contains obs, next_obs, action, reward, done, logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -381,12 +423,17 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. return transition def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``, initialize eval_model, - and use argmax_sample to choose action. + Initialize the eval mode of policy, including related attributes and modules. For IMPALA, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. 
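+ 
+     Examples (a minimal illustrative sketch of what this method prepares; the wrapper names below follow \
+     the surrounding code and the previous version of this docstring, not a fixed API contract):
+ 
+     .. code-block:: python
+ 
+         # discrete action space: evaluate with greedy (argmax) action selection
+         self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
+         # continuous action space: evaluate with the deterministic action of the policy head
+         self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
+         self._eval_model.reset()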
""" - assert self._cfg.action_space in ["continuous", "discrete"] + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -398,17 +445,26 @@ def _init_eval(self) -> None: def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: """ Overview: - Forward computation graph of eval mode(evaluate policy performance), at most cases, it is similar to \ - ``self._forward_collect``. + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. ``_forward_eval`` in IMPALA often uses deterministic sample to get \ + actions while ``_forward_collect`` usually uses stochastic sample method for balance exploration and \ + exploitation. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. - ReturnsKeys - - necessary: ``action`` - - optional: ``logit`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e. environment id. + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to unittest for IMPALAPolicy: ``ding.policy.tests.test_impala``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) diff --git a/ding/policy/pdqn.py b/ding/policy/pdqn.py index 33887bff9e..6b66e263ab 100644 --- a/ding/policy/pdqn.py +++ b/ding/policy/pdqn.py @@ -444,7 +444,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PDQN, it contains obs, next_obs, action, reward, done and logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index e717ea23e2..9ebd7b0f73 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -112,8 +112,21 @@ def default_model(self) -> Tuple[str, List[str]]: def _init_learn(self) -> None: """ Overview: - Learn mode init method. Called by ``self.__init__``. 
Initialize the optimizer, algorithm config and \ - the learn model, execute special network initialization and value running mean and std. + Initialize the learn mode of policy, including related attributes and modules. For PPO, it mainly contains \ + optimizer, algorithm-specific arguments such as loss weight, clip_ratio and recompute_adv. This method \ + also executes some special network initializations and prepares running mean/std monitor for value. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight @@ -172,16 +185,40 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac, approx_kl. Arguments: - - data (:obj:`dict`): Dict type data + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \ + collected training samples for on-policy algorithms like PPO. For each element in list, the key of the \ + dict is the name of data items and the value is the corresponding data. Usually, the value is \ + torch.Tensor or np.ndarray or there dict/list combinations. In the ``_forward_learn`` method, data \ + often need to first be stacked in the batch dimension by some utility functions such as \ + ``default_preprocess_learn``. \ + For PPO, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ + ``reward``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys such as ``weight``. Returns: - - info_dict (:obj:`List[Dict[str, Any]]`): Return a list of information, each element is a Dict type data, \ - a info dict indicated training result, which will be recorded in text log and tensorboard, values are \ - python scalar or a list of scalars. + - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicated training result, each \ + training iteration contains append a information dict into the final list. The list will be precessed \ + and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \ + scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. tip:: + The training procedure of PPO is two for loops. The outer loop trains all the collected training samples \ + with ``epoch_per_collect`` epochs. The inner loop splits all the data into different mini-batch with \ + the length of ``batch_size``. + + .. 
note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + For more detailed examples, please refer to our unittest for PPOPolicy: ``ding.policy.tests.test_ppo``. """ data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False) if self._cuda: @@ -302,13 +339,23 @@ def _forward_learn(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: return return_infos def _init_collect(self) -> None: - r""" + """ Overview: - Collect mode init method. Called by ``self.__init__``. - Init traj and unroll length, collect model. + Initialize the collect mode of policy, including related attributes and modules. For PPO, it contains the \ + collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPO. \ + This design is for the convenience of parallel execution of different policy modes. """ self._unroll_len = self._cfg.collect.unroll_len - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + assert self._cfg.action_space in ["continuous", "discrete", "hybrid"], self._cfg.action_space self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') @@ -364,7 +411,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPO, it contains obs, next_obs, action, reward, done, logit, value. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -447,10 +494,15 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, return get_train_sample(data, self._unroll_len) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For PPO, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. 
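+ 
+     Examples (an illustrative sketch of the eval data contract served by the model initialized here; the \
+     observation shape and the ``eval_mode.forward`` entry point are assumptions made only for this example):
+ 
+     .. code-block:: python
+ 
+         # eval data is a dict keyed by environment id, one observation per env
+         obs = {0: torch.randn(4), 1: torch.randn(4)}
+         output = policy.eval_mode.forward(obs)
+         # the output is keyed by the same env ids and contains at least the action
+         assert set(output.keys()) == {0, 1} and 'action' in output[0]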
""" assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space @@ -526,7 +578,7 @@ def _monitor_vars_learn(self) -> List[str]: class PPOPGPolicy(Policy): """ Overview: - Policy class of on policy version PPO algorithm (pure policy gradient). + Policy class of on policy version PPO algorithm (pure policy gradient without value network). Paper link: https://arxiv.org/abs/1707.06347. """ config = dict( @@ -588,7 +640,25 @@ def default_model(self) -> Tuple[str, List[str]]: return 'pg', ['ding.model.template.pg'] def _init_learn(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the learn mode of policy, including related attributes and modules. For PPOPG, it mainly \ + contains optimizer, algorithm-specific arguments such as loss weight and clip_ratio. This method \ + also executes some special network initializations. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. + """ + assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): @@ -620,7 +690,39 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac, approx_kl. + Arguments: + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \ + collected training samples for on-policy algorithms like PPO. For each element in list, the key of the \ + dict is the name of data items and the value is the corresponding data. Usually, the value is \ + torch.Tensor or np.ndarray or there dict/list combinations. In the ``_forward_learn`` method, data \ + often need to first be stacked in the batch dimension by some utility functions such as \ + ``default_preprocess_learn``. \ + For PPOPG, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ + ``return``, ``logit``, ``done``. Sometimes, it also contains other keys such as ``weight``. + Returns: + - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicated training result, each \ + training iteration contains append a information dict into the final list. The list will be precessed \ + and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \ + scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. tip:: + The training procedure of PPOPG is two for loops. 
The outer loop trains all the collected training samples \ + with ``epoch_per_collect`` epochs. The inner loop splits all the data into different mini-batch with \ + the length of ``batch_size``. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + """ + data = default_preprocess_learn(data) if self._cuda: data = to_device(data, self._device) @@ -664,7 +766,22 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: return return_infos def _init_collect(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For PPOPG, it contains \ + the collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPO. \ + This design is for the convenience of parallel execution of different policy modes. + """ + assert self._cfg.action_space in ["continuous", "discrete"], self._cfg.action_space self._action_space = self._cfg.action_space self._unroll_len = self._cfg.collect.unroll_len if self._action_space == 'continuous': @@ -697,9 +814,6 @@ def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: For the data type that not supported, the main reason is that the corresponding model does not support it. \ You can implement you own model rather than use the default model. For more information, please raise an \ issue in GitHub repo and we will continue to follow up. - - .. note:: - For more detailed examples, please refer to our unittest for PPOPGPolicy: ``ding.policy.tests.test_ppo``. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -717,7 +831,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPOPG, it contains obs, action, reward, done, logit. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -767,7 +881,17 @@ def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return get_train_sample(data, self._unroll_len) def _init_eval(self) -> None: - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. For PPOPG, it contains the \ + eval model to select optimial action (e.g. 
greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ + assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -832,6 +956,7 @@ class PPOOffPolicy(Policy): """ Overview: Policy class of off-policy version PPO algorithm. Paper link: https://arxiv.org/abs/1707.06347. + This version is more suitable for large-scale distributed training. """ config = dict( # (str) RL policy register name (refer to function "POLICY_REGISTRY"). @@ -915,14 +1040,27 @@ def default_model(self) -> Tuple[str, List[str]]: return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config and the main model. + Initialize the learn mode of policy, including related attributes and modules. For PPOOff, it mainly \ + contains optimizer, algorithm-specific arguments such as loss weight and clip_ratio. This method \ + also executes some special network initializations and prepares running mean/std monitor for value. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight - assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" + assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPOOff" assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space @@ -979,15 +1117,31 @@ def _init_learn(self) -> None: # Main model self._learn_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as loss, clipfrac and approx_kl. Arguments: - - data (:obj:`dict`): Dict type data + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. 
In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For PPOOff, each element in list is a dict containing at least the following keys: ``obs``, ``adv``, \ + ``action``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys such as ``weight`` \ + and ``value_gamma``. Returns: - - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ - recorded in text log and tensorboard, values are python scalar or a list of scalars. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. """ data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=self._nstep_return) if self._cuda: @@ -1002,7 +1156,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._learn_model.train() with torch.no_grad(): - if hasattr(self, "_value_norm") and self._value_norm: + if self._value_norm: unnormalized_return = data['adv'] + data['value'] * self._running_mean_std.std data['return'] = unnormalized_return / self._running_mean_std.std self._running_mean_std.update(unnormalized_return.cpu().numpy()) @@ -1145,8 +1299,18 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: def _init_collect(self) -> None: """ Overview: - Collect mode init method. Called by ``self.__init__``. Initialize unroll length, gamma, gae lambda and \ - collect model. + Initialize the collect mode of policy, including related attributes and modules. For PPOOff, it contains \ + collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ + discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + + .. tip:: + Some variables need to initialize independently in different modes, such as gamma and gae_lambda in PPOOff. + This design is for the convenience of parallel execution of different policy modes. """ self._unroll_len = self._cfg.collect.unroll_len assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] @@ -1162,6 +1326,7 @@ def _init_collect(self) -> None: self._gae_lambda = self._cfg.collect.gae_lambda self._nstep = self._cfg.nstep self._nstep_return = self._cfg.nstep_return + self._value_norm = self._cfg.learn.value_norm def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: """ @@ -1206,7 +1371,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. 
timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For PPO, it contains obs, next_obs, action, reward, done, logit, value. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -1289,10 +1454,15 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, return get_nstep_return_data(data, self._nstep) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For PPOOff, it contains the \ + eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete action). + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space diff --git a/ding/policy/qmix.py b/ding/policy/qmix.py index e1f9dd3e2f..ff1d66f7c8 100644 --- a/ding/policy/qmix.py +++ b/ding/policy/qmix.py @@ -205,6 +205,7 @@ def _forward_learn(self, data: List[List[Dict[str, Any]]]) -> Dict[str, Any]: - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ For the data type that not supported, the main reason is that the corresponding model does not support it. \ @@ -389,7 +390,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For QMIX, it contains obs, next_obs, action, prev_state, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, usually including ``agent_obs`` \ diff --git a/ding/policy/r2d2.py b/ding/policy/r2d2.py index 6898ccdbc3..0726c2c820 100644 --- a/ding/policy/r2d2.py +++ b/ding/policy/r2d2.py @@ -68,9 +68,9 @@ class R2D2Policy(Policy): cuda=False, # (bool) Whether the RL algorithm is on-policy or off-policy. on_policy=False, - # (bool) Whether use priority(priority sample, IS weight, update priority) + # (bool) Whether to use priority(priority sample, IS weight, update priority) priority=True, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=True, # (float) Reward's future discount factor, aka. gamma. 
discount_factor=0.997, @@ -84,13 +84,13 @@ class R2D2Policy(Policy): learn_unroll_len=80, # learn_mode config learn=dict( - # (int) How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... + # (int) The number of training updates (iterations) to perform after each data collection by the collector. + # A larger "update_per_collect" value implies a more off-policy approach. + # The whole pipeline process follows this cycle: collect data -> update policy -> collect data -> ... update_per_collect=1, - # (int) How many samples in a training batch. + # (int) The number of samples in a training batch. batch_size=64, - # (float) The step size of gradient descent. + # (float) The step size of gradient descent, determining the rate of learning. learning_rate=0.0001, # (int) Frequence of target network update. # target_update_freq=100, @@ -116,26 +116,26 @@ class R2D2Policy(Policy): # In R2D2 policy, for each collect_env, we want to collect data of length self._traj_len=INF # unless the episode enters the 'done' state. traj_len_inf=True, - # (int) `env_num` is used in hidden state, should equal to that one in env config. - # User should specify this value in user config. + # (int) `env_num` is used in hidden state, should equal to that one in env config (e.g. collector_env_num). + # User should specify this value in user config. `None` is a placeholder. env_num=None, ), # eval_mode config eval=dict( - # (int) `env_num` is used in hidden state, should equal to that one in env config. + # (int) `env_num` is used in hidden state, should equal to that one in env config (e.g. evaluator_env_num). # User should specify this value in user config. env_num=None, ), other=dict( # Epsilon greedy with decay. eps=dict( - # (str) Decay type. Support ['exp', 'linear']. + # (str) Type of decay. Supports either 'exp' (exponential) or 'linear'. type='exp', - # (float) Epsilon start value. + # (float) Initial value of epsilon at the start. start=0.95, - # (float) Epsilon end value. + # (float) Final value of epsilon after decay. end=0.05, - # (int) Decay length(env step). + # (int) The number of environment steps over which epsilon should decay. decay=10000, ), replay_buffer=dict( @@ -529,11 +529,11 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ - saved in replay buffer. For R2D2, it contains obs, action, prev_state, reward, done. + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For R2D2, it contains obs, action, prev_state, reward, and done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. - - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network given the observation \ as input. For R2D2, it contains the action and the prev_state of RNN. - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ except all the elements have been transformed into tensor data. 
Usually, it contains the next obs, \ diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 5ee3cc3eda..5b5dfe55c8 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -443,7 +443,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For discrete SAC, it contains obs, next_obs, logit, action, reward, done. Arguments: - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. @@ -1015,7 +1015,7 @@ def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch. timestep: namedtuple) -> Dict[str, torch.Tensor]: """ Overview: - Process and pack one timestep transition data info a dict, which can be directly used for training and \ + Process and pack one timestep transition data into a dict, which can be directly used for training and \ saved in replay buffer. For continuous SAC, it contains obs, next_obs, action, reward, done. The logit \ will be also added when ``collector_logit`` is True. Arguments: diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 9a42630702..9b741ec7ba 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -575,15 +575,12 @@ def bdq_nstep_td_error( ) -> torch.Tensor: """ Overview: - Multistep (1 step or n step) td_error for BDQ algorithm, \ - referenced paper Action Branching Architectures for Deep Reinforcement Learning \ - - In fact, the original paper only provides the 1-step TD-error calculation method, \ - and here we extend the calculation method of n-step. - TD-error: - y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d)) - TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2 - Loss = mean(TD-error) + Multistep (1 step or n step) td_error for BDQ algorithm, referenced paper "Action Branching Architectures for \ + Deep Reinforcement Learning", link: https://arxiv.org/pdf/1711.08946. + In fact, the original paper only provides the 1-step TD-error calculation method, and here we extend the \ + calculation method to n-step, i.e., TD-error: + :math:`y_d = \sum_{t=0}^{nstep-1} \gamma^t r_t + \gamma^{nstep} Q_d'(s', \arg\max_{a_d} Q_d(s', a_d))` + :math:`\text{TD-error} = \frac{1}{D} \sum_{d=1}^{D} (y_d - Q_d(s, a_d))^2` Arguments: - data (:obj:`q_nstep_td_data`): The input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): Discount factor diff --git a/ding/torch_utils/network/activation.py b/ding/torch_utils/network/activation.py index acdb3f2707..f507e4bcc0 100644 --- a/ding/torch_utils/network/activation.py +++ b/ding/torch_utils/network/activation.py @@ -101,12 +101,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def build_activation(activation: str, inplace: bool = None) -> nn.Module: - r""" + """ Overview: Return the activation module according to the given type. Arguments: - activation (:obj:`str`): the type of activation module, now supports \ - ['relu', 'glu', 'prelu', 'swish', 'gelu', 'tanh', 'sigmoid', 'softplus', 'elu', 'square', 'identity'] + ['relu', 'glu', 'prelu', 'swish', 'gelu', 'tanh', 'sigmoid', 'softplus', 'elu', 'square', 'identity'] - inplace (:obj:`bool`): can optionally do the operation in-place in relu. Default ``None`` Returns: - act_func (:obj:`nn.module`): the corresponding activation module
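+ Examples:
+     A minimal usage sketch (the input tensor shape is arbitrary and used only for illustration):
+ 
+     .. code-block:: python
+ 
+         act = build_activation('relu', inplace=True)  # returns an nn.Module, here a ReLU
+         y = act(torch.randn(4, 8))
+         assert (y >= 0).all()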