diff --git a/.idea/Reinforcement-Learning.iml b/.idea/Reinforcement-Learning.iml
index 91edcfb..76dd67a 100644
--- a/.idea/Reinforcement-Learning.iml
+++ b/.idea/Reinforcement-Learning.iml
@@ -2,7 +2,7 @@
-
+
diff --git a/README.md b/README.md
index bae23da..01d57fa 100644
--- a/README.md
+++ b/README.md
@@ -60,21 +60,21 @@ Now you have successfully installed the project and its dependencies. You can pr
 
 ## Papers to Code
 
-| No | Year | Status | Name | Citations |
-|:--:|:----:|:----------------:|:------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
-| 1 | 1951 | 🚧 Developing | [A Stochastic Approximation Method]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F34ddd8865569c2c32dec9bf7ffc817ff42faaa01%3Ffields%3DcitationCount) |
-| 2 | 1986 | 🚧 Developing | [Stochastic approximation for Monte Carlo optimization]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F08bcd967e6ca896eb85d6e03561aabf138df65d1%3Ffields%3DcitationCount) |
-| 3 | 2001 | 🚧 Developing | [A natural policy gradient]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2Fb18833db0de9393d614d511e60821a1504fc6cd1%3Ffields%3DcitationCount) |
-| 4 | 2013 | 🧪 Experimenting | [Playing Atari with Deep Reinforcement Learning](./algorithms/dqn.py) | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F2319a491378867c7049b3da055c5df60e1671158%3Ffields%3DcitationCount) |
-| 5 | 2015 | 🧪 Experimenting | [Human-level control through deep reinforcement learning](./algorithms/dqn.py) | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2Fe0e9a94c4a6ba219e768b4e59f72c18f0a22e23d%3Ffields%3DcitationCount) |
-| 6 | 2015 | 🚧 Developing | [Trust Region Policy Optimization]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F66cdc28dc084af6507e979767755e99fe0b46b39%3Ffields%3DcitationCount) |
-| 7 | 2015 | 🚧 Developing | [Continuous control with deep reinforcement learning]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F024006d4c2a89f7acacc6e4438d156525b60a98f%3Ffields%3DcitationCount) |
+| No | Year | Status | Name |
+|:--:|:----:|:----------------:|:------------------------------------------------------------------------------------------------------|
+| 1 | 1951 | 🚧 Developing | [A Stochastic Approximation Method]() |
+| 2 | 1986 | 🚧 Developing | [Stochastic approximation for Monte Carlo optimization]() |
+| 3 | 2001 | 🚧 Developing | [A natural policy gradient]() |
+| 4 | 2013 | 🧪 Experimenting | [Playing Atari with Deep Reinforcement Learning](./algorithms/dqn.py) |
+| 5 | 2015 | 🧪 Experimenting | [Human-level control through deep reinforcement learning](./algorithms/dqn.py) |
+| 6 | 2015 | 🚧 Developing | [Trust Region Policy Optimization]() |
+| 7 | 2015 | 🚧 Developing | [Continuous control with deep reinforcement learning]() |
 | 8 | 2015 | 🧪 Experimenting | [Deep Reinforcement Learning with Double Q-Learning](./algorithms/double_dqn.py) | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F3b9732bb07dc99bde5e1f9f75251c6ea5039373e%3Ffields%3DcitationCount) |
-| 8 | 2016 | 🧪 Experimenting | [Dueling Network Architectures for Deep Reinforcement Learning](./algorithm/dueling_dqn.py_) | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F4c05d7caa357148f0bbd61720bdd35f0bc05eb81%3Ffields%3DcitationCount) |
-| 9 | 2016 | 🧪 Experimenting | [Prioritized Experience Replay](./algorithms/dqn_pp.py) | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2Fc6170fa90d3b2efede5a2e1660cb23e1c824f2ca%3Ffields%3DcitationCount) |
-| 10 | 2017 | 🚧 Developing | [Proximal Policy Optimization Algorithms]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2Fdce6f9d4017b1785979e7520fd0834ef8cf02f4b%3Ffields%3DcitationCount) |
-| 11 | 2018 | 🚧 Developing | [Addressing Function Approximation Error in Actor-Critic Methods]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F4debb99c0c63bfaa97dd433bc2828e4dac81c48b%3Ffields%3DcitationCount) |
-| 12 | 2018 | 🚧 Developing | [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor]() | ![](https://img.shields.io/badge/dynamic/json?label=Citation&query=citationCount&url=https%3A%2F%2Fapi.semanticscholar.org%2Fgraph%2Fv1%2Fpaper%2F811df72e210e20de99719539505da54762a11c6d%3Ffields%3DcitationCount) |
+| 8 | 2016 | 🧪 Experimenting | [Dueling Network Architectures for Deep Reinforcement Learning](./algorithm/dueling_dqn.py_) |
+| 9 | 2016 | 🧪 Experimenting | [Prioritized Experience Replay](./algorithms/dqn_pp.py) |
+| 10 | 2017 | 🚧 Developing | [Proximal Policy Optimization Algorithms]() |
+| 11 | 2018 | 🚧 Developing | [Addressing Function Approximation Error in Actor-Critic Methods]() |
+| 12 | 2018 | 🚧 Developing | [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor]() |
 
 ## Code Structure and Explanation
 
diff --git a/abc_rl/experience_replay.py b/abc_rl/experience_replay.py
index 2d73652..54dce5c 100644
--- a/abc_rl/experience_replay.py
+++ b/abc_rl/experience_replay.py
@@ -22,6 +22,10 @@ def store(self, observation: np.ndarray, action: np.ndarray, reward: np.ndarray,
         # Update the position in a circular manner
         self.position = (self.position + 1) % self.capacity
 
+    def clear(self):
+        self.buffer = []
+        self.position = 0
+
     def __len__(self):
         return len(self.buffer)
 
diff --git a/agents/async_dqn_agent.py b/agents/async_dqn_agent.py
new file mode 100644
index 0000000..fda73ca
--- /dev/null
+++ b/agents/async_dqn_agent.py
@@ -0,0 +1,24 @@
+import torch.multiprocessing as mp
+from agents.dqn_agent import *
+
+
+class AsynDQNValueFunction(DQNValueFunction):
+    def __init__(self, input_channel: int, action_dim: int, learning_rate: float,
+                 gamma: float, step_c: int, model_saving_period: int, device: torch.device, logger: Logger):
+        super(AsynDQNValueFunction, self).__init__(input_channel, action_dim, learning_rate,
+                                                   gamma, step_c, model_saving_period, device, logger)
+        self.value_nn.share_memory()
+        self.target_value_nn.share_memory()
+
+
+class AsynDQNAgent(DQNAgent):
+    def __init__(self, input_frame_width: int, input_frame_height: int, action_space,
+                 mini_batch_size: int, replay_buffer_size: int, replay_start_size: int,
+                 learning_rate: float, step_c: int, model_saving_period: int,
+                 gamma: float, training_episodes: int, phi_channel: int, epsilon_max: float, epsilon_min: float,
+                 exploration_steps: int, device: torch.device, logger: Logger):
+        super(AsynDQNAgent, self).__init__(input_frame_width, input_frame_height, action_space,
+                                           mini_batch_size, replay_buffer_size, replay_start_size,
+                                           learning_rate, step_c, model_saving_period,
+                                           gamma, training_episodes, phi_channel, epsilon_max, epsilon_min,
+                                           exploration_steps, device, logger)
diff --git a/agents/dqn_agent.py b/agents/dqn_agent.py
index 99867e9..6689e9e 100644
--- a/agents/dqn_agent.py
+++ b/agents/dqn_agent.py
@@ -159,6 +159,7 @@ def max_state_value(self, obs_tensor):
     # Update the value function with the given samples
     def update(self, samples: list, weight=None):
+        """
         :param samples: Input samples
         :param weight: Importance weight for prioritized experience replay
@@ -206,8 +207,9 @@ def update(self, samples: list, weight=None):
         # synchronize the target value neural network with the value neural network every step_c steps
         if self.update_step % self.step_c == 0:
             self.synchronize_value_nn()
-            self.logger.tb_scalar('loss', loss.item(), self.update_step)
-            self.logger.tb_scalar('q', torch.mean(q_value), self.update_step)
+            if self.logger:
+                self.logger.tb_scalar('loss', loss.item(), self.update_step)
+                self.logger.tb_scalar('q', torch.mean(q_value), self.update_step)
         return np.abs(diff_clipped.detach().cpu().numpy().astype(np.float32))
 
     # Calculate the value of the given phi tensor.
@@ -292,3 +294,4 @@ def train_step(self):
         if len(self.memory) > self.replay_start_size:
             samples = self.memory.sample(self.mini_batch_size)
             self.value_function.update(samples)
+
diff --git a/algorithms/async_dqn.py b/algorithms/async_dqn.py
new file mode 100644
index 0000000..803ee12
--- /dev/null
+++ b/algorithms/async_dqn.py
@@ -0,0 +1,147 @@
+from agents.async_dqn_agent import *
+from abc_rl.experience_replay import *
+from abc_rl.exploration import *
+from utils.hyperparameters import *
+
+import argparse
+from agents.dqn_agent import *
+from environments.env_wrapper import EnvWrapper
+from exploration.epsilon_greedy import *
+from utils.hyperparameters import Hyperparameters
+from tools.dqn_play_ground import DQNPlayGround
+import torch.multiprocessing as mp
+
+
+# Argument parser for command line arguments
+parser = argparse.ArgumentParser(description='PyTorch dqn training arguments')
+parser.add_argument('--env_name', default='ALE/Pong-v5', type=str,
+                    help='openai gym environment (default: ALE/Pong-v5)')
+parser.add_argument('--worker_num', default=4, type=int,
+                    help='parallel worker number (default: 4)')
+parser.add_argument('--device', default='cuda:0', type=str,
+                    help='calculation device (default: cuda:0)')
+parser.add_argument('--log_path', default='../exps/async_dqn/', type=str,
+                    help='log save path (default: ../exps/async_dqn/)')
+
+# Load hyperparameters from yaml file
+cfg = Hyperparameters(parser, '../configs/async_dqn.yaml')
+
+
+def test(agent, test_episode_num: int):
+    """
+    Test the DQN agent for a given number of episodes.
+    :param test_episode_num: The number of episodes for testing
+    :return: The average reward and average steps per episode
+    """
+    env = EnvWrapper(cfg['env_name'], repeat_action_probability=0, frameskip=cfg['skip_k_frame'])
+    exploration_method = EpsilonGreedy(cfg['epsilon_for_test'])
+    reward_cum = 0
+    step_cum = 0
+    for i in range(test_episode_num):
+        state, _ = env.reset()
+        done = truncated = False
+        step_i = 0
+        while (not done) and (not truncated):
+            obs = agent.perception_mapping(state, step_i)
+            action = agent.select_action(obs, exploration_method)
+            next_state, reward, done, truncated, inf = env.step(action)
+            reward_cum += reward
+            state = next_state
+            step_i += 1
+        step_cum += step_i
+    return reward_cum / test_episode_num, step_cum / test_episode_num
+
+
+def train(rank: int, agent: DQNAgent, env: EnvWrapper,
+          training_steps_each_worker: int,
+          no_op: int, batch_per_epoch: int):
+    # training
+    training_steps = 0
+    episode = 0
+    epoch_i = 0
+    run_test = False
+    while training_steps < training_steps_each_worker:
+        state, _ = env.reset()
+        done = False
+        truncated = False
+        step_i = 0
+        reward_cumulated = 0
+        obs = agent.perception_mapping(state, step_i)
+        while (not done) and (not truncated):
+            # take random actions for the first no_op steps of each episode
+            if step_i >= no_op:
+                action = agent.select_action(obs)
+            else:
+                action = agent.select_action(obs, RandomAction())
+            next_state, reward_raw, done, truncated, inf = env.step(action)
+            reward = agent.reward_shaping(reward_raw)
+            next_obs = agent.perception_mapping(next_state, step_i)
+            agent.store(obs, action, reward, next_obs, done, truncated)
+            agent.train_step()
+            if len(agent.memory) > 1000:
+                agent.memory.clear()
+            obs = next_obs
+            reward_cumulated += reward
+            training_steps += 1
+            step_i += 1
+            if rank == 0 and training_steps % batch_per_epoch == 0:
+                run_test = True
+                epoch_i += 1
+        if rank == 0:
+            agent.logger.msg(f'{training_steps} training reward: ' + str(reward_cumulated))
+            agent.logger.tb_scalar('training reward', reward_cumulated, training_steps)
+            if run_test:
+                agent.logger.msg(f'{epoch_i} test start:')
+                avg_reward, avg_steps = test(agent, cfg['agent_test_episodes'])
+                agent.logger.tb_scalar('avg_reward', avg_reward, epoch_i)
+                agent.logger.tb_scalar('avg_steps', avg_steps, epoch_i)
+                agent.logger.tb_scalar('epsilon', agent.exploration_method.epsilon, epoch_i)
+                agent.logger.msg(f'{epoch_i} avg_reward: ' + str(avg_reward))
+                agent.logger.msg(f'{epoch_i} avg_steps: ' + str(avg_steps))
+                agent.logger.msg(f'{epoch_i} epsilon: ' + str(agent.exploration_method.epsilon))
+
+        episode += 1
+
+
+class AsyncDQNPlayGround:
+    def __init__(self, agent: AsynDQNAgent, env: list, cfg: Hyperparameters):
+        self.agent = agent
+        self.env_list = env
+        self.cfg = cfg
+        self.worker_num = cfg['worker_num']
+        self.training_steps_each_worker = int(self.cfg['training_steps'] / self.worker_num)
+
+    def train(self):
+        mp.set_start_method('spawn', force=True)
+        processes = []
+        for rank in range(self.worker_num):
+            p = mp.Process(target=train, args=(rank, self.agent, self.env_list[rank],
+                                               self.training_steps_each_worker,
+                                               self.cfg['no_op'],
+                                               int(self.cfg['batch_num_per_epoch'] / self.worker_num)))
+            p.start()
+            processes.append(p)
+        for p in processes:
+            p.join()
+
+
+
+
+
+def main():
+    logger = Logger(cfg['env_name'], cfg['log_path'])
+    logger.msg('\nparameters:' + str(cfg))
+    envs = [EnvWrapper(cfg['env_name'], repeat_action_probability=0,
+                       frameskip=cfg['skip_k_frame'])
+            for _ in range(cfg['worker_num'])]
+    async_dqn_agent = AsynDQNAgent(cfg['input_frame_width'], cfg['input_frame_height'], envs[0].action_space,
+                                   cfg['mini_batch_size'], cfg['replay_buffer_size'], cfg['replay_start_size'],
+                                   cfg['learning_rate'], cfg['step_c'], cfg['agent_saving_period'], cfg['gamma'],
+                                   cfg['training_steps'], cfg['phi_channel'], cfg['epsilon_max'], cfg['epsilon_min'],
+                                   cfg['exploration_steps'], cfg['device'], logger)
+    dqn_pg = AsyncDQNPlayGround(async_dqn_agent, envs, cfg)
+    dqn_pg.train()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/configs/async_dqn.yaml b/configs/async_dqn.yaml
new file mode 100644
index 0000000..e95fb65
--- /dev/null
+++ b/configs/async_dqn.yaml
@@ -0,0 +1,22 @@
+mini_batch_size: 32
+batch_num_per_epoch: 1_000
+replay_buffer_size: 1000
+training_steps: 50_000_000
+skip_k_frame: 4
+phi_channel: 4
+device: 'cuda:0'
+input_frame_width: 84
+input_frame_height: 84
+replay_start_size: 100
+gamma: 0.99
+no_op: 30
+save_path: './exps/'
+log_path: '../exps/dqn/'
+learning_rate: 0.00001
+step_c: 10_000
+epsilon_max: 1.
+epsilon_min: 0.1
+exploration_steps: 1_000_000
+epsilon_for_test: 0.05
+agent_test_episodes: 20
+agent_saving_period: 80000
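
Reviewer note: the `share_memory()` calls in `AsynDQNValueFunction` and the `mp.Process` loop in `AsyncDQNPlayGround.train` follow the Hogwild-style pattern from `torch.multiprocessing`: one network's parameters live in shared memory and several worker processes update them concurrently. The sketch below is only an illustration of that pattern, not part of this commit; the toy linear network, the random data, and the `worker` function are made up for demonstration. Assuming the relative config path `../configs/async_dqn.yaml` is intentional, the new script itself would be launched from the `algorithms/` directory, e.g. `python async_dqn.py --worker_num 4`.

# Minimal Hogwild-style sketch (illustration only, not part of this diff):
# a network whose parameters sit in shared memory is updated concurrently
# by several spawned worker processes, mirroring how async_dqn.py shares
# the AsynDQNAgent's value networks across workers.
import torch
import torch.nn as nn
import torch.multiprocessing as mp


def worker(rank: int, shared_net: nn.Module, steps: int):
    # Each worker builds its own optimizer, but the parameters it updates
    # are the shared ones, so all workers train the same network.
    optimizer = torch.optim.SGD(shared_net.parameters(), lr=1e-2)
    for _ in range(steps):
        x = torch.randn(8, 4)       # stand-in for a mini-batch of observations
        target = torch.zeros(8, 2)  # stand-in for TD targets
        loss = nn.functional.mse_loss(shared_net(x), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()            # applied directly to the shared parameters


if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)  # same start method async_dqn.py uses
    net = nn.Linear(4, 2)
    net.share_memory()                        # same call AsynDQNValueFunction makes
    workers = [mp.Process(target=worker, args=(rank, net, 100)) for rank in range(4)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()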