linux_ppo_vec_envs_image copy.py

#Modified this code - https://github.com/DeepReinforcementLearning/DeepReinforcementLearningInAction/blob/master/Chapter%204/Ch4_book.ipynb
#Also, modified this code - https://github.com/higgsfield/RL-Adventure-2/blob/master/1.actor-critic.ipynb
# Also, modified this code - https://github.com/ericyangyu/PPO-for-Beginners/blob/9abd435771aa84764d8d0d1f737fa39118b74019/ppo.py#L151
# Got a lot of help from the subreddit - reinforcement_learning
#Incorporated ideas from here - https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/
if __name__ == '__main__':

    import numpy as np
    import gymnasium as gym
    from gymnasium.wrappers import AtariPreprocessing
    import torch
    import random
    import matplotlib.pyplot as plt
    from torch import nn
    import torchvision as tv
    import torch.nn.functional as F
    torch.manual_seed(798)
    import matplotlib.pyplot as plt
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    from collections import deque
    from torch.optim.lr_scheduler import MultiStepLR

    num_envs = 12
    ent_coeff = 0.1
    batches = 4
    channels = 3
    learning_rate = 0.0003
    episodes = 1500
    gae_lambda = 0.95
    stack_num = 8
    num_channels = stack_num

    gamma = 0.99
    clip = 0.2
    rollout_steps = 400
    training_iters = 4
    grad_clip = 0.5


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    import envpool

    env = envpool.make("Breakout-v5", env_type="gymnasium", num_envs=num_envs, stack_num = stack_num, episodic_life = True)
    print("envpool observation_space.shape = ", env.observation_space.shape)
    print("envpool action_space.n= ", env.action_space.n)
    # env = AtariPreprocessing(env)
    # env = gym.vector.make("BreakoutNoFrameskip-v4", num_envs=num_envs,wrappers=AtariPreprocessing)
    actor_PATH = './actor_model' + 'breakout' + '.pt'
    critic_PATH = './critic_model ' + 'pong'+ '.pt'
    square_size = env.observation_space.shape[-1]
    print("square_size = ", square_size)

    class Actor(nn.Module):
        def __init__(self, state_size, action_size):
            super(Actor, self).__init__()
            self.state_size = state_size
            self.action_size = action_size
            self.conv1 = nn.Conv2d(num_channels, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 3)
            self.conv3 = nn.Conv2d(16, 32, 3)
            self.fc1 = nn.Linear(2048, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, action_size)
            self.last = nn.Softmax(dim=-1)

        def forward(self,x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.pool(F.relu(self.conv3(x)))
            x = torch.flatten(x, 1)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            x = self.last(x)
            return x

    class Critic(nn.Module):
        def __init__(self, state_size, action_size):
            super(Critic, self).__init__()
            self.state_size = state_size
            self.action_size = action_size
            self.conv1 = nn.Conv2d(num_channels, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 3)
            self.conv3 = nn.Conv2d(16, 32, 3)
            self.fc1 = nn.Linear(2048, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 1)

        def forward(self, x):
            x = x.reshape(-1, stack_num, square_size, square_size)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.pool(F.relu(self.conv3(x)))
            x = torch.flatten(x, 1)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    actor = Actor(env.observation_space.shape[-1], env.action_space.n).to(device)
    critic = Critic(env.observation_space.shape[-1], 1).to(device)
    policy_opt = torch.optim.Adam(params = actor.parameters(), lr = learning_rate)
    value_opt = torch.optim.Adam(params = critic.parameters(), lr = learning_rate)
    policy_scheduler = MultiStepLR(policy_opt, milestones = [700], gamma=0.1,verbose=True)
    value_scheduler = MultiStepLR(policy_opt, milestones = [700], gamma=0.1,verbose=True)

    obs = torch.tensor(env.reset()[0], dtype=torch.float32).to(device)

    tot_rewards = np.array([0 for i in range(num_envs)], dtype=float)
    final_scores = []
    last_n_rewards = deque(maxlen=10)
    def rollout(obs): #todo Why can't the rollout function access it from outside?

        all_rewards = []
        all_actions = []
        all_actions_probs = []
        all_observations = []
        all_dones = []
        global tot_rewards #todo Why did I have to declare tot_rewards as global?

        for i in range(rollout_steps):
            obs = obs.reshape(num_envs, stack_num, square_size, square_size)
            act_probs = torch.distributions.Categorical(actor(obs.to(device)).squeeze())
            action = act_probs.sample().squeeze()
            action = action.cpu().detach().numpy()
            next_state, reward, done, truncated, info = env.step(action)
            action = torch.tensor(action, dtype=torch.float32).to(device)

            # These statistics help determine how well the agent is performing.
            tot_rewards += reward
            for reward_val, done_val in zip(tot_rewards, done):
                if done_val:
                    last_n_rewards.append(reward_val)
                    final_scores.append(reward_val)
            tot_rewards[done] = 0

            all_rewards.append(reward)
            all_dones.append(done)
            all_observations.append(obs.cpu().detach().numpy())
            all_actions.append(action.cpu().detach().numpy())
            all_actions_probs.append(act_probs.log_prob(action).cpu().detach().numpy())
            obs = torch.tensor(next_state, dtype=torch.float32)


        # Computing advantages over here, A = Q - V and returns, Q = V + A 
        eps_rew = critic(obs.to(device)).cpu().detach().numpy().reshape(-1)
        next_adv = np.array([0 for i in range(num_envs)], dtype=float)
        batch_obs = torch.Tensor(all_observations).reshape(-1, stack_num*num_envs, square_size, square_size)
        val_next_state = eps_rew.copy()
        state_value_list = []
        inv_eps_adv_list = []
        inv_eps_ret_list = []
        for reward,done,obs in zip(reversed(all_rewards), reversed(all_dones), reversed(batch_obs)):
            next_adv[done] = 0
            val_next_state[done] = 0
            val_current_state = critic(obs.to(device)).cpu().detach().numpy().reshape(-1)

            delta = reward + gamma*val_next_state-val_current_state
            adv = delta + gae_lambda * gamma * next_adv
            returns = val_current_state + adv

            inv_eps_adv_list.append(adv)
            inv_eps_ret_list.append(returns)
            next_adv = adv.copy()
            val_next_state = val_current_state.copy()
        final_adv_list = []

        for a in reversed(inv_eps_adv_list):
            final_adv_list.append(a)
        for a in reversed(inv_eps_ret_list):
            state_value_list.append(a)

        # Returning all the data from the rollout. `obs` needs to be returned because the episode might not be over
        # for some environment
        batch_obs = torch.Tensor(all_observations).reshape(-1,env.observation_space.shape[1]).to(device)
        batch_act = torch.Tensor(np.array(all_actions).reshape(-1)).to(device)
        batch_log_probs = torch.Tensor(np.array(all_actions_probs).reshape(-1)).to(device)
        batch_rtgs = torch.Tensor(state_value_list).reshape(-1).to(device)
        batch_advantages = torch.Tensor(final_adv_list).reshape(-1).to(device)
        return batch_obs, batch_act, batch_log_probs, batch_rtgs, batch_advantages, obs

    #Learning Phase
    for episode in range(episodes):
        print("episodes = ", episode)
        all_obs, all_act, all_log_probs, all_rtgs, all_advantages, obs = rollout(obs)
        all_obs = all_obs.reshape(-1, stack_num, square_size, square_size)

        assert (all_obs.shape == (rollout_steps*num_envs, num_channels, square_size, square_size))
        assert (all_act.shape == (rollout_steps*num_envs,))
        assert (all_log_probs.shape == (rollout_steps*num_envs,))
        assert (all_rtgs.shape == (rollout_steps*num_envs,))
        assert (all_advantages.shape == (rollout_steps*num_envs,))

        # Standardize all advantages
        all_advantages = (all_advantages - all_advantages.mean()) / (all_advantages.std() + 1e-8)

        for i in range(training_iters):
            print("Training Iteration = ", i)
            total_examples = num_envs * rollout_steps
            batch_size = total_examples // batches
            batch_starts = np.arange(0, total_examples, batch_size)
            indices = np.arange(total_examples, dtype=np.int32)
            np.random.shuffle(indices)

            for batch_start in batch_starts:
                batch_end = batch_start + batch_size
                batch_index = indices[batch_start:batch_end]
                batch_obs = all_obs[batch_index]
                batch_act = all_act[batch_index]
                batch_log_probs = all_log_probs[batch_index]
                batch_rtgs = all_rtgs[batch_index]
                batch_advantages = all_advantages[batch_index]

                value = critic(batch_obs).squeeze()
                assert(value.ndim==1)
                policy = actor(batch_obs)
                act_probs = torch.distributions.Categorical(policy)
                batch_entropy = act_probs.entropy().mean()
                log_probs = act_probs.log_prob(batch_act).squeeze()
                ratios = torch.exp(log_probs - batch_log_probs)
                assert(ratios.ndim==1)
                surr1 = ratios*batch_advantages
                assert (surr1.ndim == 1)
                surr2 = torch.clamp(ratios, 1 - clip, 1 + clip)*batch_advantages
                assert (surr2.ndim == 1)
                actor_loss = -torch.min(surr1, surr2).mean()  - ent_coeff*batch_entropy
                critic_loss = (value - batch_rtgs).pow(2).mean()

                policy_opt.zero_grad()
                actor_loss.backward()

                # Gradient Clipping
                torch.nn.utils.clip_grad_norm_(actor.parameters(), grad_clip)
                policy_opt.step()

                value_opt.zero_grad()
                critic_loss.backward()
                
                # Gradient Clipping

                torch.nn.utils.clip_grad_norm_(critic.parameters(), grad_clip) 

                value_opt.step()

        value_scheduler.step()
        policy_scheduler.step()

        if episode % 100 == 0:
            print("Saved")
            torch.save(actor.state_dict(), actor_PATH)
            torch.save(critic.state_dict(), critic_PATH)

    plt.plot(final_scores)
    plt.show()