From da66630c3ae92e509aa88c013470e6d4a1bd4605 Mon Sep 17 00:00:00 2001 From: Naveen Prashanth <78990165+gnpaone@users.noreply.github.com> Date: Sun, 27 Mar 2022 19:44:33 +0530 Subject: [PATCH] A3C files --- Lunar-Lander-Continuos-v2.ipynb | 531 ++++++++++++++++++++++++++++++++ 1 file changed, 531 insertions(+) create mode 100644 Lunar-Lander-Continuos-v2.ipynb diff --git a/Lunar-Lander-Continuos-v2.ipynb b/Lunar-Lander-Continuos-v2.ipynb new file mode 100644 index 0000000..3a53d24 --- /dev/null +++ b/Lunar-Lander-Continuos-v2.ipynb @@ -0,0 +1,531 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Lunar_lander (2).ipynb", + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "##LunarLanderContinuous-v2 by OpenAI Gym\n", + "**Importing Modules**" + ], + "metadata": { + "id": "YQGhuaNsRLO1" + } + }, + { + "cell_type": "code", + "source": [ + "!pip3 install box2d-py\n", + "import gym\n", + "import pickle\n", + "import tensorflow as tf\n", + "import tensorflow.compat.v1 as tf1\n", + "tf1.disable_v2_behavior()\n", + "from tensorflow import keras\n", + "from keras import layers, initializers, regularizers\n", + "import numpy as np\n", + "import threading\n", + "from functools import reduce\n", + "import time\n", + "import os\n", + "from collections import deque\n", + "import matplotlib.pyplot as plt" + ], + "metadata": { + "id": "Q0OLWrL_RQ6y" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Initial prep work**" + ], + "metadata": { + "id": "e1LXGgThRWRy" + } + }, + { + "cell_type": "code", + "source": [ + "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", + "\n", + "checkpoint_path = \"./models_checkpoints\"\n", + "try:\n", + "\n", + " os.mkdir(checkpoint_path)\n", + "except FileExistsError:\n", + " pass\n", + "\n", + "\n", + "class StateTrasitionRecorder:\n", + "\n", + " def __init__(self):\n", + " self.recorder_memory = deque()\n", + "\n", + " def save_state_transition(self, transition):\n", + " self.recorder_memory.append(transition)\n", + "\n", + " def flush_recorder_memory(self):\n", + " self.recorder_memory = deque()\n", + "\n", + "\n", + "class RolloutBuffer(StateTrasitionRecorder):\n", + "\n", + " def __init__(self, policy_net_args):\n", + " super().__init__()\n", + " self.rollout_memory = deque()\n", + " self.gamma = policy_net_args[\"gamma\"]\n", + "\n", + " def save_rollout(self, episode):\n", + " complete_episode = self.compute_total_rewards(episode, self.gamma)\n", + " self.rollout_memory.append(complete_episode)\n", + " self.flush_recorder_memory()\n", + "\n", + " def compute_total_rewards(self, episode_transitions, gamma):\n", + " states, actions, rewards, nex_states, dones = zip(*episode_transitions)\n", + " Q_s_a = []\n", + "\n", + " for i in range(len(rewards)):\n", + " Q_i = 0\n", + " for j in range(i, len(rewards)):\n", + " Q_i += rewards[j] * self.gamma ** (j - i)\n", + "\n", + " Q_s_a.append(Q_i)\n", + "\n", + " episode = deque(zip(states, actions, rewards,\n", + " nex_states, dones, Q_s_a))\n", + "\n", + " return(episode)\n", + "\n", + " def unroll_state_transitions(self):\n", + " states = ()\n", + " actions = ()\n", + " next_states = ()\n", + " rewards = ()\n", + " dones = ()\n", + " Q_sa = ()\n", + "\n", + " for episode in self.rollout_memory:\n", + " ep_states, ep_actions, ep_next_states, ep_rewards, ep_dones, ep_Q_s_a = zip(\n", + " 
*episode)\n", + "\n", + " states += ep_states\n", + " actions += ep_actions\n", + " next_states += ep_next_states\n", + " rewards += ep_rewards\n", + " dones += ep_dones\n", + " Q_sa += ep_Q_s_a\n", + "\n", + " states = np.asarray(states)\n", + " actions = np.asarray(actions)\n", + " next_states = np.asarray(next_states)\n", + " rewards = np.asarray(rewards)\n", + " dones = np.asarray(dones, dtype=int)\n", + " Q_sa = np.asarray(Q_sa).reshape(-1, 1)\n", + "\n", + " return states, actions, next_states, rewards, dones, Q_sa\n", + "\n", + " def flush_rollout_memory(self):\n", + " self.rollout_memory = deque()\n", + "\n", + "\n", + "def build_networks(network_name, num_Hlayers, activations_Hlayers, Hlayer_sizes, n_output_units, output_layer_activation, regularization_constant, network_type, input_features,):\n", + " assert(num_Hlayers == (len(activations_Hlayers)) and num_Hlayers ==\n", + " len(Hlayer_sizes))\n", + "\n", + " with tf1.variable_scope(network_type):\n", + "\n", + " network = tf1.layers.Dense(Hlayer_sizes[0], activation=activations_Hlayers[0], kernel_initializer=tf.initializers.glorot_normal(),\n", + " kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), name=\"Layer_1\")(input_features)\n", + "\n", + " for layer in range(1, num_Hlayers):\n", + "\n", + " network = tf1.layers.Dense(units=Hlayer_sizes[layer], kernel_initializer=tf.initializers.glorot_normal(), kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=activations_Hlayers[layer], name=(\n", + " \"Layer_\" + str(layer + 1)))(network)\n", + "\n", + " if network_type == \"Actor\":\n", + " mu = tf1.layers.Dense(units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(),\n", + " kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=output_layer_activation, name=\"mu\")(network)\n", + "\n", + " covariance = tf1.layers.Dense(\n", + " units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(), kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=tf.nn.softplus, name=\"covariance\")(network)\n", + " \n", + " params = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES, network_name + \"/\" + network_type)\n", + "\n", + " return mu, covariance, params\n", + "\n", + " else:\n", + "\n", + " critic = tf1.layers.Dense(units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(\n", + " ), activation=output_layer_activation, kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), name=\"V\")(network)\n", + " params = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES, network_name + \"/\" + network_type)\n", + "\n", + " return critic, params\n", + "\n", + "\n", + "class ComputationGraph:\n", + " def __init__(self, name, policy_network_args, value_function_network_args):\n", + " super().__init__(policy_network_args)\n", + "\n", + " with tf1.variable_scope(name):\n", + " self.actor_optimizer = policy_network_args[\"optimizer\"]\n", + " self.critic_optimizer = value_function_network_args[\"optimizer\"]\n", + "\n", + " self.st_placeholder = tf1.placeholder(dtype=tf.float32, shape=[\n", + " None, policy_network_args[\"state_space_size\"]], name=\"State\")\n", + "\n", + " self.rewards_placeholder = tf1.placeholder(\n", + " tf.float32, shape=[None, 1], name=\"rewards\")\n", + " self.actions_placeholder = tf1.placeholder(\n", + " tf.float32, shape=[None, policy_network_args[\"action_space_size\"]], name=\"actions\")\n", + " self.dones_placeholder = tf1.placeholder(\n", + " 
tf.float32, shape=[None, 1], name=\"dones\")\n", + "\n", + " self.Qsa_placeholder = tf1.placeholder(\n", + " dtype=tf.float32, shape=[None, 1], name=\"Q_sa\")\n", + "\n", + " self.mu, self.covariance, self.actor_params = build_networks(name, policy_network_args[\"num_Hlayers\"], policy_network_args[\"activations_Hlayers\"], policy_network_args[\n", + " \"Hlayer_sizes\"], policy_network_args[\"n_output_units\"], policy_network_args[\"output_layer_activation\"], policy_network_args[\"regularization_constant\"], \"Actor\", self.st_placeholder)\n", + "\n", + " self.critic, self.critic_params = build_networks(name, value_function_network_args[\"num_Hlayers\"], value_function_network_args[\"activations_Hlayers\"], value_function_network_args[\n", + " \"Hlayer_sizes\"], value_function_network_args[\"n_output_units\"], value_function_network_args[\"output_layer_activation\"], policy_network_args[\"regularization_constant\"], \"Critic\", self.st_placeholder)\n", + "\n", + " with tf1.variable_scope(\"Train_value_function_estimator\"):\n", + "\n", + " self.value_function_net_cost = tf.losses.mean_squared_error(\n", + " self.Qsa_placeholder, self.critic) + tf1.losses.get_regularization_loss(scope=name + \"/\" + \"Critic\")\n", + "\n", + " tf1.summary.scalar(\"Critic_Cost\", self.value_function_net_cost)\n", + "\n", + " if name == \"Global_Agent\":\n", + " for variable in self.actor_params:\n", + " var_name = \"Actor_\" + variable.name.replace(\"kernel:0\", \"w\").replace(\"bias:0\", \"b\")\n", + " tf.summary.histogram(var_name, variable)\n", + "\n", + " for variable in self.critic_params:\n", + " var_name = \"Critic_\" + variable.name.replace(\"kernel:0\", \"w\").replace(\"bias:0\", \"b\")\n", + " tf.summary.histogram(var_name, variable)\n", + "\n", + " with tf1.variable_scope(\"Train_policy_network\"):\n", + "\n", + " self.advantage_funtion = tf.math.subtract(\n", + " self.Qsa_placeholder, self.critic)\n", + "\n", + " self.probability_density_func = tf1.distributions.Normal(\n", + " self.mu, self.covariance)\n", + "\n", + " self.log_prob_a = self.probability_density_func.log_prob(\n", + " self.actions_placeholder)\n", + "\n", + " auxiliary = tf.multiply(\n", + " self.log_prob_a, self.advantage_funtion)\n", + "\n", + " entropy = self.probability_density_func.entropy()\n", + "\n", + " self.auxiliary = policy_network_args[\"Entropy\"] * \\\n", + " entropy + auxiliary\n", + "\n", + " self.policy_net_cost = tf.reduce_sum(-self.auxiliary) + tf1.losses.get_regularization_loss(scope=name + \"/\" + \"Actor\")\n", + "\n", + " self.summary_policy_cost = tf.summary.scalar(\"Policy_Cost\", self.policy_net_cost)\n", + "\n", + " with tf.name_scope(\"choose_a\"):\n", + "\n", + " self.action = tf1.clip_by_value(self.probability_density_func.sample(\n", + " 1), policy_network_args[\"action_space_lower_bound\"], policy_network_args[\"action_space_upper_bound\"])\n", + "\n", + " with tf.name_scope(\"get_grad\"):\n", + " self.actor_grads = tf1.gradients(self.policy_net_cost, self.actor_params)\n", + " self.critic_grads = tf1.gradients(self.value_function_net_cost, self.critic_params)\n", + "\n", + " for act_grad, critic_grad in zip(self.actor_grads, self.critic_grads):\n", + " var_name_actor = \"Actor_\" + act_grad.name.replace(\"Addn\", \"w\")\n", + " var_name_critic = \"Critic_\" + critic_grad.name.replace(\"Addn\", \"w\")\n", + " tf.summary.histogram(var_name_actor, act_grad)\n", + " tf.summary.histogram(var_name_critic, critic_grad)\n", + "\n", + " self.summaries = tf1.summary.merge_all()\n" + ], + "metadata": { + 
"id": "mXiDeGCiRf-J" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Training the Model**" + ], + "metadata": { + "id": "oLy8Xy-bRxvb" + } + }, + { + "cell_type": "code", + "source": [ + "class RLAgent(ComputationGraph, RolloutBuffer):\n", + " def __init__(self, name, policy_network_args, value_function_network_args, session, summary_writer, Global_Agent=None):\n", + " super().__init__(name, policy_network_args, value_function_network_args)\n", + "\n", + " self.current_num_epi = 0\n", + " self.env = gym.make('LunarLanderContinuous-v2')\n", + " self.total_number_episodes = policy_network_args[\"total_number_episodes\"]\n", + " self.num_episodes_before_update = policy_network_args[\"number_of_episodes_before_update\"]\n", + " self.Global_Agent = Global_Agent\n", + " self.ep_rewards = []\n", + " self.frequency_printing_statistics = policy_network_args[\"frequency_of_printing_statistics\"]\n", + " self.episodes_back = policy_network_args[\"episodes_back\"]\n", + " self.rendering_frequency = policy_network_args[\"frequency_of_rendering_episode\"]\n", + " self.max_steps = policy_network_args[\"max_steps_per_episode\"]\n", + " self.summary_writer = summary_writer\n", + " self.name = name\n", + "\n", + " self.session = session\n", + " if Global_Agent is not None:\n", + " with tf.name_scope(name):\n", + "\n", + " with tf.name_scope('sync'):\n", + " with tf.name_scope('pull_from_global'):\n", + " self.pull_actor_params_op = [local_params.assign(\n", + " global_params) for local_params, global_params in zip(self.actor_params, Global_Agent.actor_params)]\n", + " self.pull_critic_params_op = [local_params.assign(\n", + " global_params) for local_params, global_params in zip(self.critic_params, Global_Agent.critic_params)]\n", + " with tf.name_scope(\"push_to_global\"):\n", + " self.push_actor_params_op = self.actor_optimizer.apply_gradients(zip(self.actor_grads, self.Global_Agent.actor_params))\n", + " self.push_critic_params_op = self.critic_optimizer.apply_gradients(zip(self.critic_grads, Global_Agent.critic_params))\n", + "\n", + " def update_Global_Agent(self, feed_dict):\n", + " _, _, = self.session.run([self.push_actor_params_op,\n", + " self.push_critic_params_op], feed_dict)\n", + "\n", + " def save_summary(self, feed_dict):\n", + " summary = self.session.run(self.Global_Agent.summaries, feed_dict)\n", + " self.summary_writer.add_summary(summary, self.Global_Agent.current_num_epi)\n", + "\n", + " def pull_from_global(self):\n", + " self.session.run([self.pull_actor_params_op,\n", + " self.pull_critic_params_op])\n", + "\n", + " def take_action(self, state):\n", + " state = state.reshape(-1, 8)\n", + " action = self.session.run([self.action], feed_dict={\n", + " self.st_placeholder: state})\n", + " return action[0].reshape(2,)\n", + "\n", + " def collect_rollouts(self, n_rolls, max_steps, render=False):\n", + " for i in range(n_rolls):\n", + " n_steps = 0\n", + " state = self.env.reset()\n", + " done = False\n", + " sum_rewards = 0\n", + " while not done and n_steps <= max_steps:\n", + " if render:\n", + " self.env.render()\n", + "\n", + " action = self.take_action(state)\n", + " next_state, reward, done, info = self.env.step(action)\n", + "\n", + " if not done and n_steps == max_steps:\n", + " state_feed = next_state.reshape(-1, 8)\n", + " reward = reward + float(self.session.run([self.critic], feed_dict={self.st_placeholder: state_feed})[0])\n", + "\n", + " self.save_state_transition(\n", + " [state, action, reward, next_state, done])\n", 
+ "\n", + " sum_rewards += reward\n", + " state = next_state\n", + " n_steps += 1\n", + "\n", + " if self.name == \"Global_Agent\":\n", + " print(f\"Episode Reward: {sum_rewards}\")\n", + "\n", + " self.ep_rewards.append(sum_rewards)\n", + " self.save_rollout(self.recorder_memory)\n", + "\n", + " def training_loop(self):\n", + " \"\"\"Runs episodes in a loop and performs steps of gradient descent after every episode\"\"\"\n", + "\n", + " while not coord.should_stop() and self.Global_Agent.current_num_epi <= self.total_number_episodes:\n", + " self.collect_rollouts(\n", + " self.num_episodes_before_update, self.max_steps, render=False)\n", + "\n", + " states, actions, next_states, rewards, dones, Q_sa = self.unroll_state_transitions()\n", + "\n", + " feed_dict = {self.st_placeholder: states,\n", + " self.actions_placeholder: actions,\n", + " self.Qsa_placeholder: Q_sa}\n", + "\n", + " self.update_Global_Agent(feed_dict)\n", + " self.Global_Agent.current_num_epi += self.num_episodes_before_update\n", + "\n", + " feed_dict_global_summary = {self.Global_Agent.st_placeholder: states,\n", + " self.Global_Agent.actions_placeholder: actions,\n", + " self.Global_Agent.Qsa_placeholder: Q_sa}\n", + "\n", + " self.save_summary(feed_dict_global_summary)\n", + "\n", + " self.flush_rollout_memory()\n", + " self.pull_from_global()\n", + "\n", + " if self.Global_Agent.current_num_epi % self.frequency_printing_statistics == 0:\n", + "\n", + " average_reward = self.Global_Agent.compute_average_rewards(self.episodes_back)\n", + " print(\n", + " f\"Global ep number {self.Global_Agent.current_num_epi}: Reward = {average_reward}\")\n", + "\n", + "class Global_Agent(RLAgent):\n", + " def __init__(self, name, policy_network_args, value_function_network_args, session, summary_writer, child_agents=[]):\n", + " super().__init__(name, policy_network_args, value_function_network_args, session, summary_writer)\n", + " self.child_agents = child_agents\n", + " self.num_childs = len(child_agents)\n", + "\n", + " def compute_average_rewards(self, episodes_back):\n", + " \"\"\"Computes the average reward of each child agent going n episodes back, and returnes the average of those average rewards\"\"\"\n", + " reward = 0\n", + " for agent in self.child_agents:\n", + " agent_average_reward = reduce(\n", + " lambda x, y: x + y, agent.ep_rewards[-episodes_back:]) / episodes_back\n", + " reward += agent_average_reward\n", + "\n", + " reward /= self.num_childs\n", + "\n", + " return reward" + ], + "metadata": { + "id": "fKTk5Zt7R63o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Testing the model**" + ], + "metadata": { + "id": "5j_ph4KMS6xc" + } + }, + { + "cell_type": "code", + "source": [ + "if __name__ == \"__main__\":\n", + "\n", + " env = gym.make('LunarLanderContinuous-v2')\n", + " action_space_upper_bound = env.action_space.high\n", + " action_space_lower_bound = env.action_space.low\n", + " subdir = time.strftime(\"%Y%m%d-%H%M%S\", time.localtime())\n", + " logdir = \"./summary/\" + subdir\n", + " writer = tf.summary.create_file_writer(logdir)\n", + " sess = tf1.Session()\n", + "\n", + " policy_net_args = {\"num_Hlayers\": 2,\n", + " \"activations_Hlayers\": [\"relu\", \"relu\"],\n", + " \"Hlayer_sizes\": [100, 100],\n", + " \"n_output_units\": 2,\n", + " \"output_layer_activation\": tf.nn.tanh,\n", + " \"state_space_size\": 8,\n", + " \"action_space_size\": 2,\n", + " \"Entropy\": 0.01,\n", + " \"action_space_upper_bound\": action_space_upper_bound,\n", + " 
\"action_space_lower_bound\": action_space_lower_bound,\n", + " \"optimizer\": tf1.train.RMSPropOptimizer(0.0001),\n", + " \"total_number_episodes\": 5000,\n", + " \"number_of_episodes_before_update\": 1,\n", + " \"frequency_of_printing_statistics\": 100,\n", + " \"frequency_of_rendering_episode\": 1000,\n", + " \"number_child_agents\": 8,\n", + " \"episodes_back\": 20,\n", + " \"gamma\": 0.99,\n", + " \"regularization_constant\": 0.01,\n", + " \"max_steps_per_episode\": 2000\n", + "\n", + " }\n", + "\n", + " valuefunction_net_args = {\"num_Hlayers\": 2,\n", + " \"activations_Hlayers\": [\"relu\", \"relu\"],\n", + " \"Hlayer_sizes\": [100, 64],\n", + " \"n_output_units\": 1,\n", + " \"output_layer_activation\": \"linear\",\n", + " \"state_space_size\": 8,\n", + " \"action_space_size\": 2,\n", + " \"optimizer\": tf1.train.RMSPropOptimizer(0.01),\n", + " \"regularization_constant\": 0.01}\n", + "\n", + " global_agent = Global_Agent(\"Global_Agent\", policy_net_args, valuefunction_net_args, sess, writer)\n", + "\n", + " child_agents = []\n", + "\n", + " for i in range(policy_net_args[\"number_child_agents\"]):\n", + " i_name = f\"ChildAgent_{i}\"\n", + " child_agents.append(RLAgent(i_name, policy_net_args, valuefunction_net_args, sess, writer, global_agent))\n", + "\n", + " global_agent.child_agents = child_agents\n", + " global_agent.num_childs = len(child_agents)\n", + "\n", + " saver = tf1.train.Saver()\n", + "\n", + " coord = tf.train.Coordinator()\n", + "\n", + " if len(os.listdir(checkpoint_path)) == 0:\n", + "\n", + " sess.run(tf1.global_variables_initializer())\n", + " else:\n", + " saver.restore(sess, checkpoint_path + \"/variables.ckpt\")\n", + "\n", + " child_agents_threads = []\n", + "\n", + " subdir = time.strftime(\"%Y%m%d-%H%M%S\", time.localtime())\n", + " logdir = \"./summary/\" + subdir\n", + " writer = tf1.summary.FileWriter(logdir)\n", + " writer.add_graph(sess.graph)\n", + "\n", + " for child_agent in child_agents:\n", + " def job(): return child_agent.training_loop()\n", + " t = threading.Thread(target=job)\n", + " t.start()\n", + " child_agents_threads.append(t)\n", + "\n", + " coord.join(child_agents_threads)\n", + " saver.save(sess, checkpoint_path + \"/variables.ckpt\")\n", + "\n", + " for i in range(1):\n", + "\n", + " global_agent.collect_rollouts(10, 2000, render=True)\n", + " global_agent.collect_rollouts(90, 2000)\n", + "\n", + " rewards = global_agent.ep_rewards\n", + " average = sum(rewards)/len(rewards)\n", + " average = [average] * 100\n", + "\n", + " fig, ax = plt.subplots()\n", + "\n", + " ax.plot(rewards, label=\"Episode Reward\")\n", + " ax.plot(average, label=\"Average\")\n", + " ax.legend(loc=\"best\")\n", + " plt.show()" + ], + "metadata": { + "id": "AWN9xhEVS9Q6" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file