diff --git a/Lunar-Lander-Continuos=v2.ipynb b/Lunar-Lander-Continuos=v2.ipynb deleted file mode 100644 index 2d3ae7a..0000000 --- a/Lunar-Lander-Continuos=v2.ipynb +++ /dev/null @@ -1,542 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Lunar_lander.ipynb", - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "##LunarLanderContinuous-v2 by OpenAI Gym\n", - "**Importing Modules**" - ], - "metadata": { - "id": "YQGhuaNsRLO1" - } - }, - { - "cell_type": "code", - "source": [ - "!pip3 install box2d-py\n", - "import gym\n", - "import pickle\n", - "import tensorflow as tf\n", - "import tensorflow.compat.v1 as tf1\n", - "tf1.disable_v2_behavior()\n", - "from tensorflow import keras\n", - "from keras import layers, initializers, regularizers\n", - "import numpy as np\n", - "import threading\n", - "from functools import reduce\n", - "import time\n", - "import os\n", - "from collections import deque\n", - "import matplotlib.pyplot as plt" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Q0OLWrL_RQ6y", - "outputId": "d8ba50be-7060-47e5-adf7-ad140dfa3fdb" - }, - "execution_count": null, - "outputs": [] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "**Initial prep work**" - ], - "metadata": { - "id": "e1LXGgThRWRy" - } - }, - { - "cell_type": "code", - "source": [ - "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", - "\n", - "checkpoint_path = \"./models_checkpoints\"\n", - "try:\n", - "\n", - " os.mkdir(checkpoint_path)\n", - "except FileExistsError:\n", - " pass\n", - "\n", - "\n", - "class StateTrasitionRecorder:\n", - "\n", - " def __init__(self):\n", - " self.recorder_memory = deque()\n", - "\n", - " def save_state_transition(self, transition):\n", - " self.recorder_memory.append(transition)\n", - "\n", - " def flush_recorder_memory(self):\n", - " self.recorder_memory = deque()\n", - "\n", - "\n", - "class RolloutBuffer(StateTrasitionRecorder):\n", - "\n", - " def __init__(self, policy_net_args):\n", - " super().__init__()\n", - " self.rollout_memory = deque()\n", - " self.gamma = policy_net_args[\"gamma\"]\n", - "\n", - " def save_rollout(self, episode):\n", - " complete_episode = self.compute_total_rewards(episode, self.gamma)\n", - " self.rollout_memory.append(complete_episode)\n", - " self.flush_recorder_memory()\n", - "\n", - " def compute_total_rewards(self, episode_transitions, gamma):\n", - " states, actions, rewards, nex_states, dones = zip(*episode_transitions)\n", - " Q_s_a = []\n", - "\n", - " for i in range(len(rewards)):\n", - " Q_i = 0\n", - " for j in range(i, len(rewards)):\n", - " Q_i += rewards[j] * self.gamma ** (j - i)\n", - "\n", - " Q_s_a.append(Q_i)\n", - "\n", - " episode = deque(zip(states, actions, rewards,\n", - " nex_states, dones, Q_s_a))\n", - "\n", - " return(episode)\n", - "\n", - " def unroll_state_transitions(self):\n", - " states = ()\n", - " actions = ()\n", - " next_states = ()\n", - " rewards = ()\n", - " dones = ()\n", - " Q_sa = ()\n", - "\n", - " for episode in self.rollout_memory:\n", - " ep_states, ep_actions, ep_next_states, ep_rewards, ep_dones, ep_Q_s_a = zip(\n", - " *episode)\n", - "\n", - " states += ep_states\n", - " actions += ep_actions\n", - " next_states += ep_next_states\n", - " rewards += ep_rewards\n", - " dones += ep_dones\n", - " Q_sa += ep_Q_s_a\n", - "\n", - " states = np.asarray(states)\n", - " actions = np.asarray(actions)\n", - " next_states = np.asarray(next_states)\n", - " rewards = np.asarray(rewards)\n", - " dones = np.asarray(dones, dtype=int)\n", - " Q_sa = np.asarray(Q_sa).reshape(-1, 1)\n", - "\n", - " return states, actions, next_states, rewards, dones, Q_sa\n", - "\n", - " def flush_rollout_memory(self):\n", - " self.rollout_memory = deque()\n", - "\n", - "\n", - "def build_networks(network_name, num_Hlayers, activations_Hlayers, Hlayer_sizes, n_output_units, output_layer_activation, regularization_constant, network_type, input_features,):\n", - " assert(num_Hlayers == (len(activations_Hlayers)) and num_Hlayers ==\n", - " len(Hlayer_sizes))\n", - "\n", - " with tf1.variable_scope(network_type):\n", - "\n", - " network = tf1.layers.Dense(Hlayer_sizes[0], activation=activations_Hlayers[0], kernel_initializer=tf.initializers.glorot_normal(),\n", - " kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), name=\"Layer_1\")(input_features)\n", - "\n", - " for layer in range(1, num_Hlayers):\n", - "\n", - " network = tf1.layers.Dense(units=Hlayer_sizes[layer], kernel_initializer=tf.initializers.glorot_normal(), kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=activations_Hlayers[layer], name=(\n", - " \"Layer_\" + str(layer + 1)))(network)\n", - "\n", - " if network_type == \"Actor\":\n", - " mu = tf1.layers.Dense(units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(),\n", - " kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=output_layer_activation, name=\"mu\")(network)\n", - "\n", - " covariance = tf1.layers.Dense(\n", - " units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(), kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), activation=tf.nn.softplus, name=\"covariance\")(network)\n", - " \n", - " params = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES, network_name + \"/\" + network_type)\n", - "\n", - " return mu, covariance, params\n", - "\n", - " else:\n", - "\n", - " critic = tf1.layers.Dense(units=n_output_units, kernel_initializer=tf.initializers.glorot_normal(\n", - " ), activation=output_layer_activation, kernel_regularizer=tf.keras.regularizers.L2(l2=regularization_constant), name=\"V\")(network)\n", - " params = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES, network_name + \"/\" + network_type)\n", - "\n", - " return critic, params\n", - "\n", - "\n", - "class ComputationGraph:\n", - " def __init__(self, name, policy_network_args, value_function_network_args):\n", - " super().__init__(policy_network_args)\n", - "\n", - " with tf1.variable_scope(name):\n", - " self.actor_optimizer = policy_network_args[\"optimizer\"]\n", - " self.critic_optimizer = value_function_network_args[\"optimizer\"]\n", - "\n", - " self.st_placeholder = tf1.placeholder(dtype=tf.float32, shape=[\n", - " None, policy_network_args[\"state_space_size\"]], name=\"State\")\n", - "\n", - " self.rewards_placeholder = tf1.placeholder(\n", - " tf.float32, shape=[None, 1], name=\"rewards\")\n", - " self.actions_placeholder = tf1.placeholder(\n", - " tf.float32, shape=[None, policy_network_args[\"action_space_size\"]], name=\"actions\")\n", - " self.dones_placeholder = tf1.placeholder(\n", - " tf.float32, shape=[None, 1], name=\"dones\")\n", - "\n", - " self.Qsa_placeholder = tf1.placeholder(\n", - " dtype=tf.float32, shape=[None, 1], name=\"Q_sa\")\n", - "\n", - " self.mu, self.covariance, self.actor_params = build_networks(name, policy_network_args[\"num_Hlayers\"], policy_network_args[\"activations_Hlayers\"], policy_network_args[\n", - " \"Hlayer_sizes\"], policy_network_args[\"n_output_units\"], policy_network_args[\"output_layer_activation\"], policy_network_args[\"regularization_constant\"], \"Actor\", self.st_placeholder)\n", - "\n", - " self.critic, self.critic_params = build_networks(name, value_function_network_args[\"num_Hlayers\"], value_function_network_args[\"activations_Hlayers\"], value_function_network_args[\n", - " \"Hlayer_sizes\"], value_function_network_args[\"n_output_units\"], value_function_network_args[\"output_layer_activation\"], policy_network_args[\"regularization_constant\"], \"Critic\", self.st_placeholder)\n", - "\n", - " with tf1.variable_scope(\"Train_value_function_estimator\"):\n", - "\n", - " self.value_function_net_cost = tf.losses.mean_squared_error(\n", - " self.Qsa_placeholder, self.critic) + tf1.losses.get_regularization_loss(scope=name + \"/\" + \"Critic\")\n", - "\n", - " tf1.summary.scalar(\"Critic_Cost\", self.value_function_net_cost)\n", - "\n", - " if name == \"Global_Agent\":\n", - " for variable in self.actor_params:\n", - " var_name = \"Actor_\" + variable.name.replace(\"kernel:0\", \"w\").replace(\"bias:0\", \"b\")\n", - " tf.summary.histogram(var_name, variable)\n", - "\n", - " for variable in self.critic_params:\n", - " var_name = \"Critic_\" + variable.name.replace(\"kernel:0\", \"w\").replace(\"bias:0\", \"b\")\n", - " tf.summary.histogram(var_name, variable)\n", - "\n", - " with tf1.variable_scope(\"Train_policy_network\"):\n", - "\n", - " self.advantage_funtion = tf.math.subtract(\n", - " self.Qsa_placeholder, self.critic)\n", - "\n", - " self.probability_density_func = tf1.distributions.Normal(\n", - " self.mu, self.covariance)\n", - "\n", - " self.log_prob_a = self.probability_density_func.log_prob(\n", - " self.actions_placeholder)\n", - "\n", - " auxiliary = tf.multiply(\n", - " self.log_prob_a, self.advantage_funtion)\n", - "\n", - " entropy = self.probability_density_func.entropy()\n", - "\n", - " self.auxiliary = policy_network_args[\"Entropy\"] * \\\n", - " entropy + auxiliary\n", - "\n", - " self.policy_net_cost = tf.reduce_sum(-self.auxiliary) + tf1.losses.get_regularization_loss(scope=name + \"/\" + \"Actor\")\n", - "\n", - " self.summary_policy_cost = tf.summary.scalar(\"Policy_Cost\", self.policy_net_cost)\n", - "\n", - " with tf.name_scope(\"choose_a\"):\n", - "\n", - " self.action = tf1.clip_by_value(self.probability_density_func.sample(\n", - " 1), policy_network_args[\"action_space_lower_bound\"], policy_network_args[\"action_space_upper_bound\"])\n", - "\n", - " with tf.name_scope(\"get_grad\"):\n", - " self.actor_grads = tf1.gradients(self.policy_net_cost, self.actor_params)\n", - " self.critic_grads = tf1.gradients(self.value_function_net_cost, self.critic_params)\n", - "\n", - " for act_grad, critic_grad in zip(self.actor_grads, self.critic_grads):\n", - " var_name_actor = \"Actor_\" + act_grad.name.replace(\"Addn\", \"w\")\n", - " var_name_critic = \"Critic_\" + critic_grad.name.replace(\"Addn\", \"w\")\n", - " tf.summary.histogram(var_name_actor, act_grad)\n", - " tf.summary.histogram(var_name_critic, critic_grad)\n", - "\n", - " self.summaries = tf1.summary.merge_all()\n" - ], - "metadata": { - "id": "mXiDeGCiRf-J" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**Training the Model**" - ], - "metadata": { - "id": "oLy8Xy-bRxvb" - } - }, - { - "cell_type": "code", - "source": [ - "class RLAgent(ComputationGraph, RolloutBuffer):\n", - " def __init__(self, name, policy_network_args, value_function_network_args, session, summary_writer, Global_Agent=None):\n", - " super().__init__(name, policy_network_args, value_function_network_args)\n", - "\n", - " self.current_num_epi = 0\n", - " self.env = gym.make('LunarLanderContinuous-v2')\n", - " self.total_number_episodes = policy_network_args[\"total_number_episodes\"]\n", - " self.num_episodes_before_update = policy_network_args[\"number_of_episodes_before_update\"]\n", - " self.Global_Agent = Global_Agent\n", - " self.ep_rewards = []\n", - " self.frequency_printing_statistics = policy_network_args[\"frequency_of_printing_statistics\"]\n", - " self.episodes_back = policy_network_args[\"episodes_back\"]\n", - " self.rendering_frequency = policy_network_args[\"frequency_of_rendering_episode\"]\n", - " self.max_steps = policy_network_args[\"max_steps_per_episode\"]\n", - " self.summary_writer = summary_writer\n", - " self.name = name\n", - "\n", - " self.session = session\n", - " if Global_Agent is not None:\n", - " with tf.name_scope(name):\n", - "\n", - " with tf.name_scope('sync'):\n", - " with tf.name_scope('pull_from_global'):\n", - " self.pull_actor_params_op = [local_params.assign(\n", - " global_params) for local_params, global_params in zip(self.actor_params, Global_Agent.actor_params)]\n", - " self.pull_critic_params_op = [local_params.assign(\n", - " global_params) for local_params, global_params in zip(self.critic_params, Global_Agent.critic_params)]\n", - " with tf.name_scope(\"push_to_global\"):\n", - " self.push_actor_params_op = self.actor_optimizer.apply_gradients(zip(self.actor_grads, self.Global_Agent.actor_params))\n", - " self.push_critic_params_op = self.critic_optimizer.apply_gradients(zip(self.critic_grads, Global_Agent.critic_params))\n", - "\n", - " def update_Global_Agent(self, feed_dict):\n", - " _, _, = self.session.run([self.push_actor_params_op,\n", - " self.push_critic_params_op], feed_dict)\n", - "\n", - " def save_summary(self, feed_dict):\n", - " summary = self.session.run(self.Global_Agent.summaries, feed_dict)\n", - " self.summary_writer.add_summary(summary, self.Global_Agent.current_num_epi)\n", - "\n", - " def pull_from_global(self):\n", - " self.session.run([self.pull_actor_params_op,\n", - " self.pull_critic_params_op])\n", - "\n", - " def take_action(self, state):\n", - " state = state.reshape(-1, 8)\n", - " action = self.session.run([self.action], feed_dict={\n", - " self.st_placeholder: state})\n", - " return action[0].reshape(2,)\n", - "\n", - " def collect_rollouts(self, n_rolls, max_steps, render=False):\n", - " for i in range(n_rolls):\n", - " n_steps = 0\n", - " state = self.env.reset()\n", - " done = False\n", - " sum_rewards = 0\n", - " while not done and n_steps <= max_steps:\n", - " if render:\n", - " self.env.render()\n", - "\n", - " action = self.take_action(state)\n", - " next_state, reward, done, info = self.env.step(action)\n", - "\n", - " if not done and n_steps == max_steps:\n", - " state_feed = next_state.reshape(-1, 8)\n", - " reward = reward + float(self.session.run([self.critic], feed_dict={self.st_placeholder: state_feed})[0])\n", - "\n", - " self.save_state_transition(\n", - " [state, action, reward, next_state, done])\n", - "\n", - " sum_rewards += reward\n", - " state = next_state\n", - " n_steps += 1\n", - "\n", - " if self.name == \"Global_Agent\":\n", - " print(f\"Episode Reward: {sum_rewards}\")\n", - "\n", - " self.ep_rewards.append(sum_rewards)\n", - " self.save_rollout(self.recorder_memory)\n", - "\n", - " def training_loop(self):\n", - " \"\"\"Runs episodes in a loop and performs steps of gradient descent after every episode\"\"\"\n", - "\n", - " while not coord.should_stop() and self.Global_Agent.current_num_epi <= self.total_number_episodes:\n", - " self.collect_rollouts(\n", - " self.num_episodes_before_update, self.max_steps, render=False)\n", - "\n", - " states, actions, next_states, rewards, dones, Q_sa = self.unroll_state_transitions()\n", - "\n", - " feed_dict = {self.st_placeholder: states,\n", - " self.actions_placeholder: actions,\n", - " self.Qsa_placeholder: Q_sa}\n", - "\n", - " self.update_Global_Agent(feed_dict)\n", - " self.Global_Agent.current_num_epi += self.num_episodes_before_update\n", - "\n", - " feed_dict_global_summary = {self.Global_Agent.st_placeholder: states,\n", - " self.Global_Agent.actions_placeholder: actions,\n", - " self.Global_Agent.Qsa_placeholder: Q_sa}\n", - "\n", - " self.save_summary(feed_dict_global_summary)\n", - "\n", - " self.flush_rollout_memory()\n", - " self.pull_from_global()\n", - "\n", - " if self.Global_Agent.current_num_epi % self.frequency_printing_statistics == 0:\n", - "\n", - " average_reward = self.Global_Agent.compute_average_rewards(self.episodes_back)\n", - " print(\n", - " f\"Global ep number {self.Global_Agent.current_num_epi}: Reward = {average_reward}\")\n", - "\n", - "class Global_Agent(RLAgent):\n", - " def __init__(self, name, policy_network_args, value_function_network_args, session, summary_writer, child_agents=[]):\n", - " super().__init__(name, policy_network_args, value_function_network_args, session, summary_writer)\n", - " self.child_agents = child_agents\n", - " self.num_childs = len(child_agents)\n", - "\n", - " def compute_average_rewards(self, episodes_back):\n", - " \"\"\"Computes the average reward of each child agent going n episodes back, and returnes the average of those average rewards\"\"\"\n", - " reward = 0\n", - " for agent in self.child_agents:\n", - " agent_average_reward = reduce(\n", - " lambda x, y: x + y, agent.ep_rewards[-episodes_back:]) / episodes_back\n", - " reward += agent_average_reward\n", - "\n", - " reward /= self.num_childs\n", - "\n", - " return reward" - ], - "metadata": { - "id": "fKTk5Zt7R63o" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**Testing the model**" - ], - "metadata": { - "id": "5j_ph4KMS6xc" - } - }, - { - "cell_type": "code", - "source": [ - "if __name__ == \"__main__\":\n", - "\n", - " env = gym.make('LunarLanderContinuous-v2')\n", - " action_space_upper_bound = env.action_space.high\n", - " action_space_lower_bound = env.action_space.low\n", - " subdir = time.strftime(\"%Y%m%d-%H%M%S\", time.localtime())\n", - " logdir = \"./summary/\" + subdir\n", - " writer = tf.summary.create_file_writer(logdir)\n", - " sess = tf1.Session()\n", - "\n", - " policy_net_args = {\"num_Hlayers\": 2,\n", - " \"activations_Hlayers\": [\"relu\", \"relu\"],\n", - " \"Hlayer_sizes\": [100, 100],\n", - " \"n_output_units\": 2,\n", - " \"output_layer_activation\": tf.nn.tanh,\n", - " \"state_space_size\": 8,\n", - " \"action_space_size\": 2,\n", - " \"Entropy\": 0.01,\n", - " \"action_space_upper_bound\": action_space_upper_bound,\n", - " \"action_space_lower_bound\": action_space_lower_bound,\n", - " \"optimizer\": tf1.train.RMSPropOptimizer(0.0001),\n", - " \"total_number_episodes\": 5000,\n", - " \"number_of_episodes_before_update\": 1,\n", - " \"frequency_of_printing_statistics\": 100,\n", - " \"frequency_of_rendering_episode\": 1000,\n", - " \"number_child_agents\": 8,\n", - " \"episodes_back\": 20,\n", - " \"gamma\": 0.99,\n", - " \"regularization_constant\": 0.01,\n", - " \"max_steps_per_episode\": 2000\n", - "\n", - " }\n", - "\n", - " valuefunction_net_args = {\"num_Hlayers\": 2,\n", - " \"activations_Hlayers\": [\"relu\", \"relu\"],\n", - " \"Hlayer_sizes\": [100, 64],\n", - " \"n_output_units\": 1,\n", - " \"output_layer_activation\": \"linear\",\n", - " \"state_space_size\": 8,\n", - " \"action_space_size\": 2,\n", - " \"optimizer\": tf1.train.RMSPropOptimizer(0.01),\n", - " \"regularization_constant\": 0.01}\n", - "\n", - " global_agent = Global_Agent(\"Global_Agent\", policy_net_args, valuefunction_net_args, sess, writer)\n", - "\n", - " child_agents = []\n", - "\n", - " for i in range(policy_net_args[\"number_child_agents\"]):\n", - " i_name = f\"ChildAgent_{i}\"\n", - " child_agents.append(RLAgent(i_name, policy_net_args, valuefunction_net_args, sess, writer, global_agent))\n", - "\n", - " global_agent.child_agents = child_agents\n", - " global_agent.num_childs = len(child_agents)\n", - "\n", - " saver = tf1.train.Saver()\n", - "\n", - " coord = tf.train.Coordinator()\n", - "\n", - " if len(os.listdir(checkpoint_path)) == 0:\n", - "\n", - " sess.run(tf1.global_variables_initializer())\n", - " else:\n", - " saver.restore(sess, checkpoint_path + \"/variables.ckpt\")\n", - "\n", - " child_agents_threads = []\n", - "\n", - " subdir = time.strftime(\"%Y%m%d-%H%M%S\", time.localtime())\n", - " logdir = \"./summary/\" + subdir\n", - " writer = tf1.summary.FileWriter(logdir)\n", - " writer.add_graph(sess.graph)\n", - "\n", - " for child_agent in child_agents:\n", - " def job(): return child_agent.training_loop()\n", - " t = threading.Thread(target=job)\n", - " t.start()\n", - " child_agents_threads.append(t)\n", - "\n", - " coord.join(child_agents_threads)\n", - " saver.save(sess, checkpoint_path + \"/variables.ckpt\")\n", - "\n", - " for i in range(1):\n", - "\n", - " global_agent.collect_rollouts(10, 2000, render=True)\n", - " global_agent.collect_rollouts(90, 2000)\n", - "\n", - " rewards = global_agent.ep_rewards\n", - " average = sum(rewards)/len(rewards)\n", - " average = [average] * 100\n", - "\n", - " fig, ax = plt.subplots()\n", - "\n", - " ax.plot(rewards, label=\"Episode Reward\")\n", - " ax.plot(average, label=\"Average\")\n", - " ax.legend(loc=\"best\")\n", - " plt.show()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "AWN9xhEVS9Q6", - "outputId": "209bb891-0498-47c4-a835-dd8d7415f02f" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file