From 8b3ba5d0791bfe32d252ac26d8f645dc0735d0f2 Mon Sep 17 00:00:00 2001 From: ruben Date: Tue, 14 Mar 2023 19:57:02 +0100 Subject: [PATCH] fixed f1 follow line ddpg agent for simplified perception --- .gitignore | 4 +- rl_studio/agents/f1/__init__.py | 0 rl_studio/agents/f1/loaders.py | 25 +- rl_studio/agents/f1/train_ddpg.py | 0 .../f1/train_followline_ddpg_f1_gazebo_tf.py | 279 ++++++++---------- .../agents/utilities/plot_npy_dataset.py | 32 ++ .../agents/utilities/plot_reward_function.py | 66 +++++ rl_studio/agents/utilities/push_git_repo.py | 18 ++ rl_studio/agents/utils.py | 21 +- rl_studio/algorithms/ddpg.py | 90 ++++-- rl_studio/algorithms/utils.py | 46 +-- ...ig_training_followline_ddpg_f1_gazebo.yaml | 53 ++-- rl_studio/envs/gazebo/f1/models/f1_env.py | 2 + .../envs/gazebo/f1/models/f1_env_camera.py | 1 + .../envs/gazebo/f1/models/f1_env_ddpg.py | 5 +- rl_studio/envs/gazebo/f1/models/reset.py | 52 +++- rl_studio/envs/gazebo/f1/models/rewards.py | 58 ++++ rl_studio/envs/gazebo/f1/models/settings.py | 2 +- .../gazebo/f1/models/simplified_perception.py | 2 +- rl_studio/envs/gazebo/f1/models/step.py | 71 ++--- rl_studio/requirements.txt | 60 ++++ 21 files changed, 575 insertions(+), 312 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 rl_studio/agents/f1/__init__.py mode change 100644 => 100755 rl_studio/agents/f1/loaders.py mode change 100644 => 100755 rl_studio/agents/f1/train_ddpg.py mode change 100644 => 100755 rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py create mode 100755 rl_studio/agents/utilities/plot_npy_dataset.py create mode 100644 rl_studio/agents/utilities/plot_reward_function.py create mode 100644 rl_studio/agents/utilities/push_git_repo.py mode change 100644 => 100755 rl_studio/algorithms/ddpg.py mode change 100644 => 100755 rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env_camera.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/reset.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/rewards.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/settings.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/step.py create mode 100644 rl_studio/requirements.txt diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 682202d66..167854026 --- a/.gitignore +++ b/.gitignore @@ -15,8 +15,8 @@ catkin_ws output recorders logs - - +checkpoints +metrics ### Images ### JPEG diff --git a/rl_studio/agents/f1/__init__.py b/rl_studio/agents/f1/__init__.py old mode 100644 new mode 100755 diff --git a/rl_studio/agents/f1/loaders.py b/rl_studio/agents/f1/loaders.py old mode 100644 new mode 100755 index fbecbc348..087b2af88 --- a/rl_studio/agents/f1/loaders.py +++ b/rl_studio/agents/f1/loaders.py @@ -100,9 +100,7 @@ def __init__(self, config): "avg": [], "max": [], "min": [], - "step": [], "epoch_training_time": [], - "total_training_time": [], } self.best_current_epoch = { "best_epoch": [], @@ -124,6 +122,8 @@ def __init__(self, config): self.metrics_graphics_dir = f"{config['settings']['metrics_dir']}/{config['settings']['mode']}/{config['settings']['task']}_{config['settings']['algorithm']}_{config['settings']['agent']}_{config['settings']['framework']}/graphics" self.recorders_carla_dir = 
f"{config['settings']['recorder_carla_dir']}/{config['settings']['mode']}/{config['settings']['task']}_{config['settings']['algorithm']}_{config['settings']['agent']}_{config['settings']['framework']}" self.training_time = config["settings"]["training_time"] + self.debug_stats = config["settings"]["debug_stats"] + self.show_monitoring = config["settings"]["show_monitoring"] ####### States self.states = config["settings"]["states"] self.states_set = config["states"][self.states] @@ -132,6 +132,10 @@ def __init__(self, config): self.actions_set = config["actions"][self.actions] ####### Rewards self.rewards = config["settings"]["rewards"] + ###### Exploration + self.steps_to_decrease = config["settings"]["steps_to_decrease"] + self.decrease_substraction = config["settings"]["decrease_substraction"] + self.decrease_min = config["settings"]["decrease_min"] class LoadEnvVariablesDQNGazebo: @@ -270,14 +274,19 @@ def __init__(self, config) -> None: self.environment["model_state_name"] = config[self.environment_set][self.env][ "model_state_name" ] + self.environment["sleep"] = config[self.environment_set][self.env][ + "sleep" + ] + self.environment["punish_ineffective_vel"] = config["settings"]["reward_params"]["punish_ineffective_vel"] + self.environment["punish_zig_zag_value"] = config["settings"]["reward_params"]["punish_zig_zag_value"] + self.environment["reward_function_tuning"] = config["settings"]["reward_params"]["function"] + self.environment["beta_1"] = config["settings"]["reward_params"]["beta_1"] + + # Training/inference self.environment["mode"] = config["settings"]["mode"] - self.environment["retrain_ddpg_tf_actor_model_name"] = config["retraining"][ - "ddpg" - ]["retrain_ddpg_tf_actor_model_name"] - self.environment["retrain_ddpg_tf_critic_model_name"] = config["retraining"][ - "ddpg" - ]["retrain_ddpg_tf_critic_model_name"] + self.environment["retrain_ddpg_tf_actor_model_name"] = f"{config['retraining']['ddpg']['retrain_ddpg_tf_model_name']}/ACTOR" + self.environment["retrain_ddpg_tf_critic_model_name"] = f"{config['retraining']['ddpg']['retrain_ddpg_tf_model_name']}/CRITIC" self.environment["inference_ddpg_tf_actor_model_name"] = config["inference"][ "ddpg" ]["inference_ddpg_tf_actor_model_name"] diff --git a/rl_studio/agents/f1/train_ddpg.py b/rl_studio/agents/f1/train_ddpg.py old mode 100644 new mode 100755 diff --git a/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py b/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py old mode 100644 new mode 100755 index af6839454..fdf39a5be --- a/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py +++ b/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py @@ -8,7 +8,8 @@ import numpy as np import tensorflow as tf from tqdm import tqdm - +from rl_studio.agents.utilities.plot_npy_dataset import plot_rewards +from rl_studio.agents.utilities.push_git_repo import git_add_commit_push from rl_studio.agents.f1.loaders import ( LoadAlgorithmParams, @@ -34,6 +35,27 @@ from rl_studio.envs.gazebo.gazebo_envs import * +def combine_attributes(obj1, obj2, obj3): + combined_dict = {} + + # Extract attributes from obj1 + obj1_dict = obj1.__dict__ + for key, value in obj1_dict.items(): + combined_dict[key] = value + + # Extract attributes from obj2 + obj2_dict = obj2.__dict__ + for key, value in obj2_dict.items(): + combined_dict[key] = value + + # Extract attributes from obj3 + obj3_dict = obj3.__dict__ + for key, value in obj3_dict.items(): + combined_dict[key] = value + + return combined_dict + + class 
TrainerFollowLineDDPGF1GazeboTF: """ Mode: training @@ -55,10 +77,21 @@ def __init__(self, config): os.makedirs(f"{self.global_params.metrics_data_dir}", exist_ok=True) os.makedirs(f"{self.global_params.metrics_graphics_dir}", exist_ok=True) self.log_file = f"{self.global_params.logs_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{self.global_params.mode}_{self.global_params.task}_{self.global_params.algorithm}_{self.global_params.agent}_{self.global_params.framework}.log" + # self.outdir = f"{self.global_params.models_dir}/ddpg/{self.global_params.states}" def main(self): log = LoggingHandler(self.log_file) + # Init TensorBoard + tensorboard = ModifiedTensorBoard( + log_dir=f"{self.global_params.logs_tensorboard_dir}/{self.algoritmhs_params.model_name}-{time.strftime('%Y%m%d-%H%M%S')}" + ) + + hyperparams = combine_attributes(self.algoritmhs_params, + self.environment, + self.global_params) + + tensorboard.update_hyperparams(hyperparams) ## Load Environment env = gym.make(self.env_params.env_name, **self.environment.environment) @@ -67,11 +100,13 @@ def main(self): np.random.seed(1) tf.compat.v1.random.set_random_seed(1) + actor_loss = 0 + critic_loss = 0 start_time = datetime.now() best_epoch = 1 current_max_reward = 0 - best_step = 0 best_epoch_training_time = 0 + all_steps = 0 ## Reset env state, state_size = env.reset() @@ -88,9 +123,10 @@ def main(self): ) ## --------------------- Deep Nets ------------------ + exploration = self.algoritmhs_params.std_dev ou_noise = OUActionNoise( mean=np.zeros(1), - std_deviation=float(self.algoritmhs_params.std_dev) * np.ones(1), + std_deviation=float(exploration) * np.ones(1), ) # Init Agents ac_agent = DDPGAgent( @@ -108,10 +144,7 @@ def main(self): self.algoritmhs_params.buffer_capacity, self.algoritmhs_params.batch_size, ) - # Init TensorBoard - tensorboard = ModifiedTensorBoard( - log_dir=f"{self.global_params.logs_tensorboard_dir}/{self.algoritmhs_params.model_name}-{time.strftime('%Y%m%d-%H%M%S')}" - ) + ## ------------- START TRAINING -------------------- for episode in tqdm( @@ -126,16 +159,30 @@ def main(self): prev_state, prev_state_size = env.reset() while not done: + all_steps += 1 + if not all_steps % self.global_params.steps_to_decrease: + exploration = max(self.global_params.decrease_min, exploration - self.global_params.decrease_substraction) + log.logger.debug("decreasing exploration to exploration") + ou_noise = OUActionNoise( + mean=np.zeros(1), + std_deviation=float(exploration) * np.ones(1), + ) + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0) action = ac_agent.policy( tf_prev_state, ou_noise, self.global_params.actions ) - state, reward, done, _ = env.step(action, step) + tensorboard.update_actions(action[0], all_steps) + state, reward, done, info = env.step(action, step) + fps = info["fps"] + # print(fps) cumulated_reward += reward # learn and update + # env._gazebo_pause() + buffer.record((prev_state, action, reward, state)) - buffer.learn(ac_agent, self.algoritmhs_params.gamma) + actor_loss, critic_loss = buffer.learn(ac_agent, self.algoritmhs_params.gamma) ac_agent.update_target( ac_agent.target_actor.variables, ac_agent.actor_model.variables, @@ -151,131 +198,100 @@ def main(self): prev_state = state step += 1 - log.logger.debug( - f"\nstate = {state}\n" - # f"observation[0]= {observation[0]}\n" - f"state type = {type(state)}\n" - # f"observation[0] type = {type(observation[0])}\n" - f"prev_state = {prev_state}\n" - f"prev_state = {type(prev_state)}\n" - f"action = {action}\n" - f"actions type = 
{type(action)}\n" - ) - render_params( - task=self.global_params.task, - v=action[0][0], # for continuous actions - w=action[0][1], # for continuous actions - episode=episode, - step=step, - state=state, - # v=self.global_params.actions_set[action][ - # 0 - # ], # this case for discrete - # w=self.global_params.actions_set[action][ - # 1 - # ], # this case for discrete - reward_in_step=reward, - cumulated_reward_in_this_episode=cumulated_reward, - _="--------------------------", - best_episode_until_now=best_epoch, - in_best_step=best_step, - with_highest_reward=int(current_max_reward), - in_best_epoch_training_time=best_epoch_training_time, - ) - log.logger.debug( - f"\nepisode = {episode}\n" - f"step = {step}\n" - f"actions_len = {len(self.global_params.actions_set)}\n" - f"actions_range = {range(len(self.global_params.actions_set))}\n" - f"actions = {self.global_params.actions_set}\n" - f"reward_in_step = {reward}\n" - f"cumulated_reward = {cumulated_reward}\n" - f"done = {done}\n" - ) - - # best episode - if current_max_reward <= cumulated_reward: - current_max_reward = cumulated_reward - best_epoch = episode - best_step = step - best_epoch_training_time = datetime.now() - start_time_epoch - # saving params to show - self.global_params.actions_rewards["episode"].append(episode) - self.global_params.actions_rewards["step"].append(step) - # For continuous actios - # self.actions_rewards["v"].append(action[0][0]) - # self.actions_rewards["w"].append(action[0][1]) - self.global_params.actions_rewards["reward"].append(reward) - self.global_params.actions_rewards["center"].append( - env.image_center + if self.global_params.show_monitoring: + log.logger.debug( + f"\nstate = {state}\n" + # f"observation[0]= {observation[0]}\n" + f"state type = {type(state)}\n" + # f"observation[0] type = {type(observation[0])}\n" + f"prev_state = {prev_state}\n" + f"prev_state = {type(prev_state)}\n" + f"action = {action}\n" + f"actions type = {type(action)}\n" + ) + render_params( + task=self.global_params.task, + v=action[0][0], # for continuous actions + w=action[0][1], # for continuous actions + episode=episode, + step=step, + state=state, + reward_in_step=reward, + cumulated_reward_in_this_episode=cumulated_reward, + _="--------------------------", + # fps=fps, + exploration=exploration, + best_episode_until_now=best_epoch, + with_highest_reward=int(current_max_reward), + in_best_epoch_training_time=best_epoch_training_time, + ) + log.logger.debug( + f"\nepisode = {episode}\n" + f"step = {step}\n" + f"actions_len = {len(self.global_params.actions_set)}\n" + f"actions_range = {range(len(self.global_params.actions_set))}\n" + f"actions = {self.global_params.actions_set}\n" + f"reward_in_step = {reward}\n" + f"cumulated_reward = {cumulated_reward}\n" + f"done = {done}\n" ) # Showing stats in screen for monitoring. 
Showing every 'save_every_step' value - if not step % self.env_params.save_every_step: - save_dataframe_episodes( + if not all_steps % self.env_params.save_every_step: + file_name = save_dataframe_episodes( self.environment.environment, self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - self.global_params.actions_rewards, + self.global_params.aggr_ep_rewards) + plot_rewards( + self.global_params.metrics_data_dir, + file_name ) + git_add_commit_push("automatic_rewards_update") log.logger.debug( f"SHOWING BATCH OF STEPS\n" f"current_max_reward = {cumulated_reward}\n" f"current epoch = {episode}\n" f"current step = {step}\n" f"best epoch so far = {best_epoch}\n" - f"best step so far = {best_step}\n" f"best_epoch_training_time = {best_epoch_training_time}\n" ) - ##################################################### ### save in case of completed steps in one episode if step >= self.env_params.estimated_steps: done = True - log.logger.info( - f"\nEPISODE COMPLETED\n" - f"in episode = {episode}\n" - f"steps = {step}\n" - f"cumulated_reward = {cumulated_reward}\n" + print_messages( + "Lap completed in:", + time=datetime.now() - start_time_epoch, + in_episode=episode, + episode_reward=int(cumulated_reward), + with_steps=step, ) save_actorcritic_model( ac_agent, self.global_params, self.algoritmhs_params, - cumulated_reward, + self.environment.environment, + current_max_reward, episode, "LAPCOMPLETED", ) + + + ##################################################### #### save best lap in episode - if ( - cumulated_reward - self.environment.environment["rewards"]["penal"] - ) >= current_max_reward and episode > 1: + if current_max_reward <= cumulated_reward: + current_max_reward = cumulated_reward + best_epoch = episode - self.global_params.best_current_epoch["best_epoch"].append(best_epoch) - self.global_params.best_current_epoch["highest_reward"].append( - current_max_reward - ) - self.global_params.best_current_epoch["best_step"].append(best_step) - self.global_params.best_current_epoch[ - "best_epoch_training_time" - ].append(best_epoch_training_time) - self.global_params.best_current_epoch[ - "current_total_training_time" - ].append(datetime.now() - start_time) - - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.best_current_epoch, - ) save_actorcritic_model( ac_agent, self.global_params, self.algoritmhs_params, - cumulated_reward, + self.environment.environment, + current_max_reward, episode, - "BESTLAP", + "IMPROVED", ) log.logger.info( f"\nsaving best lap\n" @@ -296,73 +312,30 @@ def main(self): f"epoch = {episode}\n" f"step = {step}\n" ) - if cumulated_reward > current_max_reward: - save_actorcritic_model( - ac_agent, - self.global_params, - self.algoritmhs_params, - cumulated_reward, - episode, - "FINISHTIME", - ) - break ##################################################### ### save every save_episode times self.global_params.ep_rewards.append(cumulated_reward) if not episode % self.env_params.save_episodes: - average_reward = sum( - self.global_params.ep_rewards[-self.env_params.save_episodes :] - ) / len(self.global_params.ep_rewards[-self.env_params.save_episodes :]) - min_reward = min( - self.global_params.ep_rewards[-self.env_params.save_episodes :] - ) - max_reward = max( - self.global_params.ep_rewards[-self.env_params.save_episodes :] + average_reward = sum(self.global_params.ep_rewards[-self.env_params.save_episodes:]) / len( + self.global_params.ep_rewards[-self.env_params.save_episodes:] 
) + min_reward = min(self.global_params.ep_rewards[-self.env_params.save_episodes:]) + max_reward = max(self.global_params.ep_rewards[-self.env_params.save_episodes:]) tensorboard.update_stats( - reward_avg=int(average_reward), - reward_max=int(max_reward), - steps=step, + cum_rewards=average_reward, + reward_min=min_reward, + reward_max=max_reward, + actor_loss=actor_loss, + critic_loss=critic_loss ) + self.global_params.aggr_ep_rewards["episode"].append(episode) - self.global_params.aggr_ep_rewards["step"].append(step) self.global_params.aggr_ep_rewards["avg"].append(average_reward) self.global_params.aggr_ep_rewards["max"].append(max_reward) self.global_params.aggr_ep_rewards["min"].append(min_reward) self.global_params.aggr_ep_rewards["epoch_training_time"].append( (datetime.now() - start_time_epoch).total_seconds() ) - self.global_params.aggr_ep_rewards["total_training_time"].append( - (datetime.now() - start_time).total_seconds() - ) - save_actorcritic_model( - ac_agent, - self.global_params, - self.algoritmhs_params, - cumulated_reward, - episode, - "BATCH", - ) - - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - ) - log.logger.info( - f"\nsaving BATCH\n" - f"current_max_reward = {cumulated_reward}\n" - f"best_epoch = {best_epoch}\n" - f"best_step = {best_step}\n" - f"best_epoch_training_time = {best_epoch_training_time}\n" - ) - ##################################################### - ### save last episode, not neccesarily the best one - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - ) env.close() diff --git a/rl_studio/agents/utilities/plot_npy_dataset.py b/rl_studio/agents/utilities/plot_npy_dataset.py new file mode 100755 index 000000000..80e80a9c1 --- /dev/null +++ b/rl_studio/agents/utilities/plot_npy_dataset.py @@ -0,0 +1,32 @@ +import matplotlib as pltlib +import matplotlib.pyplot as plt +import numpy as np + +def load_rewards(file_path): + rewards = np.load(file_path, allow_pickle=True).item() + return rewards + + +def plot_rewards(base_directory_name, file_name): + pltlib.rcParams.update({'font.size': 15}) + fig, ax1 = plt.subplots() + + file_path = base_directory_name + "/" + file_name + rewards = load_rewards(file_path) + ax1.set_xscale('log') + ax1.plot(range(len(rewards["avg"])), rewards["avg"], color='purple', label='ddpg') + + # fig.canvas.manager.full_screen_toggle() + + plt.legend() + plt.ylabel("cun_reward") + plt.xlabel("episode") + # plt.show() + + base_path = '/home/ruben/Desktop/2020-phd-ruben-lucas/docs/assets/images/results_images/f1-follow-line/gazebo/ddpg/sp10/' + ax1.figure.savefig(base_path + 'trainings.png') + +if __name__ == "__main__": + # Pass the file_name as a parameter when calling the script + file_name = "20230722-135205_Circuit-simple_States-sp10_Actions-continuous_Rewards-followline_center.npy" + plot_rewards(file_name) diff --git a/rl_studio/agents/utilities/plot_reward_function.py b/rl_studio/agents/utilities/plot_reward_function.py new file mode 100644 index 000000000..096a5792c --- /dev/null +++ b/rl_studio/agents/utilities/plot_reward_function.py @@ -0,0 +1,66 @@ +import numpy as np +import matplotlib.pyplot as plt + +def normalize_range(num, a, b): + return (num - a) / (b - a) + +def linear_function(cross_x, slope, x): + return cross_x + (slope * x) + +def sigmoid_function(start, end, x, slope=10): + slope = slope / (end - start) + sigmoid = 1 / (1 + np.exp(-slope 
* (x - ((start + end) / 2)))) + return sigmoid + +def reward_proximity(state): + if abs(state) > 0.7: + return 0 + else: + # return 1 - abs(state) + # return linear_function(1, -1.4, abs(state)) + return pow(1 - abs(state), 4) + # return 1 - sigmoid_function(0, 1, abs(state), 5) + + +def rewards_followline_velocity_center(v, pos, range_v): + p_reward = reward_proximity(pos) + v_norm = normalize_range(v, range_v[0], range_v[1]) + v_r = v_norm * pow(p_reward, 2) + beta_pos = 0.7 + reward = (beta_pos * p_reward) + ((1 - beta_pos) * v_r) + v_punish = reward * (1 - p_reward) * v_norm + reward = reward - v_punish + + return reward + +range_v = [0, 10] + +# Define the ranges for v, w, and pos +v_range = np.linspace(range_v[0], range_v[1], 50) # Adjust the range as needed +pos_range = np.linspace(-1, 1, 50) # Adjust the range as needed + +# Create a grid of values for v, w, and pos +V, POS = np.meshgrid(v_range, pos_range) + +# Calculate the rewards for each combination of v, w, and pos +rewards = np.empty_like(V) +for i in range(V.shape[0]): + for j in range(V.shape[1]): + rewards[i, j] = rewards_followline_velocity_center(V[i, j], POS[i, j], range_v) + +# Create a 3D plot +fig = plt.figure(figsize=(10, 8)) +ax = fig.add_subplot(111, projection='3d') + +# Plot the 3D surface +surf = ax.plot_surface(POS, V, rewards, cmap='viridis') + +# Set labels for the axes +ax.set_xlabel('Center Distance (pos)') +ax.set_ylabel('Linear Velocity (v)') +ax.set_zlabel('Reward') + +# Show the color bar +fig.colorbar(surf, shrink=0.5, aspect=5) + +plt.show() diff --git a/rl_studio/agents/utilities/push_git_repo.py b/rl_studio/agents/utilities/push_git_repo.py new file mode 100644 index 000000000..08ecdb624 --- /dev/null +++ b/rl_studio/agents/utilities/push_git_repo.py @@ -0,0 +1,18 @@ +import subprocess + +def git_add_commit_push(commit_message): + # Change this to your repository directory + repo_directory = "/home/ruben/Desktop/2020-phd-ruben-lucas" + + # Change to the repository directory + subprocess.run(["git", "add", "."], cwd=repo_directory) + + # Commit changes with the given commit message + subprocess.run(["git", "commit", "-m", commit_message], cwd=repo_directory) + + # Push changes to the remote repository + subprocess.run(["git", "push"], cwd=repo_directory) + +if __name__ == "__main__": + commit_message = "Your commit message here." 
+ git_add_commit_push(commit_message) \ No newline at end of file diff --git a/rl_studio/agents/utils.py b/rl_studio/agents/utils.py index 7010ba446..941d05e04 100644 --- a/rl_studio/agents/utils.py +++ b/rl_studio/agents/utils.py @@ -92,12 +92,8 @@ def print_dictionary(dic): def render_params(**kwargs): font = cv2.FONT_HERSHEY_SIMPLEX - canvas = np.zeros((400, 400, 3), dtype="uint8") - # blue = (255, 0, 0) - # green = (0, 255, 0) - # red = (0, 0, 255) + canvas = np.zeros((400, 900, 3), dtype="uint8") white = (255, 255, 255) - # white_darkness = (200, 200, 200) i = 10 for key, value in kwargs.items(): cv2.putText( @@ -116,22 +112,21 @@ def render_params(**kwargs): cv2.waitKey(100) -def save_dataframe_episodes(environment, outdir, aggr_ep_rewards, actions_rewards=None): +def save_dataframe_episodes(environment, outdir, aggr_ep_rewards): """ We save info every certains epochs in a dataframe and .npy format to export or manage """ os.makedirs(f"{outdir}", exist_ok=True) - - file_csv = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.csv" - file_excel = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.xlsx" + file_name = f"{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}" + file_csv = f"{outdir}/{file_name}.csv" + file_excel = f"{outdir}/{file_name}.xlsx" df = pd.DataFrame(aggr_ep_rewards) df.to_csv(file_csv, mode="a", index=False, header=None) df.to_excel(file_excel) - - if actions_rewards is not None: - file_npy = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.npy" - np.save(file_npy, actions_rewards) + file_npy = f"{outdir}/{file_name}.npy" + np.save(file_npy, aggr_ep_rewards) + return f"{file_name}.npy" def save_best_episode( diff --git a/rl_studio/algorithms/ddpg.py b/rl_studio/algorithms/ddpg.py old mode 100644 new mode 100755 index d289145bb..1ed26ee7f --- a/rl_studio/algorithms/ddpg.py +++ b/rl_studio/algorithms/ddpg.py @@ -15,7 +15,8 @@ ) from tensorflow.keras.models import Sequential, load_model from tensorflow.keras.optimizers import Adam - +from tensorboardX import SummaryWriter +from tabulate import tabulate # Sharing GPU gpus = tf.config.experimental.list_physical_devices("GPU") @@ -23,6 +24,22 @@ tf.config.experimental.set_memory_growth(gpu, True) +def build_markdown_table(data_dict): + markdown_table = "```\n" + + # Create the table header + markdown_table += "| Key | Value |\n" + markdown_table += "| --- | --- |\n" + + # Iterate over the dictionary and add rows to the table + for key, value in data_dict.items(): + markdown_table += f"| {key} | {value} |\n" + + markdown_table += "```\n" + + return markdown_table + + # Own Tensorboard class class ModifiedTensorBoard(TensorBoard): @@ -31,6 +48,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.step = 1 self.writer = tf.summary.create_file_writer(self.log_dir) + self.txWriter = SummaryWriter(self.log_dir) # Overriding this method to stop creating default log writer def set_model(self, model): @@ -60,6 +78,22 @@ def _write_logs(self, logs, index): def 
update_stats(self, **stats): self._write_logs(stats, self.step) + def update_actions(self, actions, index): + with self.writer.as_default(): + tf.summary.histogram("actions_v", actions[0], step=index) + tf.summary.histogram("actions_w", actions[1], step=index) + self.writer.flush() + + def update_hyperparams(self, params): + # Convert the dictionary to a list of (key, value) pairs + table_data = [[key, value] for key, value in params.items()] + # Create a nicely formatted table + table = tabulate(table_data, headers=['Key', 'Value'], tablefmt='pipe') + + with self.writer.as_default(): + tf.summary.experimental.set_step(0) + tf.summary.text("HyperParams", table) + self.writer.flush() class OUActionNoise: def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): @@ -165,7 +199,7 @@ def update( with tf.GradientTape() as tape: target_actions = actor_critic.target_actor(next_state_batch, training=True) y = reward_batch + gamma * actor_critic.target_critic( - [next_state_batch, target_actions], training=True + [next_state_batch, target_actions[0], target_actions[1]], training=True ) if self.action_space == "continuous": @@ -192,7 +226,7 @@ def update( with tf.GradientTape() as tape: actions = actor_critic.actor_model(state_batch, training=True) critic_value = actor_critic.critic_model( - [state_batch, actions], training=True + [state_batch, actions[0], actions[1]], training=True ) # Used `-value` as we want to maximize the value given # by the critic for our actions @@ -204,6 +238,7 @@ def update( actor_critic.actor_optimizer.apply_gradients( zip(actor_grad, actor_critic.actor_model.trainable_variables) ) + return actor_loss, critic_loss # We compute the loss and update parameters def learn(self, actor_critic, gamma): @@ -251,7 +286,7 @@ def learn(self, actor_critic, gamma): ) else: - self.update( + return self.update( actor_critic, gamma, state_batch, @@ -288,9 +323,9 @@ def __init__(self, config, action_space_size, observation_space_values, outdir): self.actor_optimizer = tf.keras.optimizers.Adam(config["actor_lr"]) # Custom tensorboard object - self.tensorboard = ModifiedTensorBoard( - log_dir=f"{outdir}/{self.MODEL_NAME}-{time.strftime('%Y%m%d-%H%M%S')}" - ) + # self.tensorboard = ModifiedTensorBoard( + # log_dir=f"{outdir}/{self.MODEL_NAME}-{time.strftime('%Y%m%d-%H%M%S')}" + # ) # Used to count when to update target network with main network's weights self.target_update_counter = 0 @@ -406,13 +441,8 @@ def policy_discrete_actions(self, state): sampled_actions = tf.squeeze(self.actor_model(state)) # Adding noise to action sampled_actions = sampled_actions.numpy() - # print(f"sampled_actions:{sampled_actions}") sampled_actions = np.argmax(sampled_actions) - # print(f"sampled_actions:{sampled_actions}") - # We make sure action is within bounds - # legal_action = np.clip(sampled_actions, self.LOWER_BOUND, self.UPPER_BOUND) legal_action = sampled_actions - # print(f"np.squeeze(legal_action):{np.squeeze(legal_action)}") return np.squeeze(legal_action) def get_actor_model_simplified_perception_discrete_actions_nan(self): @@ -478,9 +508,13 @@ def get_critic_model_simplified_perception_discrete_actions(self): def policy_continuous_actions(self, state, noise_object): sampled_actions = tf.squeeze(self.actor_model(state)) noise = noise_object() + # Adding noise to action - sampled_actions = sampled_actions.numpy() + noise - # we can discretized the actions values with round(,0) + #print(f"debug: actions {sampled_actions[0]} {sampled_actions[1]}") + sampled_actions = 
sampled_actions.numpy() + sampled_actions[0] = sampled_actions[0] + noise + sampled_actions[1] = sampled_actions[1] + noise + legal_action_v = round( np.clip(sampled_actions[0], self.V_LOWER_BOUND, self.V_UPPER_BOUND), 1 ) @@ -613,13 +647,15 @@ def get_critic_model_image_continuous_actions(self): return model def get_actor_model_sp_continuous_actions(self): - # inputShape = (96, 128, 3) - inputs = Input(shape=self.OBSERVATION_SPACE_VALUES) - v_branch = self.build_branch(inputs, "v_output") - w_branch = self.build_branch(inputs, "w_output") + # last_init = tf.random_uniform_initializer(minval=-1, maxval=0.01) + hidden_init = tf.keras.initializers.GlorotUniform() + shared_layer = Dense(16, activation="relu", kernel_initializer=hidden_init) - v_branch = abs(v_branch) * self.V_UPPER_BOUND + v_branch = self.build_branch(inputs, "v_output", shared_layer) + w_branch = self.build_branch(inputs, "w_output", shared_layer) + + v_branch = ((v_branch + 1) / 2) * (self.V_UPPER_BOUND - self.V_LOWER_BOUND) + self.V_LOWER_BOUND w_branch = w_branch * self.W_LEFT_BOUND # create the model using our input (the batch of images) and @@ -632,11 +668,14 @@ def get_actor_model_sp_continuous_actions(self): # return the constructed network architecture return model - def build_branch(self, inputs, action_name): - last_init = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) - # inputs = layers.Input(shape=(self.OBSERVATION_SPACE_VALUES)) - x = Dense(128, activation="relu")(inputs) # 8, 16, 32 neurons - x = Dense(128, activation="relu")(x) # 8, 16, 32 neurons + def build_branch(self, inputs, action_name, shared_layer): + # last_init = tf.random_uniform_initializer(minval=-1, maxval=1) + hidden_init = tf.keras.initializers.GlorotUniform() + last_init = tf.keras.initializers.HeUniform() + + x = shared_layer(inputs) + x = Dense(16, activation="relu", kernel_initializer=hidden_init)(x) # 8, 16, 32 neurons + x = Dense(16, activation="relu", kernel_initializer=hidden_init)(x) # 8, 16, 32 neurons x = Dense(1, activation="tanh", kernel_initializer=last_init)(x) x = Activation("tanh", name=action_name)(x) @@ -647,8 +686,7 @@ def build_branch(self, inputs, action_name): def get_critic_model_sp_continuous_actions(self): # State as input state_input = layers.Input(shape=(self.OBSERVATION_SPACE_VALUES)) - state_out = layers.Dense(16, activation="relu")(state_input) - state_out = layers.Dense(32, activation="relu")(state_out) + state_out = layers.Dense(64, activation="relu")(state_input) # state_out = layers.Flatten()(state_out) # Actions V and W. 
For more actions, we should add more layers diff --git a/rl_studio/algorithms/utils.py b/rl_studio/algorithms/utils.py index e2236e0f0..6b84279dc 100644 --- a/rl_studio/algorithms/utils.py +++ b/rl_studio/algorithms/utils.py @@ -10,42 +10,22 @@ def save_actorcritic_model( agent, global_params, algoritmhs_params, environment, cumulated_reward, episode, text ): + timestamp = time.strftime('%Y%m%d-%H%M%S') agent.actor_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_ACTOR" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}" + f"{global_params.models_dir}/{timestamp}_{text}_" + f"C-{environment['circuit_name']}_" + f"S-{environment['states']}_" + f"A-{environment['action_space']}_" + f"MR-{int(cumulated_reward)}_" + f"E-{episode}/ACTOR" ) agent.critic_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_CRITIC" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}" + f"{global_params.models_dir}/{timestamp}_{text}_" + f"C-{environment['circuit_name']}_" + f"S-{environment['states']}_" + f"A-{environment['action_space']}_" + f"MR-{int(cumulated_reward)}_" + f"E-{episode}/CRITIC" ) - # save model in h5 format - agent.actor_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_ACTOR" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}.h5" - ) - agent.critic_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_CRITIC" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}.h5" - ) diff --git a/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml b/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml old mode 100644 new mode 100755 index 7f4a60460..547102f79 --- a/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml +++ b/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml @@ -7,14 +7,24 @@ settings: env: simple # simple, nurburgring, montreal, curves, simple_laser, manual, autoparking agent: f1 # f1, autoparkingRL, auto_carla, mountain_car, robot_mesh, cartpole, turtlebot actions: continuous # continuous, simple, medium, hard, test, autoparking_simple - states: image #image, sp1 (simplified perception with 1 point), sp3 (simplified perception with 3 points), spn (simplified perception with n points) + states: sp5 #image, sp1 (simplified perception with 1 point), sp3 (simplified perception with 3 points), spn (simplified perception with n points) rewards: followline_center # followline_center, followline_center_v_w_linear framework: TensorFlow # TensorFlow, Pytorch - total_episodes: 5 - training_time: 6 + total_episodes: 500000 + training_time: 72 models_dir: "./checkpoints" logs_dir: 
"./logs" metrics_dir: "./metrics" + debug_stats: false + show_monitoring: true + steps_to_decrease: 50000 + decrease_substraction: 0.01 + decrease_min: 0.03 + reward_params: + function: pow + punish_zig_zag_value: 1 + punish_ineffective_vel: 0.5 + beta_1: 0.8 ros: ros_master_uri: "11311" @@ -22,9 +32,7 @@ ros: retraining: ddpg: - retrain_ddpg_tf_actor_model_name: "20230111_DDPG_Actor_conv2d32x64_Critic_conv2d32x64_BATCH_ACTOR_Max--100_Epoch-5_State-image_Actions-continuous_Rewards-followline_center_inTime-20230111-183048.h5" - retrain_ddpg_tf_critic_model_name: "20230111_DDPG_Actor_conv2d32x64_Critic_conv2d32x64_BATCH_CRITIC_Max--100_Epoch-5_State-image_Actions-continuous_Rewards-followline_center_inTime-20230111-183048.h5" - + retrain_ddpg_tf_model_name: "20231029-023138_IMPROVEDCircuit-simple_States-sp5_Actions-continuous_BATCH_Rewards-followline_center_MaxReward-2801_Epoch-118" inference: ddpg: inference_ddpg_tf_actor_model_name: @@ -33,15 +41,15 @@ inference: algorithm: ddpg: gamma: 0.9 - tau: 0.005 - std_dev: 0.2 + tau: 0.01 + std_dev: 0.1 model_name: DDPG_Actor_conv2d32x64_Critic_conv2d32x64 replay_memory_size: 50_000 memory_fraction: 0.20 - critic_lr: 0.002 - actor_lr: 0.001 + critic_lr: 0.003 + actor_lr: 0.002 buffer_capacity: 100_000 - batch_size: 64 + batch_size: 256 agents: f1: @@ -59,11 +67,13 @@ states: image: 0: [3] sp1: - 0: [10] + 0: [40] #TODO sp3: - 0: [5, 15, 22] + 0: [10, 50, 150] sp5: - 0: [3, 5, 10, 15, 20] + 0: [10, 40, 70, 130, 190] + sp10: + 0: [3, 15, 30, 45, 50, 70, 100, 120, 150, 190] spn: 0: [10] @@ -89,8 +99,8 @@ actions: test: 0: [0, 0] continuous: - v: [2, 30] - w: [-3, 3] + v: [1, 7] + w: [-1.5, 1.5] rewards: followline_center: @@ -101,8 +111,6 @@ rewards: min_reward: 5_000 highest_reward: 100 followline_center_v_w_linear: # only for continuous actions - beta_0: 3 - beta_1: -0.1 penal: 0 min_reward: 1_000 highest_reward: 100 @@ -116,11 +124,11 @@ gazebo_environments: robot_name: f1_renault model_state_name: f1_renault # f1_renault_multicamera_multilaser start_pose: 0 # 0, 1, 2, 3, 4 - alternate_pose: False - estimated_steps: 100 + alternate_pose: True + estimated_steps: 5000 sensor: camera - save_episodes: 5 - save_every_step: 10 + save_episodes: 1 + save_every_step: 20000 lap_completed: False save_model: True save_positions: True @@ -134,3 +142,4 @@ gazebo_environments: 2: [39.712, -30.741, 0.004, 0, 0, 1.56, 1.56] 3: [-6.861, -36.481, 0.004, 0, 0.01, -0.858, 0.613] 4: [20.043, 37.130, 0.003, 0, 0.103, -1.4383, -1.4383] + sleep: 0.05 diff --git a/rl_studio/envs/gazebo/f1/models/f1_env.py b/rl_studio/envs/gazebo/f1/models/f1_env.py old mode 100644 new mode 100755 index dc196efe4..23392b60a --- a/rl_studio/envs/gazebo/f1/models/f1_env.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env.py @@ -15,6 +15,7 @@ def __init__(self, **config): self.alternate_pose = config.get("alternate_pose") self.model_state_name = config.get("model_state_name") + self.sleep = config.get("sleep") self.vel_pub = rospy.Publisher("/F1ROS/cmd_vel", Twist, queue_size=5) self.unpause = rospy.ServiceProxy("/gazebo/unpause_physics", Empty) self.pause = rospy.ServiceProxy("/gazebo/pause_physics", Empty) @@ -27,6 +28,7 @@ def __init__(self, **config): # self.start_pose = np.array(config.get("start_pose")) self.start_pose = np.array(config.get("gazebo_start_pose")) self.start_random_pose = config.get("gazebo_random_start_pose") + self.end = 0 self._seed() diff --git a/rl_studio/envs/gazebo/f1/models/f1_env_camera.py b/rl_studio/envs/gazebo/f1/models/f1_env_camera.py old mode 100644 new mode 
100755 index bd9bf364a..bb28d5d1e --- a/rl_studio/envs/gazebo/f1/models/f1_env_camera.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env_camera.py @@ -127,6 +127,7 @@ def step(self, action) -> Tuple: f1_image_camera = self.image.getImage() self.previous_image = f1_image_camera.data + while np.array_equal(self.previous_image, f1_image_camera.data): if (time.time() - start) > 0.1: diff --git a/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py b/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py old mode 100644 new mode 100755 index c963ce957..5fc183f5d --- a/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py @@ -51,7 +51,6 @@ def __init__(self, **config): self.actions = config["actions"] # Rewards - self.reward_function = config["reward_function"] self.rewards = config["rewards"] self.min_reward = config["min_reward"] self.beta_1 = config["beta_1"] @@ -454,7 +453,7 @@ def reset_camera(self): # step ################################################################################# - def step(self, action): + def step(self, action, show_monitoring=True): self._gazebo_unpause() vel_cmd = Twist() @@ -471,7 +470,7 @@ def step(self, action): self._gazebo_pause() ######### center - points = self.processed_image(f1_image_camera.data) + points = self.processed_image(f1_image_camera.data, show_monitoring) if self.state_space == "spn": self.point = points[self.poi] else: diff --git a/rl_studio/envs/gazebo/f1/models/reset.py b/rl_studio/envs/gazebo/f1/models/reset.py old mode 100644 new mode 100755 index 3ef041549..8a60dec54 --- a/rl_studio/envs/gazebo/f1/models/reset.py +++ b/rl_studio/envs/gazebo/f1/models/reset.py @@ -62,18 +62,48 @@ def reset_f1_state_sp(self): ##==== calculating State # simplified perception as observation - centrals_in_pixels, _ = self.simplifiedperception.calculate_centrals_lane( - f1_image_camera.data, - self.height, - self.width, - self.x_row, - self.lower_limit, - self.center_image, + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image ) + self._gazebo_pause() + states = centrals_normalized + state_size = len(states) + + return states, state_size + + def reset_f1_state_sp_line(self): + """ + reset for + - State: Simplified perception + - tasks: FollowLane and FollowLine + """ + self._gazebo_reset() + # === POSE === + if self.alternate_pose: + self._gazebo_set_random_pose_f1_follow_rigth_lane() + else: + self._gazebo_set_fix_pose_f1_follow_right_lane() + + ##==== get image from sensor camera + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() + + ##==== get center + points_in_red_line, _ = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image + ) + + if self.state_space == "spn": + self.point = points_in_red_line[self.poi] + else: + self.point = points_in_red_line[0] + + ##==== get State + ##==== simplified perception as observation states = self.simplifiedperception.calculate_observation( - centrals_in_pixels, self.center_image, self.pixel_region + points_in_red_line, self.center_image, self.pixel_region ) - state = [states[0]] - state_size = len(state) - return state, state_size + state_size = len(states) + + return states, state_size diff --git a/rl_studio/envs/gazebo/f1/models/rewards.py b/rl_studio/envs/gazebo/f1/models/rewards.py old mode 100644 new mode 100755 index 80da09950..b30fe823f 
--- a/rl_studio/envs/gazebo/f1/models/rewards.py +++ b/rl_studio/envs/gazebo/f1/models/rewards.py @@ -4,6 +4,12 @@ class F1GazeboRewards: + def __init__(self, **config): + self.reward_function_tuning = config.get("reward_function_tuning") + self.punish_zig_zag_value = config.get("punish_zig_zag_value") + self.punish_ineffective_vel = config.get("punish_ineffective_vel") + self.beta_1 = config.get("beta_1") + @staticmethod def rewards_followlane_centerline(center, rewards): """ @@ -87,6 +93,58 @@ def rewards_followline_center(self, center, rewards): return reward, done + def rewards_followline_velocity_center(self, v, w, state, range_v, range_w): + """ + original for Following Line + """ + # we reward proximity to the line + p_reward1, done1 = self.reward_proximity(state[-2]) + p_reward2, done2 = self.reward_proximity(state[-4]) + p_reward3, done3 = self.reward_proximity(state[-5]) + p_reward = (p_reward1 + p_reward2 + p_reward3)/3 + done = done1 and done2 and done3 + + v_norm = self.normalize_range(v, range_v[0], range_v[1]) + v_reward = v_norm * pow(p_reward, 2) + beta = self.beta_1 + reward = (beta * p_reward) + ((1 - beta) * v_reward) + + # penalizing steering to avoid zig-zag + w_punish = self.normalize_range(abs(w), 0, abs(range_w[1])) * self.punish_zig_zag_value + reward = reward - (p_reward * w_punish) + + # penalizing accelerating on bad positions + v_punish = reward * (1 - p_reward) * v_norm * self.punish_ineffective_vel + reward = reward - v_punish + + return reward, done + + def normalize_range(self, num, a, b): + return (num - a) / (b - a) + + def reward_proximity(self, state): + # sigmoid_pos = self.sigmoid_function(0, 1, state) + if abs(state) > 0.7: + return 0, True + else: + if self.reward_function_tuning == "sigmoid": + return 1-self.sigmoid_function(0, 1, abs(state), 5), False + elif self.reward_function_tuning == "linear": + return self.linear_function(1, -1.4, abs(state)), False + elif self.reward_function_tuning == "pow": + return pow(1 - abs(state), 5), False + else: + return 1 - abs(state), False + + + def sigmoid_function(self, start, end, x, slope=10): + slope = slope / (end - start) + sigmoid = 1 / (1 + np.exp(-slope * (x - ((start + end) / 2)))) + return sigmoid + + def linear_function(self, cross_x, slope, x): + return cross_x + (slope * x) + def rewards_followline_v_w_centerline( self, vel_cmd, center, rewards, beta_1, beta_0 ): diff --git a/rl_studio/envs/gazebo/f1/models/settings.py b/rl_studio/envs/gazebo/f1/models/settings.py old mode 100644 new mode 100755 index f5dbe2d30..2adcaf309 --- a/rl_studio/envs/gazebo/f1/models/settings.py +++ b/rl_studio/envs/gazebo/f1/models/settings.py @@ -12,7 +12,7 @@ class F1GazeboTFConfig(BaseModel): def __init__(self, **config): self.simplifiedperception = F1GazeboSimplifiedPerception() - self.f1gazeborewards = F1GazeboRewards() + self.f1gazeborewards = F1GazeboRewards(**config) self.f1gazeboutils = F1GazeboUtils() self.f1gazeboimages = F1GazeboImages() diff --git a/rl_studio/envs/gazebo/f1/models/simplified_perception.py b/rl_studio/envs/gazebo/f1/models/simplified_perception.py index 7bc281b9d..975a0f466 100644 --- a/rl_studio/envs/gazebo/f1/models/simplified_perception.py +++ b/rl_studio/envs/gazebo/f1/models/simplified_perception.py @@ -13,7 +13,7 @@ def processed_image(self, img, height, width, x_row, center_image): :parameters: input image 640x480 :return: centrals: lists with distance to center in pixels - cntrals_normalized: lists with distance in range [0,1] for calculating rewards + centrals_normalized: lists 
with distance in range [0,1] for calculating rewards """ image_middle_line = height // 2 img_sliced = img[image_middle_line:] diff --git a/rl_studio/envs/gazebo/f1/models/step.py b/rl_studio/envs/gazebo/f1/models/step.py old mode 100644 new mode 100755 index f45b20162..3a69fdc77 --- a/rl_studio/envs/gazebo/f1/models/step.py +++ b/rl_studio/envs/gazebo/f1/models/step.py @@ -1,5 +1,6 @@ from geometry_msgs.msg import Twist import numpy as np +import time from rl_studio.agents.utils import ( print_messages, @@ -50,42 +51,40 @@ def step_followline_state_image_actions_discretes(self, action, step): return state, reward, done, {} - def step_followline_state_sp_actions_discretes(self, action, step): + def step_followline_state_sp_actions_discretes(self, action, step, show_monitoring=True): self._gazebo_unpause() vel_cmd = Twist() vel_cmd.linear.x = self.actions[action][0] vel_cmd.angular.z = self.actions[action][1] self.vel_pub.publish(vel_cmd) + self._gazebo_pause() + + start = time.time() + fps = (1 / (start - self.end))*2 + self.end = start ##==== get image from sensor camera f1_image_camera, _ = self.f1gazeboimages.get_camera_info() - self._gazebo_pause() + self.previous_image = f1_image_camera.data + + while np.array_equal(self.previous_image, f1_image_camera.data): + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() ##==== get center - points_in_red_line, _ = self.simplifiedperception.processed_image( - f1_image_camera.data, self.height, self.width, self.x_row, self.center_image + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image, show_monitoring ) - if self.state_space == "spn": - self.point = points_in_red_line[self.poi] - else: - self.point = points_in_red_line[0] - - center = abs(float(self.center_image - self.point) / (float(self.width) // 2)) - # center = float(self.center_image - self.point) / (float(self.width) // 2) ##==== get State - ##==== simplified perception as observation - state = self.simplifiedperception.calculate_observation( - points_in_red_line, self.center_image, self.pixel_region - ) + state = centrals_normalized ##==== get Rewards if self.reward_function == "followline_center": - reward, done = self.f1gazeborewards.rewards_followline_center( - center, self.rewards + reward, done = self.f1gazeborewards.rewards_followline_velocity_center( + vel_cmd.linear.x, vel_cmd.angular.z, state, self.rewards ) - return state, reward, done, {} + return state, reward, done, {"fps": fps} def step_followline_state_image_actions_continuous(self, action, step): self._gazebo_unpause() @@ -135,40 +134,34 @@ def step_followline_state_sp_actions_continuous(self, action, step): vel_cmd.linear.x = action[0][0] vel_cmd.angular.z = action[0][1] self.vel_pub.publish(vel_cmd) + time.sleep(self.sleep) + self._gazebo_pause() + start = time.time() + fps = (1 / (start - self.end)) + self.end = start ##==== get image from sensor camera f1_image_camera, _ = self.f1gazeboimages.get_camera_info() - self._gazebo_pause() + + self.previous_image = f1_image_camera + while np.array_equal(self.previous_image, f1_image_camera.data): + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() ##==== get center - points_in_red_line, _ = self.simplifiedperception.processed_image( + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( f1_image_camera.data, self.height, self.width, self.x_row, self.center_image ) - if self.state_space == "spn": - self.point = 
points_in_red_line[self.poi] - else: - self.point = points_in_red_line[0] - - center = abs(float(self.center_image - self.point) / (float(self.width) // 2)) - # center = float(self.center_image - self.point) / (float(self.width) // 2) - - ##==== get State - ##==== simplified perception as observation - state = self.simplifiedperception.calculate_observation( - points_in_red_line, self.center_image, self.pixel_region - ) ##==== get Rewards if self.reward_function == "followline_center": - reward, done = self.f1gazeborewards.rewards_followline_center( - center, self.rewards + reward, done = self.f1gazeborewards.rewards_followline_velocity_center( + vel_cmd.linear.x, vel_cmd.angular.z, centrals_normalized, self.actions["v"] + , self.actions["w"] ) else: - reward, done = self.f1gazeborewards.rewards_followline_v_w_centerline( - vel_cmd, center, self.rewards, self.beta_1, self.beta_0 - ) + exit(1) - return state, reward, done, {} + return centrals_normalized, reward, done, {"fps": fps} class StepFollowLane(F1Env): diff --git a/rl_studio/requirements.txt b/rl_studio/requirements.txt new file mode 100644 index 000000000..6fbb21bf3 --- /dev/null +++ b/rl_studio/requirements.txt @@ -0,0 +1,60 @@ +ale_py==0.7.4 +carla_birdeye_view==1.1.1 +cv_bridge==1.16.2 +cv_bridge==1.16.0 +gym==0.17.0 +gymnasium==0.28.1 +gymnasium==0.27.0 +keras==2.11.0 +markdownTable==6.0.0 +matplotlib==3.3.2 +matplotlib==3.1.2 +networkx==2.5.1 +numpy==1.22.0 +numpy==1.17.4 +opencv_python==4.2.0.32 +opencv_python==4.2.0.34 +opencv_python_headless==4.7.0.68 +optuna==3.1.0 +pandas==1.5.3 +Pillow==8.3.1 +Pillow==9.3.0 +Pillow==7.0.0 +Pillow==10.1.0 +py3rosmsgs==1.18.2 +py_markdown_table==0.3.3 +pydantic==1.9.1 +pydantic==1.10.2 +pygame==2.4.0 +pygame==2.1.2 +Pygments==2.14.0 +Pygments==2.2.0 +Pygments==2.12.0 +Pygments==2.3.1 +PyYAML==6.0 +PyYAML==5.4 +PyYAML==6.0.1 +reloading==1.1.2 +rosbag==1.16.0 +rosbag==1.15.14 +rosgraph==1.16.0 +rosgraph==1.15.14 +rosnode==1.16.0 +rosnode==1.15.14 +rospy==1.16.0 +scikit_learn==1.1.2 +scikit_learn==1.1.1 +scipy==1.6.1 +scipy==1.4.1 +sensor_msgs==1.13.1 +six==1.14.0 +statsmodels==0.13.2 +tabulate==0.9.0 +tensorboardX==2.6 +tensorboardX==2.6.2.2 +tensorflow==2.9.1 +tensorflow_gpu==2.11.0 +tf==1.13.2 +torch==1.11.0+cu113 +torch==1.11.0 +tqdm==4.64.0
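
The training loop above lowers the Ornstein-Uhlenbeck noise scale every steps_to_decrease steps by decrease_substraction, clamped at decrease_min, and rebuilds OUActionNoise with the reduced value. Below is a minimal sketch of that schedule, assuming a hypothetical helper name decayed_std and the defaults from config_training_followline_ddpg_f1_gazebo.yaml (std_dev: 0.1, steps_to_decrease: 50000, decrease_substraction: 0.01, decrease_min: 0.03):

# Step-wise linear decay of the exploration noise scale, mirroring the loop in
# train_followline_ddpg_f1_gazebo_tf.py. decayed_std is an illustrative helper
# only, not part of the patch.
def decayed_std(all_steps, std_dev=0.1, steps_to_decrease=50_000,
                decrease_substraction=0.01, decrease_min=0.03):
    # one decrement every steps_to_decrease environment steps, never below the floor
    n_decays = all_steps // steps_to_decrease
    return max(decrease_min, std_dev - n_decays * decrease_substraction)

# With these defaults the scale reaches its floor after 350,000 steps:
# decayed_std(0) -> 0.1, decayed_std(100_000) -> 0.08, decayed_std(400_000) -> 0.03

The reworked actor head in get_actor_model_sp_continuous_actions() ends in tanh activations, so each branch outputs a value in [-1, 1]; the linear-velocity branch is then rescaled affinely into [V_LOWER_BOUND, V_UPPER_BOUND], while the angular branch is only multiplied by W_LEFT_BOUND. A sketch of that rescaling, with scale_v as an illustrative name and the bounds taken from the continuous action block of the YAML (v: [1, 7]):

# Affine mapping of a tanh output onto the configured velocity range, as done
# for v_branch in ddpg.py. scale_v is illustrative only, not part of the patch.
def scale_v(tanh_out, v_lower=1.0, v_upper=7.0):
    # tanh_out in [-1, 1]  ->  v in [v_lower, v_upper]
    return ((tanh_out + 1.0) / 2.0) * (v_upper - v_lower) + v_lower

# scale_v(-1.0) -> 1.0, scale_v(0.0) -> 4.0, scale_v(1.0) -> 7.0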