From 8b3ba5d0791bfe32d252ac26d8f645dc0735d0f2 Mon Sep 17 00:00:00 2001 From: ruben Date: Tue, 14 Mar 2023 19:57:02 +0100 Subject: [PATCH] fixed f1 follow line ddpg agent for simplified perception --- .gitignore | 4 +- rl_studio/agents/f1/__init__.py | 0 rl_studio/agents/f1/loaders.py | 25 +- rl_studio/agents/f1/train_ddpg.py | 0 .../f1/train_followline_ddpg_f1_gazebo_tf.py | 279 ++++++++---------- .../agents/utilities/plot_npy_dataset.py | 32 ++ .../agents/utilities/plot_reward_function.py | 66 +++++ rl_studio/agents/utilities/push_git_repo.py | 18 ++ rl_studio/agents/utils.py | 21 +- rl_studio/algorithms/ddpg.py | 90 ++++-- rl_studio/algorithms/utils.py | 46 +-- ...ig_training_followline_ddpg_f1_gazebo.yaml | 53 ++-- rl_studio/envs/gazebo/f1/models/f1_env.py | 2 + .../envs/gazebo/f1/models/f1_env_camera.py | 1 + .../envs/gazebo/f1/models/f1_env_ddpg.py | 5 +- rl_studio/envs/gazebo/f1/models/reset.py | 52 +++- rl_studio/envs/gazebo/f1/models/rewards.py | 58 ++++ rl_studio/envs/gazebo/f1/models/settings.py | 2 +- .../gazebo/f1/models/simplified_perception.py | 2 +- rl_studio/envs/gazebo/f1/models/step.py | 71 ++--- rl_studio/requirements.txt | 60 ++++ 21 files changed, 575 insertions(+), 312 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 rl_studio/agents/f1/__init__.py mode change 100644 => 100755 rl_studio/agents/f1/loaders.py mode change 100644 => 100755 rl_studio/agents/f1/train_ddpg.py mode change 100644 => 100755 rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py create mode 100755 rl_studio/agents/utilities/plot_npy_dataset.py create mode 100644 rl_studio/agents/utilities/plot_reward_function.py create mode 100644 rl_studio/agents/utilities/push_git_repo.py mode change 100644 => 100755 rl_studio/algorithms/ddpg.py mode change 100644 => 100755 rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env_camera.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/reset.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/rewards.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/settings.py mode change 100644 => 100755 rl_studio/envs/gazebo/f1/models/step.py create mode 100644 rl_studio/requirements.txt diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 682202d66..167854026 --- a/.gitignore +++ b/.gitignore @@ -15,8 +15,8 @@ catkin_ws output recorders logs - - +checkpoints +metrics ### Images ### JPEG diff --git a/rl_studio/agents/f1/__init__.py b/rl_studio/agents/f1/__init__.py old mode 100644 new mode 100755 diff --git a/rl_studio/agents/f1/loaders.py b/rl_studio/agents/f1/loaders.py old mode 100644 new mode 100755 index fbecbc348..087b2af88 --- a/rl_studio/agents/f1/loaders.py +++ b/rl_studio/agents/f1/loaders.py @@ -100,9 +100,7 @@ def __init__(self, config): "avg": [], "max": [], "min": [], - "step": [], "epoch_training_time": [], - "total_training_time": [], } self.best_current_epoch = { "best_epoch": [], @@ -124,6 +122,8 @@ def __init__(self, config): self.metrics_graphics_dir = f"{config['settings']['metrics_dir']}/{config['settings']['mode']}/{config['settings']['task']}_{config['settings']['algorithm']}_{config['settings']['agent']}_{config['settings']['framework']}/graphics" self.recorders_carla_dir = 
f"{config['settings']['recorder_carla_dir']}/{config['settings']['mode']}/{config['settings']['task']}_{config['settings']['algorithm']}_{config['settings']['agent']}_{config['settings']['framework']}" self.training_time = config["settings"]["training_time"] + self.debug_stats = config["settings"]["debug_stats"] + self.show_monitoring = config["settings"]["show_monitoring"] ####### States self.states = config["settings"]["states"] self.states_set = config["states"][self.states] @@ -132,6 +132,10 @@ def __init__(self, config): self.actions_set = config["actions"][self.actions] ####### Rewards self.rewards = config["settings"]["rewards"] + ###### Exploration + self.steps_to_decrease = config["settings"]["steps_to_decrease"] + self.decrease_substraction = config["settings"]["decrease_substraction"] + self.decrease_min = config["settings"]["decrease_min"] class LoadEnvVariablesDQNGazebo: @@ -270,14 +274,19 @@ def __init__(self, config) -> None: self.environment["model_state_name"] = config[self.environment_set][self.env][ "model_state_name" ] + self.environment["sleep"] = config[self.environment_set][self.env][ + "sleep" + ] + self.environment["punish_ineffective_vel"] = config["settings"]["reward_params"]["punish_ineffective_vel"] + self.environment["punish_zig_zag_value"] = config["settings"]["reward_params"]["punish_zig_zag_value"] + self.environment["reward_function_tuning"] = config["settings"]["reward_params"]["function"] + self.environment["beta_1"] = config["settings"]["reward_params"]["beta_1"] + + # Training/inference self.environment["mode"] = config["settings"]["mode"] - self.environment["retrain_ddpg_tf_actor_model_name"] = config["retraining"][ - "ddpg" - ]["retrain_ddpg_tf_actor_model_name"] - self.environment["retrain_ddpg_tf_critic_model_name"] = config["retraining"][ - "ddpg" - ]["retrain_ddpg_tf_critic_model_name"] + self.environment["retrain_ddpg_tf_actor_model_name"] = f"{config['retraining']['ddpg']['retrain_ddpg_tf_model_name']}/ACTOR" + self.environment["retrain_ddpg_tf_critic_model_name"] = f"{config['retraining']['ddpg']['retrain_ddpg_tf_model_name']}/CRITIC" self.environment["inference_ddpg_tf_actor_model_name"] = config["inference"][ "ddpg" ]["inference_ddpg_tf_actor_model_name"] diff --git a/rl_studio/agents/f1/train_ddpg.py b/rl_studio/agents/f1/train_ddpg.py old mode 100644 new mode 100755 diff --git a/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py b/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py old mode 100644 new mode 100755 index af6839454..fdf39a5be --- a/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py +++ b/rl_studio/agents/f1/train_followline_ddpg_f1_gazebo_tf.py @@ -8,7 +8,8 @@ import numpy as np import tensorflow as tf from tqdm import tqdm - +from rl_studio.agents.utilities.plot_npy_dataset import plot_rewards +from rl_studio.agents.utilities.push_git_repo import git_add_commit_push from rl_studio.agents.f1.loaders import ( LoadAlgorithmParams, @@ -34,6 +35,27 @@ from rl_studio.envs.gazebo.gazebo_envs import * +def combine_attributes(obj1, obj2, obj3): + combined_dict = {} + + # Extract attributes from obj1 + obj1_dict = obj1.__dict__ + for key, value in obj1_dict.items(): + combined_dict[key] = value + + # Extract attributes from obj2 + obj2_dict = obj2.__dict__ + for key, value in obj2_dict.items(): + combined_dict[key] = value + + # Extract attributes from obj3 + obj3_dict = obj3.__dict__ + for key, value in obj3_dict.items(): + combined_dict[key] = value + + return combined_dict + + class 
TrainerFollowLineDDPGF1GazeboTF: """ Mode: training @@ -55,10 +77,21 @@ def __init__(self, config): os.makedirs(f"{self.global_params.metrics_data_dir}", exist_ok=True) os.makedirs(f"{self.global_params.metrics_graphics_dir}", exist_ok=True) self.log_file = f"{self.global_params.logs_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{self.global_params.mode}_{self.global_params.task}_{self.global_params.algorithm}_{self.global_params.agent}_{self.global_params.framework}.log" + # self.outdir = f"{self.global_params.models_dir}/ddpg/{self.global_params.states}" def main(self): log = LoggingHandler(self.log_file) + # Init TensorBoard + tensorboard = ModifiedTensorBoard( + log_dir=f"{self.global_params.logs_tensorboard_dir}/{self.algoritmhs_params.model_name}-{time.strftime('%Y%m%d-%H%M%S')}" + ) + + hyperparams = combine_attributes(self.algoritmhs_params, + self.environment, + self.global_params) + + tensorboard.update_hyperparams(hyperparams) ## Load Environment env = gym.make(self.env_params.env_name, **self.environment.environment) @@ -67,11 +100,13 @@ def main(self): np.random.seed(1) tf.compat.v1.random.set_random_seed(1) + actor_loss = 0 + critic_loss = 0 start_time = datetime.now() best_epoch = 1 current_max_reward = 0 - best_step = 0 best_epoch_training_time = 0 + all_steps = 0 ## Reset env state, state_size = env.reset() @@ -88,9 +123,10 @@ def main(self): ) ## --------------------- Deep Nets ------------------ + exploration = self.algoritmhs_params.std_dev ou_noise = OUActionNoise( mean=np.zeros(1), - std_deviation=float(self.algoritmhs_params.std_dev) * np.ones(1), + std_deviation=float(exploration) * np.ones(1), ) # Init Agents ac_agent = DDPGAgent( @@ -108,10 +144,7 @@ def main(self): self.algoritmhs_params.buffer_capacity, self.algoritmhs_params.batch_size, ) - # Init TensorBoard - tensorboard = ModifiedTensorBoard( - log_dir=f"{self.global_params.logs_tensorboard_dir}/{self.algoritmhs_params.model_name}-{time.strftime('%Y%m%d-%H%M%S')}" - ) + ## ------------- START TRAINING -------------------- for episode in tqdm( @@ -126,16 +159,30 @@ def main(self): prev_state, prev_state_size = env.reset() while not done: + all_steps += 1 + if not all_steps % self.global_params.steps_to_decrease: + exploration = max(self.global_params.decrease_min, exploration - self.global_params.decrease_substraction) + log.logger.debug("decreasing exploration to exploration") + ou_noise = OUActionNoise( + mean=np.zeros(1), + std_deviation=float(exploration) * np.ones(1), + ) + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0) action = ac_agent.policy( tf_prev_state, ou_noise, self.global_params.actions ) - state, reward, done, _ = env.step(action, step) + tensorboard.update_actions(action[0], all_steps) + state, reward, done, info = env.step(action, step) + fps = info["fps"] + # print(fps) cumulated_reward += reward # learn and update + # env._gazebo_pause() + buffer.record((prev_state, action, reward, state)) - buffer.learn(ac_agent, self.algoritmhs_params.gamma) + actor_loss, critic_loss = buffer.learn(ac_agent, self.algoritmhs_params.gamma) ac_agent.update_target( ac_agent.target_actor.variables, ac_agent.actor_model.variables, @@ -151,131 +198,100 @@ def main(self): prev_state = state step += 1 - log.logger.debug( - f"\nstate = {state}\n" - # f"observation[0]= {observation[0]}\n" - f"state type = {type(state)}\n" - # f"observation[0] type = {type(observation[0])}\n" - f"prev_state = {prev_state}\n" - f"prev_state = {type(prev_state)}\n" - f"action = {action}\n" - f"actions type = 
{type(action)}\n" - ) - render_params( - task=self.global_params.task, - v=action[0][0], # for continuous actions - w=action[0][1], # for continuous actions - episode=episode, - step=step, - state=state, - # v=self.global_params.actions_set[action][ - # 0 - # ], # this case for discrete - # w=self.global_params.actions_set[action][ - # 1 - # ], # this case for discrete - reward_in_step=reward, - cumulated_reward_in_this_episode=cumulated_reward, - _="--------------------------", - best_episode_until_now=best_epoch, - in_best_step=best_step, - with_highest_reward=int(current_max_reward), - in_best_epoch_training_time=best_epoch_training_time, - ) - log.logger.debug( - f"\nepisode = {episode}\n" - f"step = {step}\n" - f"actions_len = {len(self.global_params.actions_set)}\n" - f"actions_range = {range(len(self.global_params.actions_set))}\n" - f"actions = {self.global_params.actions_set}\n" - f"reward_in_step = {reward}\n" - f"cumulated_reward = {cumulated_reward}\n" - f"done = {done}\n" - ) - - # best episode - if current_max_reward <= cumulated_reward: - current_max_reward = cumulated_reward - best_epoch = episode - best_step = step - best_epoch_training_time = datetime.now() - start_time_epoch - # saving params to show - self.global_params.actions_rewards["episode"].append(episode) - self.global_params.actions_rewards["step"].append(step) - # For continuous actios - # self.actions_rewards["v"].append(action[0][0]) - # self.actions_rewards["w"].append(action[0][1]) - self.global_params.actions_rewards["reward"].append(reward) - self.global_params.actions_rewards["center"].append( - env.image_center + if self.global_params.show_monitoring: + log.logger.debug( + f"\nstate = {state}\n" + # f"observation[0]= {observation[0]}\n" + f"state type = {type(state)}\n" + # f"observation[0] type = {type(observation[0])}\n" + f"prev_state = {prev_state}\n" + f"prev_state = {type(prev_state)}\n" + f"action = {action}\n" + f"actions type = {type(action)}\n" + ) + render_params( + task=self.global_params.task, + v=action[0][0], # for continuous actions + w=action[0][1], # for continuous actions + episode=episode, + step=step, + state=state, + reward_in_step=reward, + cumulated_reward_in_this_episode=cumulated_reward, + _="--------------------------", + # fps=fps, + exploration=exploration, + best_episode_until_now=best_epoch, + with_highest_reward=int(current_max_reward), + in_best_epoch_training_time=best_epoch_training_time, + ) + log.logger.debug( + f"\nepisode = {episode}\n" + f"step = {step}\n" + f"actions_len = {len(self.global_params.actions_set)}\n" + f"actions_range = {range(len(self.global_params.actions_set))}\n" + f"actions = {self.global_params.actions_set}\n" + f"reward_in_step = {reward}\n" + f"cumulated_reward = {cumulated_reward}\n" + f"done = {done}\n" ) # Showing stats in screen for monitoring. 
Showing every 'save_every_step' value - if not step % self.env_params.save_every_step: - save_dataframe_episodes( + if not all_steps % self.env_params.save_every_step: + file_name = save_dataframe_episodes( self.environment.environment, self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - self.global_params.actions_rewards, + self.global_params.aggr_ep_rewards) + plot_rewards( + self.global_params.metrics_data_dir, + file_name ) + git_add_commit_push("automatic_rewards_update") log.logger.debug( f"SHOWING BATCH OF STEPS\n" f"current_max_reward = {cumulated_reward}\n" f"current epoch = {episode}\n" f"current step = {step}\n" f"best epoch so far = {best_epoch}\n" - f"best step so far = {best_step}\n" f"best_epoch_training_time = {best_epoch_training_time}\n" ) - ##################################################### ### save in case of completed steps in one episode if step >= self.env_params.estimated_steps: done = True - log.logger.info( - f"\nEPISODE COMPLETED\n" - f"in episode = {episode}\n" - f"steps = {step}\n" - f"cumulated_reward = {cumulated_reward}\n" + print_messages( + "Lap completed in:", + time=datetime.now() - start_time_epoch, + in_episode=episode, + episode_reward=int(cumulated_reward), + with_steps=step, ) save_actorcritic_model( ac_agent, self.global_params, self.algoritmhs_params, - cumulated_reward, + self.environment.environment, + current_max_reward, episode, "LAPCOMPLETED", ) + + + ##################################################### #### save best lap in episode - if ( - cumulated_reward - self.environment.environment["rewards"]["penal"] - ) >= current_max_reward and episode > 1: + if current_max_reward <= cumulated_reward: + current_max_reward = cumulated_reward + best_epoch = episode - self.global_params.best_current_epoch["best_epoch"].append(best_epoch) - self.global_params.best_current_epoch["highest_reward"].append( - current_max_reward - ) - self.global_params.best_current_epoch["best_step"].append(best_step) - self.global_params.best_current_epoch[ - "best_epoch_training_time" - ].append(best_epoch_training_time) - self.global_params.best_current_epoch[ - "current_total_training_time" - ].append(datetime.now() - start_time) - - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.best_current_epoch, - ) save_actorcritic_model( ac_agent, self.global_params, self.algoritmhs_params, - cumulated_reward, + self.environment.environment, + current_max_reward, episode, - "BESTLAP", + "IMPROVED", ) log.logger.info( f"\nsaving best lap\n" @@ -296,73 +312,30 @@ def main(self): f"epoch = {episode}\n" f"step = {step}\n" ) - if cumulated_reward > current_max_reward: - save_actorcritic_model( - ac_agent, - self.global_params, - self.algoritmhs_params, - cumulated_reward, - episode, - "FINISHTIME", - ) - break ##################################################### ### save every save_episode times self.global_params.ep_rewards.append(cumulated_reward) if not episode % self.env_params.save_episodes: - average_reward = sum( - self.global_params.ep_rewards[-self.env_params.save_episodes :] - ) / len(self.global_params.ep_rewards[-self.env_params.save_episodes :]) - min_reward = min( - self.global_params.ep_rewards[-self.env_params.save_episodes :] - ) - max_reward = max( - self.global_params.ep_rewards[-self.env_params.save_episodes :] + average_reward = sum(self.global_params.ep_rewards[-self.env_params.save_episodes:]) / len( + self.global_params.ep_rewards[-self.env_params.save_episodes:] 
) + min_reward = min(self.global_params.ep_rewards[-self.env_params.save_episodes:]) + max_reward = max(self.global_params.ep_rewards[-self.env_params.save_episodes:]) tensorboard.update_stats( - reward_avg=int(average_reward), - reward_max=int(max_reward), - steps=step, + cum_rewards=average_reward, + reward_min=min_reward, + reward_max=max_reward, + actor_loss=actor_loss, + critic_loss=critic_loss ) + self.global_params.aggr_ep_rewards["episode"].append(episode) - self.global_params.aggr_ep_rewards["step"].append(step) self.global_params.aggr_ep_rewards["avg"].append(average_reward) self.global_params.aggr_ep_rewards["max"].append(max_reward) self.global_params.aggr_ep_rewards["min"].append(min_reward) self.global_params.aggr_ep_rewards["epoch_training_time"].append( (datetime.now() - start_time_epoch).total_seconds() ) - self.global_params.aggr_ep_rewards["total_training_time"].append( - (datetime.now() - start_time).total_seconds() - ) - save_actorcritic_model( - ac_agent, - self.global_params, - self.algoritmhs_params, - cumulated_reward, - episode, - "BATCH", - ) - - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - ) - log.logger.info( - f"\nsaving BATCH\n" - f"current_max_reward = {cumulated_reward}\n" - f"best_epoch = {best_epoch}\n" - f"best_step = {best_step}\n" - f"best_epoch_training_time = {best_epoch_training_time}\n" - ) - ##################################################### - ### save last episode, not neccesarily the best one - save_dataframe_episodes( - self.environment.environment, - self.global_params.metrics_data_dir, - self.global_params.aggr_ep_rewards, - ) env.close() diff --git a/rl_studio/agents/utilities/plot_npy_dataset.py b/rl_studio/agents/utilities/plot_npy_dataset.py new file mode 100755 index 000000000..80e80a9c1 --- /dev/null +++ b/rl_studio/agents/utilities/plot_npy_dataset.py @@ -0,0 +1,32 @@ +import matplotlib as pltlib +import matplotlib.pyplot as plt +import numpy as np + +def load_rewards(file_path): + rewards = np.load(file_path, allow_pickle=True).item() + return rewards + + +def plot_rewards(base_directory_name, file_name): + pltlib.rcParams.update({'font.size': 15}) + fig, ax1 = plt.subplots() + + file_path = base_directory_name + "/" + file_name + rewards = load_rewards(file_path) + ax1.set_xscale('log') + ax1.plot(range(len(rewards["avg"])), rewards["avg"], color='purple', label='ddpg') + + # fig.canvas.manager.full_screen_toggle() + + plt.legend() + plt.ylabel("cun_reward") + plt.xlabel("episode") + # plt.show() + + base_path = '/home/ruben/Desktop/2020-phd-ruben-lucas/docs/assets/images/results_images/f1-follow-line/gazebo/ddpg/sp10/' + ax1.figure.savefig(base_path + 'trainings.png') + +if __name__ == "__main__": + # Pass the file_name as a parameter when calling the script + file_name = "20230722-135205_Circuit-simple_States-sp10_Actions-continuous_Rewards-followline_center.npy" + plot_rewards(file_name) diff --git a/rl_studio/agents/utilities/plot_reward_function.py b/rl_studio/agents/utilities/plot_reward_function.py new file mode 100644 index 000000000..096a5792c --- /dev/null +++ b/rl_studio/agents/utilities/plot_reward_function.py @@ -0,0 +1,66 @@ +import numpy as np +import matplotlib.pyplot as plt + +def normalize_range(num, a, b): + return (num - a) / (b - a) + +def linear_function(cross_x, slope, x): + return cross_x + (slope * x) + +def sigmoid_function(start, end, x, slope=10): + slope = slope / (end - start) + sigmoid = 1 / (1 + np.exp(-slope 
* (x - ((start + end) / 2)))) + return sigmoid + +def reward_proximity(state): + if abs(state) > 0.7: + return 0 + else: + # return 1 - abs(state) + # return linear_function(1, -1.4, abs(state)) + return pow(1 - abs(state), 4) + # return 1 - sigmoid_function(0, 1, abs(state), 5) + + +def rewards_followline_velocity_center(v, pos, range_v): + p_reward = reward_proximity(pos) + v_norm = normalize_range(v, range_v[0], range_v[1]) + v_r = v_norm * pow(p_reward, 2) + beta_pos = 0.7 + reward = (beta_pos * p_reward) + ((1 - beta_pos) * v_r) + v_punish = reward * (1 - p_reward) * v_norm + reward = reward - v_punish + + return reward + +range_v = [0, 10] + +# Define the ranges for v, w, and pos +v_range = np.linspace(range_v[0], range_v[1], 50) # Adjust the range as needed +pos_range = np.linspace(-1, 1, 50) # Adjust the range as needed + +# Create a grid of values for v, w, and pos +V, POS = np.meshgrid(v_range, pos_range) + +# Calculate the rewards for each combination of v, w, and pos +rewards = np.empty_like(V) +for i in range(V.shape[0]): + for j in range(V.shape[1]): + rewards[i, j] = rewards_followline_velocity_center(V[i, j], POS[i, j], range_v) + +# Create a 3D plot +fig = plt.figure(figsize=(10, 8)) +ax = fig.add_subplot(111, projection='3d') + +# Plot the 3D surface +surf = ax.plot_surface(POS, V, rewards, cmap='viridis') + +# Set labels for the axes +ax.set_xlabel('Center Distance (pos)') +ax.set_ylabel('Linear Velocity (v)') +ax.set_zlabel('Reward') + +# Show the color bar +fig.colorbar(surf, shrink=0.5, aspect=5) + +plt.show() diff --git a/rl_studio/agents/utilities/push_git_repo.py b/rl_studio/agents/utilities/push_git_repo.py new file mode 100644 index 000000000..08ecdb624 --- /dev/null +++ b/rl_studio/agents/utilities/push_git_repo.py @@ -0,0 +1,18 @@ +import subprocess + +def git_add_commit_push(commit_message): + # Change this to your repository directory + repo_directory = "/home/ruben/Desktop/2020-phd-ruben-lucas" + + # Change to the repository directory + subprocess.run(["git", "add", "."], cwd=repo_directory) + + # Commit changes with the given commit message + subprocess.run(["git", "commit", "-m", commit_message], cwd=repo_directory) + + # Push changes to the remote repository + subprocess.run(["git", "push"], cwd=repo_directory) + +if __name__ == "__main__": + commit_message = "Your commit message here." 
+ git_add_commit_push(commit_message) \ No newline at end of file diff --git a/rl_studio/agents/utils.py b/rl_studio/agents/utils.py index 7010ba446..941d05e04 100644 --- a/rl_studio/agents/utils.py +++ b/rl_studio/agents/utils.py @@ -92,12 +92,8 @@ def print_dictionary(dic): def render_params(**kwargs): font = cv2.FONT_HERSHEY_SIMPLEX - canvas = np.zeros((400, 400, 3), dtype="uint8") - # blue = (255, 0, 0) - # green = (0, 255, 0) - # red = (0, 0, 255) + canvas = np.zeros((400, 900, 3), dtype="uint8") white = (255, 255, 255) - # white_darkness = (200, 200, 200) i = 10 for key, value in kwargs.items(): cv2.putText( @@ -116,22 +112,21 @@ def render_params(**kwargs): cv2.waitKey(100) -def save_dataframe_episodes(environment, outdir, aggr_ep_rewards, actions_rewards=None): +def save_dataframe_episodes(environment, outdir, aggr_ep_rewards): """ We save info every certains epochs in a dataframe and .npy format to export or manage """ os.makedirs(f"{outdir}", exist_ok=True) - - file_csv = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.csv" - file_excel = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.xlsx" + file_name = f"{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}" + file_csv = f"{outdir}/{file_name}.csv" + file_excel = f"{outdir}/{file_name}.xlsx" df = pd.DataFrame(aggr_ep_rewards) df.to_csv(file_csv, mode="a", index=False, header=None) df.to_excel(file_excel) - - if actions_rewards is not None: - file_npy = f"{outdir}/{time.strftime('%Y%m%d-%H%M%S')}_Circuit-{environment['circuit_name']}_States-{environment['states']}_Actions-{environment['action_space']}_Rewards-{environment['reward_function']}.npy" - np.save(file_npy, actions_rewards) + file_npy = f"{outdir}/{file_name}.npy" + np.save(file_npy, aggr_ep_rewards) + return f"{file_name}.npy" def save_best_episode( diff --git a/rl_studio/algorithms/ddpg.py b/rl_studio/algorithms/ddpg.py old mode 100644 new mode 100755 index d289145bb..1ed26ee7f --- a/rl_studio/algorithms/ddpg.py +++ b/rl_studio/algorithms/ddpg.py @@ -15,7 +15,8 @@ ) from tensorflow.keras.models import Sequential, load_model from tensorflow.keras.optimizers import Adam - +from tensorboardX import SummaryWriter +from tabulate import tabulate # Sharing GPU gpus = tf.config.experimental.list_physical_devices("GPU") @@ -23,6 +24,22 @@ tf.config.experimental.set_memory_growth(gpu, True) +def build_markdown_table(data_dict): + markdown_table = "```\n" + + # Create the table header + markdown_table += "| Key | Value |\n" + markdown_table += "| --- | --- |\n" + + # Iterate over the dictionary and add rows to the table + for key, value in data_dict.items(): + markdown_table += f"| {key} | {value} |\n" + + markdown_table += "```\n" + + return markdown_table + + # Own Tensorboard class class ModifiedTensorBoard(TensorBoard): @@ -31,6 +48,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.step = 1 self.writer = tf.summary.create_file_writer(self.log_dir) + self.txWriter = SummaryWriter(self.log_dir) # Overriding this method to stop creating default log writer def set_model(self, model): @@ -60,6 +78,22 @@ def _write_logs(self, logs, index): def 
update_stats(self, **stats): self._write_logs(stats, self.step) + def update_actions(self, actions, index): + with self.writer.as_default(): + tf.summary.histogram("actions_v", actions[0], step=index) + tf.summary.histogram("actions_w", actions[1], step=index) + self.writer.flush() + + def update_hyperparams(self, params): + # Convert the dictionary to a list of (key, value) pairs + table_data = [[key, value] for key, value in params.items()] + # Create a nicely formatted table + table = tabulate(table_data, headers=['Key', 'Value'], tablefmt='pipe') + + with self.writer.as_default(): + tf.summary.experimental.set_step(0) + tf.summary.text("HyperParams", table) + self.writer.flush() class OUActionNoise: def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): @@ -165,7 +199,7 @@ def update( with tf.GradientTape() as tape: target_actions = actor_critic.target_actor(next_state_batch, training=True) y = reward_batch + gamma * actor_critic.target_critic( - [next_state_batch, target_actions], training=True + [next_state_batch, target_actions[0], target_actions[1]], training=True ) if self.action_space == "continuous": @@ -192,7 +226,7 @@ def update( with tf.GradientTape() as tape: actions = actor_critic.actor_model(state_batch, training=True) critic_value = actor_critic.critic_model( - [state_batch, actions], training=True + [state_batch, actions[0], actions[1]], training=True ) # Used `-value` as we want to maximize the value given # by the critic for our actions @@ -204,6 +238,7 @@ def update( actor_critic.actor_optimizer.apply_gradients( zip(actor_grad, actor_critic.actor_model.trainable_variables) ) + return actor_loss, critic_loss # We compute the loss and update parameters def learn(self, actor_critic, gamma): @@ -251,7 +286,7 @@ def learn(self, actor_critic, gamma): ) else: - self.update( + return self.update( actor_critic, gamma, state_batch, @@ -288,9 +323,9 @@ def __init__(self, config, action_space_size, observation_space_values, outdir): self.actor_optimizer = tf.keras.optimizers.Adam(config["actor_lr"]) # Custom tensorboard object - self.tensorboard = ModifiedTensorBoard( - log_dir=f"{outdir}/{self.MODEL_NAME}-{time.strftime('%Y%m%d-%H%M%S')}" - ) + # self.tensorboard = ModifiedTensorBoard( + # log_dir=f"{outdir}/{self.MODEL_NAME}-{time.strftime('%Y%m%d-%H%M%S')}" + # ) # Used to count when to update target network with main network's weights self.target_update_counter = 0 @@ -406,13 +441,8 @@ def policy_discrete_actions(self, state): sampled_actions = tf.squeeze(self.actor_model(state)) # Adding noise to action sampled_actions = sampled_actions.numpy() - # print(f"sampled_actions:{sampled_actions}") sampled_actions = np.argmax(sampled_actions) - # print(f"sampled_actions:{sampled_actions}") - # We make sure action is within bounds - # legal_action = np.clip(sampled_actions, self.LOWER_BOUND, self.UPPER_BOUND) legal_action = sampled_actions - # print(f"np.squeeze(legal_action):{np.squeeze(legal_action)}") return np.squeeze(legal_action) def get_actor_model_simplified_perception_discrete_actions_nan(self): @@ -478,9 +508,13 @@ def get_critic_model_simplified_perception_discrete_actions(self): def policy_continuous_actions(self, state, noise_object): sampled_actions = tf.squeeze(self.actor_model(state)) noise = noise_object() + # Adding noise to action - sampled_actions = sampled_actions.numpy() + noise - # we can discretized the actions values with round(,0) + #print(f"debug: actions {sampled_actions[0]} {sampled_actions[1]}") + sampled_actions = 
sampled_actions.numpy() + sampled_actions[0] = sampled_actions[0] + noise + sampled_actions[1] = sampled_actions[1] + noise + legal_action_v = round( np.clip(sampled_actions[0], self.V_LOWER_BOUND, self.V_UPPER_BOUND), 1 ) @@ -613,13 +647,15 @@ def get_critic_model_image_continuous_actions(self): return model def get_actor_model_sp_continuous_actions(self): - # inputShape = (96, 128, 3) - inputs = Input(shape=self.OBSERVATION_SPACE_VALUES) - v_branch = self.build_branch(inputs, "v_output") - w_branch = self.build_branch(inputs, "w_output") + # last_init = tf.random_uniform_initializer(minval=-1, maxval=0.01) + hidden_init = tf.keras.initializers.GlorotUniform() + shared_layer = Dense(16, activation="relu", kernel_initializer=hidden_init) - v_branch = abs(v_branch) * self.V_UPPER_BOUND + v_branch = self.build_branch(inputs, "v_output", shared_layer) + w_branch = self.build_branch(inputs, "w_output", shared_layer) + + v_branch = ((v_branch + 1) / 2) * (self.V_UPPER_BOUND - self.V_LOWER_BOUND) + self.V_LOWER_BOUND w_branch = w_branch * self.W_LEFT_BOUND # create the model using our input (the batch of images) and @@ -632,11 +668,14 @@ def get_actor_model_sp_continuous_actions(self): # return the constructed network architecture return model - def build_branch(self, inputs, action_name): - last_init = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) - # inputs = layers.Input(shape=(self.OBSERVATION_SPACE_VALUES)) - x = Dense(128, activation="relu")(inputs) # 8, 16, 32 neurons - x = Dense(128, activation="relu")(x) # 8, 16, 32 neurons + def build_branch(self, inputs, action_name, shared_layer): + # last_init = tf.random_uniform_initializer(minval=-1, maxval=1) + hidden_init = tf.keras.initializers.GlorotUniform() + last_init = tf.keras.initializers.HeUniform() + + x = shared_layer(inputs) + x = Dense(16, activation="relu", kernel_initializer=hidden_init)(x) # 8, 16, 32 neurons + x = Dense(16, activation="relu", kernel_initializer=hidden_init)(x) # 8, 16, 32 neurons x = Dense(1, activation="tanh", kernel_initializer=last_init)(x) x = Activation("tanh", name=action_name)(x) @@ -647,8 +686,7 @@ def build_branch(self, inputs, action_name): def get_critic_model_sp_continuous_actions(self): # State as input state_input = layers.Input(shape=(self.OBSERVATION_SPACE_VALUES)) - state_out = layers.Dense(16, activation="relu")(state_input) - state_out = layers.Dense(32, activation="relu")(state_out) + state_out = layers.Dense(64, activation="relu")(state_input) # state_out = layers.Flatten()(state_out) # Actions V and W. 
For more actions, we should add more layers diff --git a/rl_studio/algorithms/utils.py b/rl_studio/algorithms/utils.py index e2236e0f0..6b84279dc 100644 --- a/rl_studio/algorithms/utils.py +++ b/rl_studio/algorithms/utils.py @@ -10,42 +10,22 @@ def save_actorcritic_model( agent, global_params, algoritmhs_params, environment, cumulated_reward, episode, text ): + timestamp = time.strftime('%Y%m%d-%H%M%S') agent.actor_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_ACTOR" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}" + f"{global_params.models_dir}/{timestamp}_{text}_" + f"C-{environment['circuit_name']}_" + f"S-{environment['states']}_" + f"A-{environment['action_space']}_" + f"MR-{int(cumulated_reward)}_" + f"E-{episode}/ACTOR" ) agent.critic_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_CRITIC" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}" + f"{global_params.models_dir}/{timestamp}_{text}_" + f"C-{environment['circuit_name']}_" + f"S-{environment['states']}_" + f"A-{environment['action_space']}_" + f"MR-{int(cumulated_reward)}_" + f"E-{episode}/CRITIC" ) - # save model in h5 format - agent.actor_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_ACTOR" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}.h5" - ) - agent.critic_model.save( - f"{global_params.models_dir}/{time.strftime('%Y%m%d-%H%M%S')}_{algoritmhs_params.model_name}_{text}_CRITIC" - f"Circuit-{environment['circuit_name']}_" - f"States-{environment['states']}_" - f"Actions-{environment['action_space']}_" - f"BATCH_Rewards-{environment['reward_function']}_" - f"MaxReward-{int(cumulated_reward)}_" - f"Epoch-{episode}.h5" - ) diff --git a/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml b/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml old mode 100644 new mode 100755 index 7f4a60460..547102f79 --- a/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml +++ b/rl_studio/config/config_training_followline_ddpg_f1_gazebo.yaml @@ -7,14 +7,24 @@ settings: env: simple # simple, nurburgring, montreal, curves, simple_laser, manual, autoparking agent: f1 # f1, autoparkingRL, auto_carla, mountain_car, robot_mesh, cartpole, turtlebot actions: continuous # continuous, simple, medium, hard, test, autoparking_simple - states: image #image, sp1 (simplified perception with 1 point), sp3 (simplified perception with 3 points), spn (simplified perception with n points) + states: sp5 #image, sp1 (simplified perception with 1 point), sp3 (simplified perception with 3 points), spn (simplified perception with n points) rewards: followline_center # followline_center, followline_center_v_w_linear framework: TensorFlow # TensorFlow, Pytorch - total_episodes: 5 - training_time: 6 + total_episodes: 500000 + training_time: 72 models_dir: "./checkpoints" logs_dir: 
"./logs" metrics_dir: "./metrics" + debug_stats: false + show_monitoring: true + steps_to_decrease: 50000 + decrease_substraction: 0.01 + decrease_min: 0.03 + reward_params: + function: pow + punish_zig_zag_value: 1 + punish_ineffective_vel: 0.5 + beta_1: 0.8 ros: ros_master_uri: "11311" @@ -22,9 +32,7 @@ ros: retraining: ddpg: - retrain_ddpg_tf_actor_model_name: "20230111_DDPG_Actor_conv2d32x64_Critic_conv2d32x64_BATCH_ACTOR_Max--100_Epoch-5_State-image_Actions-continuous_Rewards-followline_center_inTime-20230111-183048.h5" - retrain_ddpg_tf_critic_model_name: "20230111_DDPG_Actor_conv2d32x64_Critic_conv2d32x64_BATCH_CRITIC_Max--100_Epoch-5_State-image_Actions-continuous_Rewards-followline_center_inTime-20230111-183048.h5" - + retrain_ddpg_tf_model_name: "20231029-023138_IMPROVEDCircuit-simple_States-sp5_Actions-continuous_BATCH_Rewards-followline_center_MaxReward-2801_Epoch-118" inference: ddpg: inference_ddpg_tf_actor_model_name: @@ -33,15 +41,15 @@ inference: algorithm: ddpg: gamma: 0.9 - tau: 0.005 - std_dev: 0.2 + tau: 0.01 + std_dev: 0.1 model_name: DDPG_Actor_conv2d32x64_Critic_conv2d32x64 replay_memory_size: 50_000 memory_fraction: 0.20 - critic_lr: 0.002 - actor_lr: 0.001 + critic_lr: 0.003 + actor_lr: 0.002 buffer_capacity: 100_000 - batch_size: 64 + batch_size: 256 agents: f1: @@ -59,11 +67,13 @@ states: image: 0: [3] sp1: - 0: [10] + 0: [40] #TODO sp3: - 0: [5, 15, 22] + 0: [10, 50, 150] sp5: - 0: [3, 5, 10, 15, 20] + 0: [10, 40, 70, 130, 190] + sp10: + 0: [3, 15, 30, 45, 50, 70, 100, 120, 150, 190] spn: 0: [10] @@ -89,8 +99,8 @@ actions: test: 0: [0, 0] continuous: - v: [2, 30] - w: [-3, 3] + v: [1, 7] + w: [-1.5, 1.5] rewards: followline_center: @@ -101,8 +111,6 @@ rewards: min_reward: 5_000 highest_reward: 100 followline_center_v_w_linear: # only for continuous actions - beta_0: 3 - beta_1: -0.1 penal: 0 min_reward: 1_000 highest_reward: 100 @@ -116,11 +124,11 @@ gazebo_environments: robot_name: f1_renault model_state_name: f1_renault # f1_renault_multicamera_multilaser start_pose: 0 # 0, 1, 2, 3, 4 - alternate_pose: False - estimated_steps: 100 + alternate_pose: True + estimated_steps: 5000 sensor: camera - save_episodes: 5 - save_every_step: 10 + save_episodes: 1 + save_every_step: 20000 lap_completed: False save_model: True save_positions: True @@ -134,3 +142,4 @@ gazebo_environments: 2: [39.712, -30.741, 0.004, 0, 0, 1.56, 1.56] 3: [-6.861, -36.481, 0.004, 0, 0.01, -0.858, 0.613] 4: [20.043, 37.130, 0.003, 0, 0.103, -1.4383, -1.4383] + sleep: 0.05 diff --git a/rl_studio/envs/gazebo/f1/models/f1_env.py b/rl_studio/envs/gazebo/f1/models/f1_env.py old mode 100644 new mode 100755 index dc196efe4..23392b60a --- a/rl_studio/envs/gazebo/f1/models/f1_env.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env.py @@ -15,6 +15,7 @@ def __init__(self, **config): self.alternate_pose = config.get("alternate_pose") self.model_state_name = config.get("model_state_name") + self.sleep = config.get("sleep") self.vel_pub = rospy.Publisher("/F1ROS/cmd_vel", Twist, queue_size=5) self.unpause = rospy.ServiceProxy("/gazebo/unpause_physics", Empty) self.pause = rospy.ServiceProxy("/gazebo/pause_physics", Empty) @@ -27,6 +28,7 @@ def __init__(self, **config): # self.start_pose = np.array(config.get("start_pose")) self.start_pose = np.array(config.get("gazebo_start_pose")) self.start_random_pose = config.get("gazebo_random_start_pose") + self.end = 0 self._seed() diff --git a/rl_studio/envs/gazebo/f1/models/f1_env_camera.py b/rl_studio/envs/gazebo/f1/models/f1_env_camera.py old mode 100644 new mode 
100755 index bd9bf364a..bb28d5d1e --- a/rl_studio/envs/gazebo/f1/models/f1_env_camera.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env_camera.py @@ -127,6 +127,7 @@ def step(self, action) -> Tuple: f1_image_camera = self.image.getImage() self.previous_image = f1_image_camera.data + while np.array_equal(self.previous_image, f1_image_camera.data): if (time.time() - start) > 0.1: diff --git a/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py b/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py old mode 100644 new mode 100755 index c963ce957..5fc183f5d --- a/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py +++ b/rl_studio/envs/gazebo/f1/models/f1_env_ddpg.py @@ -51,7 +51,6 @@ def __init__(self, **config): self.actions = config["actions"] # Rewards - self.reward_function = config["reward_function"] self.rewards = config["rewards"] self.min_reward = config["min_reward"] self.beta_1 = config["beta_1"] @@ -454,7 +453,7 @@ def reset_camera(self): # step ################################################################################# - def step(self, action): + def step(self, action, show_monitoring=True): self._gazebo_unpause() vel_cmd = Twist() @@ -471,7 +470,7 @@ def step(self, action): self._gazebo_pause() ######### center - points = self.processed_image(f1_image_camera.data) + points = self.processed_image(f1_image_camera.data, show_monitoring) if self.state_space == "spn": self.point = points[self.poi] else: diff --git a/rl_studio/envs/gazebo/f1/models/reset.py b/rl_studio/envs/gazebo/f1/models/reset.py old mode 100644 new mode 100755 index 3ef041549..8a60dec54 --- a/rl_studio/envs/gazebo/f1/models/reset.py +++ b/rl_studio/envs/gazebo/f1/models/reset.py @@ -62,18 +62,48 @@ def reset_f1_state_sp(self): ##==== calculating State # simplified perception as observation - centrals_in_pixels, _ = self.simplifiedperception.calculate_centrals_lane( - f1_image_camera.data, - self.height, - self.width, - self.x_row, - self.lower_limit, - self.center_image, + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image ) + self._gazebo_pause() + states = centrals_normalized + state_size = len(states) + + return states, state_size + + def reset_f1_state_sp_line(self): + """ + reset for + - State: Simplified perception + - tasks: FollowLane and FollowLine + """ + self._gazebo_reset() + # === POSE === + if self.alternate_pose: + self._gazebo_set_random_pose_f1_follow_rigth_lane() + else: + self._gazebo_set_fix_pose_f1_follow_right_lane() + + ##==== get image from sensor camera + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() + + ##==== get center + points_in_red_line, _ = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image + ) + + if self.state_space == "spn": + self.point = points_in_red_line[self.poi] + else: + self.point = points_in_red_line[0] + + ##==== get State + ##==== simplified perception as observation states = self.simplifiedperception.calculate_observation( - centrals_in_pixels, self.center_image, self.pixel_region + points_in_red_line, self.center_image, self.pixel_region ) - state = [states[0]] - state_size = len(state) - return state, state_size + state_size = len(states) + + return states, state_size diff --git a/rl_studio/envs/gazebo/f1/models/rewards.py b/rl_studio/envs/gazebo/f1/models/rewards.py old mode 100644 new mode 100755 index 80da09950..b30fe823f 
--- a/rl_studio/envs/gazebo/f1/models/rewards.py +++ b/rl_studio/envs/gazebo/f1/models/rewards.py @@ -4,6 +4,12 @@ class F1GazeboRewards: + def __init__(self, **config): + self.reward_function_tuning = config.get("reward_function_tuning") + self.punish_zig_zag_value = config.get("punish_zig_zag_value") + self.punish_ineffective_vel = config.get("punish_ineffective_vel") + self.beta_1 = config.get("beta_1") + @staticmethod def rewards_followlane_centerline(center, rewards): """ @@ -87,6 +93,58 @@ def rewards_followline_center(self, center, rewards): return reward, done + def rewards_followline_velocity_center(self, v, w, state, range_v, range_w): + """ + original for Following Line + """ + # we reward proximity to the line + p_reward1, done1 = self.reward_proximity(state[-2]) + p_reward2, done2 = self.reward_proximity(state[-4]) + p_reward3, done3 = self.reward_proximity(state[-5]) + p_reward = (p_reward1 + p_reward2 + p_reward3)/3 + done = done1 and done2 and done3 + + v_norm = self.normalize_range(v, range_v[0], range_v[1]) + v_reward = v_norm * pow(p_reward, 2) + beta = self.beta_1 + reward = (beta * p_reward) + ((1 - beta) * v_reward) + + # penalizing steering to avoid zig-zag + w_punish = self.normalize_range(abs(w), 0, abs(range_w[1])) * self.punish_zig_zag_value + reward = reward - (p_reward * w_punish) + + # penalizing accelerating on bad positions + v_punish = reward * (1 - p_reward) * v_norm * self.punish_ineffective_vel + reward = reward - v_punish + + return reward, done + + def normalize_range(self, num, a, b): + return (num - a) / (b - a) + + def reward_proximity(self, state): + # sigmoid_pos = self.sigmoid_function(0, 1, state) + if abs(state) > 0.7: + return 0, True + else: + if self.reward_function_tuning == "sigmoid": + return 1-self.sigmoid_function(0, 1, abs(state), 5), False + elif self.reward_function_tuning == "linear": + return self.linear_function(1, -1.4, abs(state)), False + elif self.reward_function_tuning == "pow": + return pow(1 - abs(state), 5), False + else: + return 1 - abs(state), False + + + def sigmoid_function(self, start, end, x, slope=10): + slope = slope / (end - start) + sigmoid = 1 / (1 + np.exp(-slope * (x - ((start + end) / 2)))) + return sigmoid + + def linear_function(self, cross_x, slope, x): + return cross_x + (slope * x) + def rewards_followline_v_w_centerline( self, vel_cmd, center, rewards, beta_1, beta_0 ): diff --git a/rl_studio/envs/gazebo/f1/models/settings.py b/rl_studio/envs/gazebo/f1/models/settings.py old mode 100644 new mode 100755 index f5dbe2d30..2adcaf309 --- a/rl_studio/envs/gazebo/f1/models/settings.py +++ b/rl_studio/envs/gazebo/f1/models/settings.py @@ -12,7 +12,7 @@ class F1GazeboTFConfig(BaseModel): def __init__(self, **config): self.simplifiedperception = F1GazeboSimplifiedPerception() - self.f1gazeborewards = F1GazeboRewards() + self.f1gazeborewards = F1GazeboRewards(**config) self.f1gazeboutils = F1GazeboUtils() self.f1gazeboimages = F1GazeboImages() diff --git a/rl_studio/envs/gazebo/f1/models/simplified_perception.py b/rl_studio/envs/gazebo/f1/models/simplified_perception.py index 7bc281b9d..975a0f466 100644 --- a/rl_studio/envs/gazebo/f1/models/simplified_perception.py +++ b/rl_studio/envs/gazebo/f1/models/simplified_perception.py @@ -13,7 +13,7 @@ def processed_image(self, img, height, width, x_row, center_image): :parameters: input image 640x480 :return: centrals: lists with distance to center in pixels - cntrals_normalized: lists with distance in range [0,1] for calculating rewards + centrals_normalized: lists 
with distance in range [0,1] for calculating rewards """ image_middle_line = height // 2 img_sliced = img[image_middle_line:] diff --git a/rl_studio/envs/gazebo/f1/models/step.py b/rl_studio/envs/gazebo/f1/models/step.py old mode 100644 new mode 100755 index f45b20162..3a69fdc77 --- a/rl_studio/envs/gazebo/f1/models/step.py +++ b/rl_studio/envs/gazebo/f1/models/step.py @@ -1,5 +1,6 @@ from geometry_msgs.msg import Twist import numpy as np +import time from rl_studio.agents.utils import ( print_messages, @@ -50,42 +51,40 @@ def step_followline_state_image_actions_discretes(self, action, step): return state, reward, done, {} - def step_followline_state_sp_actions_discretes(self, action, step): + def step_followline_state_sp_actions_discretes(self, action, step, show_monitoring=True): self._gazebo_unpause() vel_cmd = Twist() vel_cmd.linear.x = self.actions[action][0] vel_cmd.angular.z = self.actions[action][1] self.vel_pub.publish(vel_cmd) + self._gazebo_pause() + + start = time.time() + fps = (1 / (start - self.end))*2 + self.end = start ##==== get image from sensor camera f1_image_camera, _ = self.f1gazeboimages.get_camera_info() - self._gazebo_pause() + self.previous_image = f1_image_camera.data + + while np.array_equal(self.previous_image, f1_image_camera.data): + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() ##==== get center - points_in_red_line, _ = self.simplifiedperception.processed_image( - f1_image_camera.data, self.height, self.width, self.x_row, self.center_image + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( + f1_image_camera.data, self.height, self.width, self.x_row, self.center_image, show_monitoring ) - if self.state_space == "spn": - self.point = points_in_red_line[self.poi] - else: - self.point = points_in_red_line[0] - - center = abs(float(self.center_image - self.point) / (float(self.width) // 2)) - # center = float(self.center_image - self.point) / (float(self.width) // 2) ##==== get State - ##==== simplified perception as observation - state = self.simplifiedperception.calculate_observation( - points_in_red_line, self.center_image, self.pixel_region - ) + state = centrals_normalized ##==== get Rewards if self.reward_function == "followline_center": - reward, done = self.f1gazeborewards.rewards_followline_center( - center, self.rewards + reward, done = self.f1gazeborewards.rewards_followline_velocity_center( + vel_cmd.linear.x, vel_cmd.angular.z, state, self.rewards ) - return state, reward, done, {} + return state, reward, done, {"fps": fps} def step_followline_state_image_actions_continuous(self, action, step): self._gazebo_unpause() @@ -135,40 +134,34 @@ def step_followline_state_sp_actions_continuous(self, action, step): vel_cmd.linear.x = action[0][0] vel_cmd.angular.z = action[0][1] self.vel_pub.publish(vel_cmd) + time.sleep(self.sleep) + self._gazebo_pause() + start = time.time() + fps = (1 / (start - self.end)) + self.end = start ##==== get image from sensor camera f1_image_camera, _ = self.f1gazeboimages.get_camera_info() - self._gazebo_pause() + + self.previous_image = f1_image_camera + while np.array_equal(self.previous_image, f1_image_camera.data): + f1_image_camera, _ = self.f1gazeboimages.get_camera_info() ##==== get center - points_in_red_line, _ = self.simplifiedperception.processed_image( + points_in_red_line, centrals_normalized = self.simplifiedperception.processed_image( f1_image_camera.data, self.height, self.width, self.x_row, self.center_image ) - if self.state_space == "spn": - self.point = 
points_in_red_line[self.poi] - else: - self.point = points_in_red_line[0] - - center = abs(float(self.center_image - self.point) / (float(self.width) // 2)) - # center = float(self.center_image - self.point) / (float(self.width) // 2) - - ##==== get State - ##==== simplified perception as observation - state = self.simplifiedperception.calculate_observation( - points_in_red_line, self.center_image, self.pixel_region - ) ##==== get Rewards if self.reward_function == "followline_center": - reward, done = self.f1gazeborewards.rewards_followline_center( - center, self.rewards + reward, done = self.f1gazeborewards.rewards_followline_velocity_center( + vel_cmd.linear.x, vel_cmd.angular.z, centrals_normalized, self.actions["v"] + , self.actions["w"] ) else: - reward, done = self.f1gazeborewards.rewards_followline_v_w_centerline( - vel_cmd, center, self.rewards, self.beta_1, self.beta_0 - ) + exit(1) - return state, reward, done, {} + return centrals_normalized, reward, done, {"fps": fps} class StepFollowLane(F1Env): diff --git a/rl_studio/requirements.txt b/rl_studio/requirements.txt new file mode 100644 index 000000000..6fbb21bf3 --- /dev/null +++ b/rl_studio/requirements.txt @@ -0,0 +1,60 @@ +ale_py==0.7.4 +carla_birdeye_view==1.1.1 +cv_bridge==1.16.2 +cv_bridge==1.16.0 +gym==0.17.0 +gymnasium==0.28.1 +gymnasium==0.27.0 +keras==2.11.0 +markdownTable==6.0.0 +matplotlib==3.3.2 +matplotlib==3.1.2 +networkx==2.5.1 +numpy==1.22.0 +numpy==1.17.4 +opencv_python==4.2.0.32 +opencv_python==4.2.0.34 +opencv_python_headless==4.7.0.68 +optuna==3.1.0 +pandas==1.5.3 +Pillow==8.3.1 +Pillow==9.3.0 +Pillow==7.0.0 +Pillow==10.1.0 +py3rosmsgs==1.18.2 +py_markdown_table==0.3.3 +pydantic==1.9.1 +pydantic==1.10.2 +pygame==2.4.0 +pygame==2.1.2 +Pygments==2.14.0 +Pygments==2.2.0 +Pygments==2.12.0 +Pygments==2.3.1 +PyYAML==6.0 +PyYAML==5.4 +PyYAML==6.0.1 +reloading==1.1.2 +rosbag==1.16.0 +rosbag==1.15.14 +rosgraph==1.16.0 +rosgraph==1.15.14 +rosnode==1.16.0 +rosnode==1.15.14 +rospy==1.16.0 +scikit_learn==1.1.2 +scikit_learn==1.1.1 +scipy==1.6.1 +scipy==1.4.1 +sensor_msgs==1.13.1 +six==1.14.0 +statsmodels==0.13.2 +tabulate==0.9.0 +tensorboardX==2.6 +tensorboardX==2.6.2.2 +tensorflow==2.9.1 +tensorflow_gpu==2.11.0 +tf==1.13.2 +torch==1.11.0+cu113 +torch==1.11.0 +tqdm==4.64.0
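
The training loop above lowers the Ornstein-Uhlenbeck noise scale every steps_to_decrease steps by decrease_substraction, clamped at decrease_min, and rebuilds OUActionNoise with the reduced value. Below is a minimal sketch of that schedule, assuming a hypothetical helper name decayed_std and the defaults from config_training_followline_ddpg_f1_gazebo.yaml (std_dev: 0.1, steps_to_decrease: 50000, decrease_substraction: 0.01, decrease_min: 0.03):

# Step-wise linear decay of the exploration noise scale, mirroring the loop in
# train_followline_ddpg_f1_gazebo_tf.py. decayed_std is an illustrative helper
# only, not part of the patch.
def decayed_std(all_steps, std_dev=0.1, steps_to_decrease=50_000,
                decrease_substraction=0.01, decrease_min=0.03):
    # one decrement every steps_to_decrease environment steps, never below the floor
    n_decays = all_steps // steps_to_decrease
    return max(decrease_min, std_dev - n_decays * decrease_substraction)

# With these defaults the scale reaches its floor after 350,000 steps:
# decayed_std(0) -> 0.1, decayed_std(100_000) -> 0.08, decayed_std(400_000) -> 0.03

The reworked actor head in get_actor_model_sp_continuous_actions() ends in tanh activations, so each branch outputs a value in [-1, 1]; the linear-velocity branch is then rescaled affinely into [V_LOWER_BOUND, V_UPPER_BOUND], while the angular branch is only multiplied by W_LEFT_BOUND. A sketch of that rescaling, with scale_v as an illustrative name and the bounds taken from the continuous action block of the YAML (v: [1, 7]):

# Affine mapping of a tanh output onto the configured velocity range, as done
# for v_branch in ddpg.py. scale_v is illustrative only, not part of the patch.
def scale_v(tanh_out, v_lower=1.0, v_upper=7.0):
    # tanh_out in [-1, 1]  ->  v in [v_lower, v_upper]
    return ((tanh_out + 1.0) / 2.0) * (v_upper - v_lower) + v_lower

# scale_v(-1.0) -> 1.0, scale_v(0.0) -> 4.0, scale_v(1.0) -> 7.0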