DDPG update with continuous action spaces #157

Status: Open. Wants to merge 12 commits into noetic-devel.

182 changes: 182 additions & 0 deletions behavior_metrics/brains/f1rl/trainContinuous.py
@@ -0,0 +1,182 @@
import os
import time
from datetime import datetime
import pickle
import torch
from torch.utils.tensorboard import SummaryWriter
import gym
from brains.f1rl.utils import liveplot
import gym_gazebo
import numpy as np
from gym import logger, wrappers
from brains.f1rl.utils.ddpg import DDPG
import brains.f1rl.utils.ddpg_utils.settingsDDPG as settings
from PIL import Image

def render():
render_skip = 0 # Skip first X episodes.
render_interval = 50 # Show render Every Y episodes.
render_episodes = 10 # Show Z episodes every rendering.

if (episode % render_interval == 0) and (episode != 0) and (episode > render_skip):
env.render()
elif ((episode - render_episodes) % render_interval == 0) and (episode != 0) and (episode > render_skip) and \
(render_episodes < episode):
env.render(close=True)

# if __name__ == '__main__':
print(settings.title)
print(settings.description)

current_env = settings.current_env
if current_env == "laser":
env_id = "GazeboF1LaserEnvDDPG-v0"
env = gym.make('GazeboF1LaserEnvDDPG-v0')
elif current_env == "camera":
env_id = "GazeboF1CameraEnvDDPG-v0"
env = gym.make('GazeboF1CameraEnvDDPG-v0')
else:
    raise ValueError("No valid environment selected: '{}' (expected 'laser' or 'camera')".format(current_env))

outdir = './logs/f1_ddpg_gym_experiments/'

if not os.path.exists(outdir):
os.makedirs(outdir+'images/')

env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
estimated_steps_per_lap = 4000
lap_completed = False

if settings.load_model:
save_path = outdir+'model/'
model_path = save_path
else:
save_path = outdir+'model/'
model_path = None

highest_reward = 0

total_episodes = settings.total_episodes

start_time = time.time()

seed = 123
save_iter = max(1, total_episodes // 20)  # checkpoint interval: save roughly 20 times per run
render_training = False  # whether to render the environment during training
writer = SummaryWriter(outdir)
env.seed(seed)

ddpg = DDPG(env_id,
32,
2,
render=False,
num_process=1,
memory_size=1000000,
lr_p=1e-3,
lr_v=1e-3,
gamma=0.99,
polyak=0.995,
explore_size=2000,
step_per_iter=1000,
batch_size=256,
min_update_step=1000,
update_step=50,
action_noise=0.1,
seed=seed)
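# explore_size: environment steps taken with uniformly sampled actions before the
# policy (plus Gaussian noise of scale action_noise) is used;
# polyak: soft-update coefficient for the target networks;
# min_update_step / update_step: replay warm-up size and how often batches of
# batch_size transitions are sampled for gradient updates.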

print(settings.lets_go)

max_action = [12., 2]
min_action = [2., -2]

for episode in range(total_episodes):

global_steps = episode * ddpg.step_per_iter  # steps taken in previous iterations (episode is 0-indexed)
log = dict()
num_steps = 0
num_episodes = 0
total_reward = 0
min_episode_reward = float('inf')
max_episode_reward = float('-inf')
lap_completed = False
cumulated_reward = 0  # Should going forward give more reward than turning left/right?

while num_steps < ddpg.step_per_iter:
state = env.reset()
# state = self.running_state(state)
episode_reward = 0

for t in range(1000):

if global_steps < ddpg.explore_size: # explore
action = env.action_space.sample()
else: # action with noise
action = ddpg.choose_action(state, ddpg.action_noise)

# Rescale the policy output from [-1, 1] to the physical bounds given by
# min_action / max_action; the raw action is kept for the replay buffer.
mod_action = np.array(action, dtype=np.float64)

for itr in range(len(action)):
    mod_action[itr] = min_action[itr] + 0.5*(max_action[itr] - min_action[itr])*(action[itr]+1)

next_state, reward, done, info = env.step(mod_action)
# next_state = self.running_state(next_state)
mask = 0 if done else 1
# ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
ddpg.memory.push(state, action, reward, next_state, mask, None)

# print("Points:", info['points'])
# print("Errors:", info['errors'])
# observation_image = Image.fromarray(info['image'].reshape(32,32))
# observation_image.save(outdir+'/images/obs'+str(episode)+str(t)+'.jpg')

episode_reward += reward
cumulated_reward += reward
global_steps += 1
num_steps += 1

if global_steps >= ddpg.min_update_step and global_steps % ddpg.update_step == 0:
for _ in range(ddpg.update_step):
batch = ddpg.memory.sample(
ddpg.batch_size) # random sample batch
ddpg.update(batch)

if done or num_steps >= ddpg.step_per_iter:
if highest_reward < cumulated_reward:
highest_reward = cumulated_reward
break

state = next_state

if num_steps > estimated_steps_per_lap and not lap_completed:
print("LAP COMPLETED!!")
lap_completed = True

num_episodes += 1
total_reward += episode_reward
min_episode_reward = min(episode_reward, min_episode_reward)
max_episode_reward = max(episode_reward, max_episode_reward)

log['num_steps'] = num_steps
log['num_episodes'] = num_episodes
log['total_reward'] = total_reward
log['avg_reward'] = total_reward / num_episodes
log['max_episode_reward'] = max_episode_reward
log['min_episode_reward'] = min_episode_reward

print(f"Iter: {episode}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
f"average reward: {log['avg_reward']: .4f}")

# record reward information
writer.add_scalar("total reward", log['total_reward'], episode)
writer.add_scalar("average reward", log['avg_reward'], episode)
writer.add_scalar("min reward", log['min_episode_reward'], episode)
writer.add_scalar("max reward", log['max_episode_reward'], episode)
writer.add_scalar("num steps", log['num_steps'], episode)

if episode % save_iter == 0:
ddpg.save(save_path)

torch.cuda.empty_cache()
env.close()
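
Note on the action rescaling above: the per-dimension loop maps the policy output from [-1, 1] onto the bounds given by min_action and max_action. A minimal vectorized sketch of the same affine map, for reference only (the helper name rescale_action is hypothetical and not part of this PR):

import numpy as np

def rescale_action(action, min_action, max_action):
    """Affine map from [-1, 1] to [min_action, max_action], element-wise."""
    action = np.asarray(action, dtype=np.float64)
    low = np.asarray(min_action, dtype=np.float64)
    high = np.asarray(max_action, dtype=np.float64)
    return low + 0.5 * (high - low) * (action + 1.0)

# With the bounds used above, the midpoint 0.0 maps to the centre of each range:
print(rescale_action([0.0, 0.0], [2., -2.], [12., 2.]))   # -> [7. 0.]
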
137 changes: 137 additions & 0 deletions behavior_metrics/brains/f1rl/utils/ddpg.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python
import pickle

import numpy as np
import torch
import torch.optim as optim

from brains.f1rl.utils.ddpg_utils.ddpg_step import ddpg_step
from brains.f1rl.utils.ddpg_utils.Policy_ddpg import Policy
from brains.f1rl.utils.ddpg_utils.Value_ddpg import Value
from brains.f1rl.utils.ddpg_utils.ddpg_utils import Memory, get_env_info, check_path, device, FLOAT, ZFilter


class DDPG:
def __init__(self,
env_id,
num_states,
num_actions,
render=False,
num_process=1,
memory_size=1000000,
lr_p=1e-3,
lr_v=1e-3,
gamma=0.99,
polyak=0.995,
explore_size=10000,
step_per_iter=3000,
batch_size=100,
min_update_step=1000,
update_step=50,
action_noise=0.1,
seed=1,
model_path=None
):
self.env_id = env_id
self.num_states = num_states
self.num_actions = num_actions
self.gamma = gamma
self.polyak = polyak
self.memory = Memory(memory_size)
self.explore_size = explore_size
self.step_per_iter = step_per_iter
self.render = render
self.num_process = num_process
self.lr_p = lr_p
self.lr_v = lr_v
self.batch_size = batch_size
self.min_update_step = min_update_step
self.update_step = update_step
self.action_noise = action_noise
self.model_path = model_path
self.seed = seed

self._init_model()

def _init_model(self):
"""init model from parameters"""

self.action_low = -1
self.action_high = 1

# seeding
np.random.seed(self.seed)
torch.manual_seed(self.seed)

self.policy_net = Policy(
self.num_actions, self.action_high).to(device)
self.policy_net_target = Policy(
self.num_actions, self.action_high).to(device)

self.value_net = Value(64, self.num_actions).to(device)
self.value_net_target = Value(64, self.num_actions).to(device)

self.running_state = ZFilter((self.num_states,), clip=5)

if self.model_path:
print("Loading Saved Model {}_ddpg.p".format(self.env_id))
self.policy_net, self.value_net, self.running_state = pickle.load(
open('{}/{}_ddpg.p'.format(self.model_path, self.env_id), "rb"))

self.policy_net_target.load_state_dict(self.policy_net.state_dict())
self.value_net_target.load_state_dict(self.value_net.state_dict())

self.optimizer_p = optim.Adam(
self.policy_net.parameters(), lr=self.lr_p)
self.optimizer_v = optim.Adam(
self.value_net.parameters(), lr=self.lr_v)

def choose_action(self, state, noise_scale):
    """select an action for the given observation"""
    # Reshape the observation into the (1, 32, 32) layout expected by the convolutional policy.
    state = np.asarray(state, dtype=np.float32).reshape(1, 32, 32)
    state = FLOAT(state).unsqueeze(0).to(device)
with torch.no_grad():
action, log_prob = self.policy_net.get_action_log_prob(state)
action = action.cpu().numpy()[0]
# add noise
noise = noise_scale * np.random.randn(self.num_actions)
action += noise
action = np.clip(action, -self.action_high, self.action_high)
return action

def eval(self, i_iter, render=False):
    """evaluate the current policy; assumes self.env has been set by the caller"""
    state = self.env.reset()
    test_reward = 0
while True:
if render:
self.env.render()
# state = self.running_state(state)
action = self.choose_action(state, 0)
state, reward, done, _ = self.env.step(action)
test_reward += reward
if done:
break
print(f"Iter: {i_iter}, test Reward: {test_reward}")
self.env.close()

def update(self, batch):
"""learn model"""
batch_state = FLOAT(batch.state).to(device)
batch_action = FLOAT(batch.action).to(device)
batch_reward = FLOAT(batch.reward).to(device)
batch_next_state = FLOAT(batch.next_state).to(device)
batch_mask = FLOAT(batch.mask).to(device)

# update by DDPG
alg_step_stats = ddpg_step(self.policy_net, self.policy_net_target, self.value_net, self.value_net_target, self.optimizer_p,
self.optimizer_v, batch_state, batch_action, batch_reward, batch_next_state, batch_mask,
self.gamma, self.polyak)

def save(self, save_path):
"""save model"""
check_path(save_path)
pickle.dump((self.policy_net, self.value_net, self.running_state),
open('{}/{}_ddpg.p'.format(save_path, self.env_id), 'wb'))
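
The ddpg_step function imported from ddpg_utils is not included in this diff. For context, here is a minimal sketch of the standard DDPG update the call above appears to expect, assuming the critic takes (state, action) pairs and that rewards and masks are column vectors; the names and shapes below are assumptions, not this PR's implementation:

import torch
import torch.nn.functional as F

def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_policy, optimizer_value,
                     states, actions, rewards, next_states, masks,
                     gamma, polyak):
    # Critic: regress Q(s, a) towards r + gamma * mask * Q_target(s', pi_target(s')).
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        target_q = rewards + gamma * masks * value_net_target(next_states, next_actions)
    value_loss = F.mse_loss(value_net(states, actions), target_q)
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # Actor: ascend Q(s, pi(s)) by minimizing its negation.
    policy_loss = -value_net(states, policy_net(states)).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    # Soft-update the target networks with retention coefficient polyak.
    with torch.no_grad():
        for target, source in ((policy_net_target, policy_net),
                               (value_net_target, value_net)):
            for p_t, p in zip(target.parameters(), source.parameters()):
                p_t.mul_(polyak).add_((1.0 - polyak) * p)

    return {"value_loss": value_loss.item(), "policy_loss": policy_loss.item()}
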
58 changes: 58 additions & 0 deletions behavior_metrics/brains/f1rl/utils/ddpg_utils/Policy_ddpg.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def init_weight(m):
if isinstance(m, nn.Linear):
nn.init.xavier_normal_(m.weight)
nn.init.constant_(m.bias, 0.0)


class Policy(nn.Module):
def __init__(
self,
dim_action,
max_action=None,
activation=nn.LeakyReLU
):
super(Policy, self).__init__()
self.dim_action = dim_action
self.max_action = max_action

self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)
self.bn1 = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
self.bn2 = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
self.bn3 = nn.BatchNorm2d(32)

# Number of Linear input connections depends on output of conv2d layers
# and therefore the input image size, so compute it.
def conv2d_size_out(size, kernel_size = 5, stride = 2):
return (size - (kernel_size - 1) - 1) // stride + 1

convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(32)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(32)))
linear_input_size = convw * convh * 32
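# For the 32x32 observations used here, each stride-2 / kernel-5 conv shrinks the
# spatial size 32 -> 14 -> 5 -> 1, so linear_input_size = 1 * 1 * 32 = 32.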
self.head = nn.Linear(linear_input_size, self.dim_action)

self.apply(init_weight)

def forward(self, x):
x = x.to(device)
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
action = self.head(x.view(x.size(0), -1))
return action * self.max_action

def get_action_log_prob(self, states):
action = self.forward(states)
return action, None
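
A quick shape check for this policy, as a sketch only, assuming the 32x32 single-channel observations and two-dimensional action used by the training script in this PR:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy = Policy(dim_action=2, max_action=1.0).to(device)

obs = torch.rand(4, 1, 32, 32)   # batch of four 32x32 single-channel observations
actions = policy(obs)            # shape (4, 2); outputs scaled by max_action
print(actions.shape)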
