training.py
import math
import time
import itertools
import argparse
import datetime
import torch
import sys
from agent import ContinuousDubinGym, DiscreteDubinGym
sys.path.append('./algorithm/SAC/')
from sac import SAC
from replay_memory import ReplayMemory
from torch.utils.tensorboard import SummaryWriter

parser = argparse.ArgumentParser(description='Dubins Env & Soft Actor-Critic Args')
parser.add_argument('--env-name', default="ContinuousDubinGym",
                    help='Dubin Gym environment (default: ContinuousDubinGym)')
parser.add_argument('--policy', default="Gaussian",
                    help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--eval', type=bool, default=False,
                    help='Evaluates the policy every 10 episodes (default: False)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                    help='target smoothing coefficient (τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                    help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                    help='Temperature parameter α determines the relative importance of the '
                         'entropy term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                    help='Automatically adjust α (default: False)')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
                    help='random seed (default: 123456)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                    help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=200000, metavar='N',
                    help='maximum number of environment steps (default: 200000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                    help='hidden layer size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=1000, metavar='N',
                    help='steps sampling random actions before using the policy (default: 1000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='value target update per number of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
parser.add_argument('--cuda', type=int, default=0, metavar='N',
                    help='run on CUDA (default: 0)')
parser.add_argument('--max_episode_length', type=int, default=400, metavar='N',
                    help='max episode length (default: 400)')
args = parser.parse_args()
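
# A minimal launch sketch (the flag names come from the parser above; the concrete
# values shown are illustrative, not prescribed by this repository):
#
#   python training.py --env-name ContinuousDubinGym --policy Gaussian \
#       --batch_size 256 --num_steps 200000 --max_episode_length 400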


def main():
    # Pick the environment variant requested on the command line.
    if args.env_name == "ContinuousDubinGym":
        env = ContinuousDubinGym()
    else:
        env = DiscreteDubinGym()

    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    memory = ReplayMemory(args.replay_size, args.seed)

    writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), 'DeepracerGym',
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    total_numsteps = 0
    updates = 0
    num_goal_reached = 0

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            env.render()
            start_time = time.time()
            if args.start_steps > total_numsteps:
                action = env.action_space.sample()  # warm-up phase: random actions
            else:
                action = agent.select_action(state)  # sample action from the policy

            next_state, reward, done, _ = env.step(action)
            if (reward > 9) and (episode_steps > 1):
                num_goal_reached += 1

            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            if episode_steps > args.max_episode_length:
                done = True

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == args.max_episode_length else float(not done)
            # mask = float(not done)

            memory.push(state, action, reward, next_state, mask)  # Append transition to memory
            state = next_state

        # if i_episode % UPDATE_EVERY == 0:
        if len(memory) > args.batch_size:
            # Number of gradient updates performed after each episode
            for i in range(args.updates_per_step * args.max_episode_length):
                # Update parameters of all the networks
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                    memory, args.batch_size, updates)
                writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                writer.add_scalar('entropy_temperature/alpha', alpha, updates)
                updates += 1

        if total_numsteps > args.num_steps:
            break

        if episode_steps > 1:
            writer.add_scalar('reward/train', episode_reward, i_episode)
            writer.add_scalar('reward/episode_length', episode_steps, i_episode)
            writer.add_scalar('reward/num_goal_reached', num_goal_reached, i_episode)

        print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
        print("Number of Goals Reached: ", num_goal_reached)

    print('----------------------Training Ending----------------------')
    agent.save_model("burger", suffix="1")
    return True
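

# The --eval flag above is parsed but never consumed in main(). The sketch below is an
# assumption added for illustration, not part of the original script: it presumes
# agent.select_action accepts an `evaluate` flag for deterministic action selection
# (as in the reference PyTorch SAC implementation); if the local sac module differs,
# drop that argument.
def evaluate_policy(env, agent, episodes=10):
    """Roll out the current policy for a few episodes and return the mean return."""
    total_reward = 0.0
    for _ in range(episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and steps < args.max_episode_length:
            action = agent.select_action(state, evaluate=True)  # assumed eval-mode signature
            state, reward, done, _ = env.step(action)
            total_reward += reward
            steps += 1
    return total_reward / episodes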


if __name__ == '__main__':
    main()