sarsa.py
import gym
import numpy as np
import matplotlib.pyplot as plot
from collections import defaultdict
import gym_pacman.envs.util as util  # unused directly; importing it loads gym_pacman, which registers the Pacman environments
###################################################
# Environment Setup #
###################################################
env = gym.make('BerkeleyPacmanPO-v0')
env.seed(1)
done = False
# episodes, steps, and rewards
n_episodes = 200000
n_steps = 100
graph_steps = 150
rewards = []
# Q-table: maps each state to a list of five Q-values, one per action
q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0, 0.0])
# Hyperparameters for the learning algorithm
alpha = 0.05    # learning rate: smaller values update more slowly but more stably
gamma = 0.9     # discount factor: values near 1 give future rewards more weight
epsilon = 0.3   # exploration rate: higher values mean more random actions
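# Note (illustration): because q_table is a defaultdict, unseen states need
# no explicit initialization; the first access creates the entry, e.g.
#   q_table['some_new_state']            -> [0.0, 0.0, 0.0, 0.0, 0.0]
#   np.argmax(q_table['some_new_state']) -> 0 (ties broken by lowest index)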
###################################################
# Save Weighted Graph #
###################################################
def moving_avg_graph(title, file_name):
    # uniform weights implement a simple moving average over graph_steps episodes
    weights = np.repeat(1.0, graph_steps) / graph_steps
    moving_avg = np.convolve(rewards, weights, 'valid')
    equalized_len = n_episodes - len(moving_avg)
    # x and y points (x is offset so each average aligns with its last episode)
    x = np.arange(equalized_len, n_episodes)
    y = moving_avg
    # scatter plot
    plot.scatter(x, y, marker='.')
    # axis labels and title
    plot.xlabel('Episodes')
    plot.ylabel('Reward')
    plot.title(title)
    # save the figure (the caller passes an .svg file name)
    plot.savefig("plots_and_data/" + file_name)
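# Intuition check: a 'valid' convolution with uniform weights is a plain
# moving average, e.g. with a hypothetical window of 2:
#   np.convolve([1, 2, 3, 4], np.repeat(0.5, 2), 'valid') -> [1.5, 2.5, 3.5]
# The first (graph_steps - 1) episodes therefore have no averaged point,
# which is why x starts at equalized_len above.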
###################################################
# E Greedy #
###################################################
def policy(s):
    # epsilon-greedy: explore with probability epsilon, otherwise act greedily
    if np.random.uniform(0, 1) < epsilon:
        new_action = env.action_space.sample()
    else:
        new_action = np.argmax(q_table[s])
    return new_action
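# With epsilon = 0.3, roughly 30% of actions are sampled uniformly from the
# action space (exploration) and the rest follow the current Q-values
# (exploitation). Decaying epsilon over episodes is a common refinement,
# but this script keeps it fixed.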
###################################################
# SARSA #
###################################################
def learn(s, s_prime, r, a, a_prime):
    # SARSA update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    current = q_table[s][a]
    estimate = r + gamma * q_table[s_prime][a_prime]
    q_table[s][a] = current + alpha * (estimate - current)
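# Worked example with this script's constants and hypothetical Q-values:
# if q_table[s][a] = 0.0, r = 1.0 and q_table[s_prime][a_prime] = 0.5, then
#   estimate      = 1.0 + 0.9 * 0.5 = 1.45
#   q_table[s][a] = 0.0 + 0.05 * (1.45 - 0.0) = 0.0725
# The target uses the action actually selected next (a_prime), which is what
# makes SARSA on-policy, unlike Q-learning's max over actions.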
###################################################
# Main #
###################################################
if __name__ == '__main__':
    for episode in range(n_episodes):
        state = env.reset("trappedClassic.lay")
        action = policy(state)
        for i in range(n_steps):
            # env.render()
            state_prime, reward, done, info = env.step(action)
            action_prime = policy(state_prime)
            learn(state, state_prime, reward, action, action_prime)
            state = state_prime
            action = action_prime
            if done:
                break
        rewards.append(info['episode']['r'])
        print([str(episode), str(info['episode']['r'])])
    moving_avg_graph(str(n_episodes) + ' SARSA',
                     str(n_episodes) + '_sarsa.svg')
    env.close()
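# Usage sketch (assuming the gym_pacman package with its BerkeleyPacmanPO-v0
# registration is installed, and that a plots_and_data/ directory exists
# next to this script):
#   python sarsa.py
# Each episode prints its index and total reward, and the final moving-average
# plot is written to plots_and_data/200000_sarsa.svg.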