'''
This code implements basic tabular Q-learning for the CartPole-v0 environment
(written against the classic, pre-0.26 gym API, where env.step returns a 4-tuple).
The discretization scheme and overall structure follow:
https://github.com/philtabor/OpenAI-Cartpole
'''
import math
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
# Returns the (key, value) pair with the maximum value in a dictionary
def maxDict(d):
    max_key = None
    max_v = float('-inf')
    for key, val in d.items():
        if val > max_v:
            max_v = val
            max_key = key
    return max_key, max_v
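
# A quick sanity check on hypothetical values: maxDict picks the key whose
# value is largest, so {0: 1.5, 1: -2.0} yields (0, 1.5).
assert maxDict({0: 1.5, 1: -2.0}) == (0, 1.5)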
# Turns a state into a string, e.g. [1, 2, 3, 4] -> '1234';
# an allStates list is easier to generate with string states
def get_state_as_string(state):
    string_state = ''.join(str(int(e)) for e in state)
    return string_state
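
# Sanity check: a list of bin indices round-trips to the expected key string.
assert get_state_as_string([1, 2, 3, 4]) == '1234'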
# Puts continuous observations into discrete bins
def assign_bins(observation, bins):
    state = [0]*4
    for i in range(4):
        # np.digitize returns 0 for values below the lowest edge, and that
        # state has no entry in Q, so clamp the index into [1, len(bins[i])]
        # to guarantee the state string always matches a key in Q
        state[i] = min(max(int(np.digitize(observation[i], bins[i])), 1), len(bins[i]))
    #print(state)
    return state
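
# Example of the binning, assuming the 10-edge grids defined below: a cart
# position of 0.1 has five edges of np.linspace(-4.8, 4.8, 10) at or below
# it, so it maps to bin 5.
assert np.digitize(0.1, np.linspace(-4.8, 4.8, 10)) == 5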
# Runs the Q-learning algorithm for one episode (uses the global env created below)
def oneEpisode(bins, Q, eta, gamma, epsilon, animate):
    observation = env.reset()  # initialize to a semi-random start state
    state = get_state_as_string(assign_bins(observation, bins))
    #state = observation # FOR TRYING CONTINUOUS
    done = False
    totalReward = 0
    t = 0  # number of time-steps
    while not done:
        if animate:
            env.render()
        t += 1
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()  # epsilon-greedy exploration
        else:
            action = maxDict(Q[state])[0]  # action with the highest Q-value for this state
        observation, reward, done, info = env.step(action)  # results of the action
        newState = get_state_as_string(assign_bins(observation, bins))
        #newState = observation # FOR TRYING CONTINUOUS SPACE
        totalReward += reward
        #weightVec = weightVec - eta*(Q_opt - (reward + discount*V_opt))*observation
        if done and t < 500:
            reward = -10000  # harsh penalty for failing before the 500-step cap
        newAction, maxQ = maxDict(Q[newState])
        # Q-learning update
        Q[state][action] -= eta * (Q[state][action] - (reward + gamma * maxQ))
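        # Equivalently, the textbook TD form of the same update:
        #   Q[s][a] += eta * ((reward + gamma * max_a' Q[s'][a']) - Q[s][a])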
        state = newState
    env.close()
    return totalReward, t
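
# Example call (a sketch; env, bins, and Q are all defined below, so this
# only works after that setup has run):
#   episode_reward, steps = oneEpisode(bins, Q, eta=0.003, gamma=1.0,
#                                      epsilon=0.1, animate=False)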
####################################################################################
##################################BEGIN PROGRAM#####################################
####################################################################################
env = gym.make('CartPole-v0')
env._max_episode_steps = 500  # raise the cap from the default 200 steps
###HYPERPARAMETERS###
discrete = 10
maxStates = discrete**4  # 10**4 states given the bin discretization
gamma = 1  # discount factor
eta = 0.003  # step size
numEpisodes = 5000  # episodes to train over
animate = False  # whether to render the animation
exportFilename = "Q output tau change.txt"
###INITIALIZE LIST OF ALL POSSIBLE STATES###
all_states = []
for i in range(1, discrete + 1):
    for j in range(1, discrete + 1):
        for k in range(1, discrete + 1):
            for l in range(1, discrete + 1):
                all_states.append(str(i)+str(j)+str(k)+str(l))
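
# The same list can be built more compactly; this behavior-equivalent sketch
# double-checks the nested loops above:
import itertools
assert all_states == [''.join(map(str, combo))
                      for combo in itertools.product(range(1, discrete + 1), repeat=4)]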
###INITIALIZE Q DICTIONARY###
Q = {}
for state in all_states:
    Q[state] = {}
    for action in range(env.action_space.n):
        Q[state][action] = 0
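
# Equivalently, the whole zero-initialized table as one dict comprehension:
# Q = {s: {a: 0 for a in range(env.action_space.n)} for s in all_states}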
###CREATE DISCRETE BINS###
bins = np.zeros((4, discrete))  # 10 bin edges for each observation variable
bins[0] = np.linspace(-4.8, 4.8, discrete)  # observation[0]: cart position, bounded to +/-4.8
bins[1] = np.linspace(-5, 5, discrete)      # observation[1]: cart velocity, unbounded in gym
bins[2] = np.linspace(-.4, .4, discrete)    # observation[2]: pole angle, bounded to +/-0.418 rad (~24 degrees)
bins[3] = np.linspace(-5, 5, discrete)      # observation[3]: pole angular velocity, unbounded in gym
print(bins)
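
# Each row of bins holds the 10 edges for one observation variable; e.g.
# bins[0] runs from -4.8 to 4.8 in steps of ~1.07, and np.digitize maps a
# cart-position reading to an index in 1..10 (after the clamp in assign_bins).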
###TRAIN & EVALUATE###
episodeLengths = []
episodeRewards = []
iterSolved = 0
for n in range(numEpisodes):
    epsilon = 1.0 / np.sqrt(n + 1)  # decaying epsilon-greedy schedule
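    # Under this schedule epsilon starts at 1.0 (n = 0), falls to ~0.1 by
    # n = 99, and to ~0.014 by the last episode, so exploration tapers off.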
    episode_reward, episode_length = oneEpisode(bins, Q, eta, gamma, epsilon, animate)
    episodeLengths.append(episode_length)
    episodeRewards.append(episode_reward)
    # print("\nTime: " + str(episode_length))
    # print("N = " + str(n))
    # Treat the task as solved once the average reward over the last 20
    # episodes reaches 195 (CartPole-v0's reward threshold, though over a
    # shorter window than the official 100-episode criterion)
    if len(episodeRewards) >= 20 and iterSolved == 0:
        total = 0
        for i in range(len(episodeRewards) - 20, len(episodeRewards)):
            total += episodeRewards[i]
        if total / 20. >= 195.:
            iterSolved = n
print("# episodes to solve: " + str(iterSolved))
###PLOTTING###
windowSize = 20
N = len(episodeRewards)
running_avg = [None]*N
for t in range(N):
    if t >= windowSize - 1:
        running_avg[t] = np.mean(episodeRewards[(t + 1 - windowSize):(t + 1)])
    else:
        running_avg[t] = np.mean(episodeRewards[0:(t + 1)])
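# (Before the window fills, running_avg falls back to the mean of all
# episodes so far, so the first windowSize - 1 points use a shorter window.)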
plt.plot(running_avg)
plt.title("20-Episode Average Score")
plt.xlabel("Episode")
plt.ylabel("Average Score")
plt.show()
###OUTPUT LEARNED POLICY TO FILE###
file = open(exportFilename, "w")
file.write("s,a")  # CSV header: state,action
for state in all_states:
    highest = -float("Inf")
    for action in Q[state]:
        if Q[state][action] > highest:
            highest = Q[state][action]
            act = action
        elif Q[state][action] == highest:  # ties (e.g. unvisited states) are broken randomly
            act = random.randint(0, env.action_space.n - 1)
    file.write("\n" + str(state) + "," + str(act))  # CSV row: state,action
file.close()
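
# The exported policy is a two-column CSV keyed by the state string, for
# example (rows illustrative, not actual learned values):
#   s,a
#   1111,0
#   1112,1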
'''
###OUTPUT HEURISTIC POLICY TO FILE###
file = open("Heuristic Policy.txt", "w")
file.write("s,a")  # CSV header: state,action
for state in all_states:
    if int(state[3]) > 4 and int(state[2]) > 4:
        act = 1
    elif int(state[3]) < 5 and int(state[2]) < 5:
        act = 0
    else:
        q = random.random()  # uniform in [0, 1), so the 80/20 split below works
        if q > 0.2:
            act = 1
        else:
            act = 0
    file.write("\n" + str(state) + "," + str(act))  # CSV row: state,action
file.close()
'''
'''
###OUTPUT RANDOM POLICY TO FILE###
file = open("Random Policy.txt", "w")
file.write("s,a")  # CSV header: state,action
for state in all_states:
    act = random.randint(0, env.action_space.n - 1)
    file.write("\n" + str(state) + "," + str(act))  # CSV row: state,action
file.close()
'''