q_mtn_car.py
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('MountainCar-v0')
minimum_epsilon = 0.5  # defined but not used below
minimum_eta = 0.5      # defined but not used below
np.random.seed(1)
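
# Tabular Q-learning on MountainCar-v0: the continuous observation
# (position, velocity) is discretized into a 100 x 2 grid of states, and a
# 100 x 2 x 3 Q-table (3 discrete actions: push left, no-op, push right)
# is learned with an epsilon-greedy policy and a hand-shaped reward.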

# Helper functions

# Epsilon-greedy action selection: explore with probability eta,
# otherwise act greedily with respect to the current Q-table.
def getAction(eta, obs, partition, QTable):
    if np.random.uniform() <= eta:
        action = env.action_space.sample()
    else:
        action = getBestAction(obs, partition, QTable)
    return action

# Shaped reward based on the change in velocity between observations;
# obs = [position, velocity], and the action argument is unused.
def getReward(prev_obs, curr_obs, action):
    if (curr_obs[0] - prev_obs[0]) >= 0:
        return curr_obs[1] - prev_obs[1]
    else:
        return -1 * (curr_obs[1] - prev_obs[1])
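# In effect this rewards building momentum in the direction of travel:
# speeding up while moving right is positive, and gaining leftward speed
# (a more negative velocity) while moving left is also positive.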

# Discretize a continuous observation into a list of integer bin indices,
# one per dimension, using the bounds of the observation space.
def getState(obs, partition):
    max_obs = env.observation_space.high  # upper bound of each dimension
    min_obs = env.observation_space.low   # lower bound of each dimension
    obs_range = max_obs - min_obs
    state = []
    for index, value in enumerate(obs):  # for each observation dimension
        step = obs_range[index] / partition[index]
        threshold = min_obs[index] + step
        state_code = 0
        for _ in range(1, partition[index]):
            if value <= threshold:
                break
            else:
                state_code += 1
                threshold += step
        state.append(state_code)
    return state
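
# Example (illustrative numbers): MountainCar-v0 observations are
# [position, velocity] with position in [-1.2, 0.6] and velocity in
# [-0.07, 0.07]. With partition = [100, 2],
#   getState([-0.5, 0.01], [100, 2])  ->  [38, 1]
# since -0.5 falls in position bin 38 (bin width 1.8/100 = 0.018) and
# 0.01 falls in the upper of the two velocity bins.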

# Allocate a zero-initialized Q-table of the given shape.
def generateQTable(shape):
    return np.zeros(shape)

def getQ(state, action, QTable):
    return QTable[state[0]][state[1]][action[0]]

def setQ(new_value, state, action, QTable):
    QTable[state[0]][state[1]][action[0]] = new_value

# Maximum Q-value over all actions in the given state.
def getMaxQ(state, QTable):
    result = QTable
    for element in state:
        result = result[element]
    return np.max(result)

# Greedy action: index of the largest Q-value in the observation's state.
def getBestAction(obs, partition, QTable):
    state = getState(obs, partition)
    result = QTable
    for element in state:
        result = result[element]
    return int(np.argmax(result))

# Tabular Q-learning update with learning rate `epsilon` and discount
# factor `discount_rate`:
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
# The shaped reward from getReward is used as r; the env `reward`
# argument is accepted but unused.
def updateQTable(obs, next_obs, action, partition, epsilon, discount_rate, reward, QTable):
    state = getState(obs, partition)
    next_state = getState(next_obs, partition)
    current_q = getQ(state, action, QTable)
    max_future_q = getMaxQ(next_state, QTable)
    new_q = current_q + epsilon * (getReward(obs, next_obs, action) + (discount_rate * max_future_q) - current_q)
    setQ(new_q, state, action, QTable)
    return new_q
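
# Worked example with illustrative numbers: with epsilon = 0.5,
# discount_rate = 0.5, current_q = 0.0, max_future_q = 0.2, and a shaped
# reward of 0.01, the update gives
#   new_q = 0.0 + 0.5 * (0.01 + 0.5 * 0.2 - 0.0) = 0.055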

QTable = generateQTable([100, 2, 3])  # 100 position bins x 2 velocity bins x 3 actions
successful = []
unsuccessful = []
means = []
for episode in range(1, 500 * 50):
    eta = 0.05  # exploration probability for this episode
    obs = env.reset()
    for step in range(200):
        action_ = getAction(eta, obs, [100, 2], QTable)
        next_obs, reward, done, info = env.step(action_)
        # env.render()
        # Update on every transition, including the terminal one.
        reward2 = updateQTable(obs, next_obs, [action_], [100, 2], 0.5, 0.5, reward, QTable)
        obs = next_obs
        if done:
            break
    # Episodes that end before the 200-step limit reached the goal.
    if step < 199:
        result = "Successful"
        successful.append(step)
    else:
        result = "Unsuccessful"
        unsuccessful.append(step)
    print("Episode {} : {}".format(episode, result))
    if (episode % 500) == 0:
        # Record the mean episode length over all episodes seen so far.
        total = sum(successful) + sum(unsuccessful)
        mean = total / (len(successful) + len(unsuccessful))
        means.append(mean)
print "means"
print means
print "mean"
sum = 0
for x in range(len(means)):
sum += means[x]
print sum/len(means)
print "std_dev"
print np.std(np.asarray(means))
print "original stuff"
sum = 0
for x in range(len(successful)):
sum += successful[x]
sum2 = 0
for x in range(len(unsuccessful)):
sum2 += unsuccessful[x]
#print successful + unsuccessful
print "mean total"
print_this = ((sum + sum2)/(len(successful) + len(unsuccessful)))
print print_this
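
# The matplotlib import at the top is otherwise unused; a plausible
# (assumed, not in the original) use is plotting the per-500-episode mean
# episode lengths collected in `means`. A minimal sketch:
plt.plot(means)
plt.xlabel("500-episode block")
plt.ylabel("mean steps per episode")
plt.title("MountainCar-v0 tabular Q-learning")
plt.show()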