main.py
import gym
import matplotlib.pyplot as plt
import numpy as np

# Written against the classic Gym API (gym < 0.26): env.reset() returns just
# the observation and env.step() returns a 4-tuple (obs, reward, done, info).
env_name = "MountainCar-v0"
env = gym.make(env_name)

print(env.observation_space.high)  # [0.6  0.07]
print(env.observation_space.low)   # [-1.2 -0.07]

Alpha = 0.15   # learning rate
Gamma = 0.999  # discount factor
Eps = 1.0      # initial exploration rate (decayed during training)

# Discretize the continuous (position, velocity) observation into a 20 x 20 grid.
pos_chunk = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 20)
vel_chunk = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 20)
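# With 20 grid points per dimension, adjacent position bins are roughly
# (0.6 - (-1.2)) / 19 ≈ 0.095 apart and velocity bins roughly 0.0074 apart.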
def get_discrete_state(state):
    # np.digitize returns bin indices in 1..len(chunk) for in-range values
    # (0 only for values below the lower bound), so the Q-table must cover
    # len(chunk) + 1 indices per dimension.
    pos_dis = np.digitize(state[0], pos_chunk)
    vel_dis = np.digitize(state[1], vel_chunk)
    return (pos_dis, vel_dis)
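# Sanity check (a sketch; indices assume MountainCar-v0's documented bounds of
# position in [-1.2, 0.6] and velocity in [-0.07, 0.07]):
#   np.digitize(-1.2, pos_chunk) -> 1   (lowest in-range value)
#   np.digitize(0.6, pos_chunk)  -> 20  (== len(pos_chunk), hence the +1 below)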
def create_Q_table():
    # Allocate len(chunk) + 1 indices per dimension so every index that
    # np.digitize can return (0 through len(chunk)) has an entry; otherwise
    # states at the edge of the observation space raise a KeyError.
    Q = {}
    for i in range(len(pos_chunk) + 1):
        for j in range(len(vel_chunk) + 1):
            for action in range(3):
                Q[(i, j), action] = 0
    return Q
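# Quick size check: the table holds (20 + 1) * (20 + 1) states * 3 actions,
# i.e. 1323 entries:
#   assert len(create_Q_table()) == 21 * 21 * 3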
def get_best_action(state, Q):
    actions = np.array([Q[state, action] for action in range(3)])
    return np.argmax(actions)
def main(env, Alpha, Gamma, Eps, ep=75000, test_ep=100):
    epsilon_decay = 1 / ep
    stock_rewards = np.zeros(ep)
    Q = create_Q_table()
    # Extend the default 200-step limit so the car has time to reach the goal
    # while the policy is still poor.
    env._max_episode_steps = 1000
    for i in range(ep):
        done = False
        state = env.reset()
        state_dis = get_discrete_state(state)
        score = 0
        while not done:
            # Epsilon-greedy action selection over the three actions
            # (0 = push left, 1 = no push, 2 = push right).
            if np.random.random() > Eps:
                action = get_best_action(state_dis, Q)
            else:
                action = np.random.choice([0, 1, 2])
            new_state, reward, done, info = env.step(action)
            new_state_dis = get_discrete_state(new_state)
            score += reward
            # Q-learning update: bootstrap from the greedy action in the next state.
            new_action = get_best_action(new_state_dis, Q)
            Q[state_dis, action] += Alpha * (reward + Gamma * Q[new_state_dis, new_action] - Q[state_dis, action])
            state_dis = new_state_dis
        stock_rewards[i] = score
        if i % 100 == 0:
            print("episode :", i, "score :", score)
        # Linearly decay epsilon down to a floor of 0.001.
        Eps = max(Eps - epsilon_decay, 0.001)
    # Evaluate the greedy policy for test_ep rendered episodes (no exploration).
    for i in range(test_ep):
        done = False
        state = env.reset()
        state_dis = get_discrete_state(state)
        score = 0
        while not done:
            action = get_best_action(state_dis, Q)
            new_state, reward, done, info = env.step(action)
            env.render()
            score += reward
            state_dis = get_discrete_state(new_state)
        if i % 10 == 0:
            print("score", score)
    Visualize(ep, stock_rewards)
    env.close()
def Visualize(ep, stock_rewards):
    # Plot a 100-episode moving average of the training returns.
    window = 100
    mean_r = np.zeros(ep - window)
    for i in range(ep - window):
        mean_r[i] = np.mean(stock_rewards[i:i + window])
    plt.plot(mean_r)
    plt.xlabel("episode")
    plt.ylabel("mean return (100-episode window)")
    plt.show()

main(env, Alpha, Gamma, Eps)
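# Note: the moving-average loop in Visualize could also be written without the
# Python loop (a sketch using NumPy's convolve; output length differs by one):
#   mean_r = np.convolve(stock_rewards, np.ones(100) / 100, mode="valid")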