multi_agent_two_policy.py
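"""Train two independent DQN policies ("plane0" and "plane1") on the
AirportTowerMultiEnv multi-agent environment with Ray Tune, grid-searching
over random seeds and the number of runways."""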
import gym
import numpy as np
import ray
from ray import tune
from ray.tune import register_env

from gym_airport_tower.airport_tower_multi_env import AirportTowerMultiEnv


def env_creator(env_config):
    return AirportTowerMultiEnv(**env_config)


register_env("AirportTowerMultiEnv", env_creator)

ray.init(include_dashboard=False)
config = {
    # === Settings for Rollout Worker processes ===
    "num_workers": 7,
    # "num_gpus": 1,
    # "num_envs_per_worker": 1,
    "seed": tune.grid_search([24088626, 30953886, 20735918]),

    # === Settings for the Trainer process ===
    # Discount factor of the MDP.
    "gamma": 0.99,
    # The default learning rate.
    "lr": 0.0001,

    # === Model ===
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # The discrete supports are bounded by v_min and v_max.
    "num_atoms": 1,
    "v_min": -10.0,
    "v_max": 10.0,
    # Whether to use a noisy network.
    "noisy": False,
    # Controls the initial value of noisy nets.
    "sigma0": 0.5,
    # Whether to use dueling DQN.
    "dueling": True,
    # Dense-layer setup for each of the advantage branch and the value
    # branch in a dueling architecture.
    "hiddens": [256, 256],
    # Whether to use double DQN.
    "double_q": True,
    # N-step Q-learning.
    "n_step": 1,

    # === Prioritized replay buffer ===
    # If True, a prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for the prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from the prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Final value of beta (by default, we use a constant beta=0.4).
    "final_prioritized_replay_beta": 0.4,
    # Time steps over which the beta parameter is annealed.
    "prioritized_replay_beta_annealing_timesteps": 20000,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,

    # Callback to run before learning on a multi-agent batch of experiences.
    "before_learn_on_batch": None,
    # The intensity with which to update the model (vs collecting samples
    # from the env). If None, uses the "natural" value of:
    # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
    # `num_envs_per_worker`).
    # If provided, will make sure that the ratio between ts inserted into
    # and sampled from the buffer matches the given value.
    # Example:
    #   training_intensity=1000.0
    #   train_batch_size=250, rollout_fragment_length=1,
    #   num_workers=1 (or 0), num_envs_per_worker=1
    #   -> natural value = 250 / 1 = 250.0
    #   -> will make sure that replay+train op will be executed 4x as
    #      often as rollout+insert op (4 * 250 = 1000).
    # See rllib/agents/dqn/dqn.py::calculate_rr_weights for further details.
    "training_intensity": None,

    # === Parallelism ===
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,

    # === Exploration ===
    "explore": True,
    "exploration_config": {
        # Exploration sub-class by name or full path to module+class
        # (e.g. "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
        "type": "EpsilonGreedy",
        # Parameters for the Exploration class' constructor:
        "initial_epsilon": 1.0,
        "final_epsilon": 0.02,
        # Timesteps over which to anneal epsilon.
        "epsilon_timesteps": 950000,
    },
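    # Assuming EpsilonGreedy's default linear schedule, epsilon decays
    # roughly as: epsilon(t) = 1.0 - (1.0 - 0.02) * min(t, 950000) / 950000.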

    # === Deep Learning Framework Settings ===
    "framework": "tf2",
    "eager_tracing": True,

    # === Environment Settings ===
    "env": "AirportTowerMultiEnv",
    "horizon": 200,
    "env_config": {
        "seed": 42,  # seed gets set by Ray
        "max_planes": tune.grid_search([2]),
        "num_runways": tune.grid_search([1, 2]),
        "runway_length": tune.grid_search([3]),
        "airspace_size": tune.grid_search([(5, 5)]),
        "plane_spawn_probability_per_step": 0.3,
        "num_start_planes": 1,
        "landing_reward": 100,
        "plane_in_air_penalty": -1.0,
        "plane_on_runway_reward": 5,
        "render_env": False,
    },
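    # Note: the grid searches above (3 seeds x 2 num_runways values, the
    # remaining parameters single-valued) expand to 3 * 2 = 6 Tune trials.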

    # === Settings for Multi-Agent Environments ===
    "multiagent": {
        # Map of type MultiAgentPolicyConfigDict from policy ids to tuples
        # of (policy_cls, obs_space, act_space, config). This defines the
        # observation and action spaces of the policies and any extra config.
        "policies": {
            "plane0": (
                None,
                gym.spaces.Box(low=-1, high=4, shape=(5, 5), dtype=np.int32),
                gym.spaces.Discrete(4),
                {},
            ),
            "plane1": (
                None,
                gym.spaces.Box(low=-1, high=4, shape=(5, 5), dtype=np.int32),
                gym.spaces.Discrete(4),
                {},
            ),
        },
        # Function mapping agent ids to policy ids.
        "policy_mapping_fn": lambda agent_id: agent_id,
    },
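    # With the policy_mapping_fn above, each agent trains its own policy,
    # e.g. policy_mapping_fn("plane0") -> "plane0" and
    # policy_mapping_fn("plane1") -> "plane1".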

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that for Ape-X, metrics are already only reported for the lowest
    # epsilon workers (least random workers).
    # Set to None (or 0) for no evaluation.
    "evaluation_interval": 2,
    # Typical usage is to pass extra args to the evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting "explore=False" here
    # will result in the evaluation workers not using this optimal policy!
    "evaluation_config": {
        # Example: overriding env_config, exploration, etc.:
        # "env_config": {...},
        "explore": False,
        "env_config": {
            "seed": 666,
        },
    },
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process (only if evaluation_interval is not None). If you increase this,
    # it will increase the Ray resource usage of the trainer since evaluation
    # workers are created separately from rollout workers (used to sample data
    # for training).
    "evaluation_num_workers": 1,
}

tune.run(
    "DQN",
    stop={"episode_len_mean": 200, "timesteps_total": 1000000},
    checkpoint_at_end=True,
    num_samples=1,
    config=config,
    resume=False,
    local_dir="~/ray_results/multi_agent_two_policy",
)
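
# A trial stops once its mean episode length reaches the 200-step horizon or
# 1,000,000 timesteps have been sampled, whichever comes first; results and
# the final checkpoint of each trial are written under
# ~/ray_results/multi_agent_two_policy.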