Restyled by yapf
restyled-commits committed Feb 19, 2020
1 parent 3726881 commit 15bfe97
Showing 3 changed files with 87 additions and 86 deletions.
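For context: yapf is Google's open-source Python formatter, and the changes below are purely mechanical restyling with no behavioural change. The snippet that follows is a minimal sketch of reproducing one of these rewrites through yapf's Python API; it assumes yapf is installed and uses the default "pep8" style, since the repository's actual Restyled/yapf configuration is not part of this commit.

from yapf.yapflib.yapf_api import FormatCode

# One of the over-long lines rewritten in consts.py by this commit
# (a plain string here, so nothing needs to be importable or executed).
source = ("COLUMN_EDGE_DISTANCE = np.min([TILED_COLUMNS, "
          "np.flip(TILED_COLUMNS, axis=0)], axis=0)\n")

# FormatCode returns the reformatted source plus a flag saying whether
# anything changed; with the assumed 79-column pep8 style the call gets
# split over two lines, much like the change to consts.py shown below.
formatted, changed = FormatCode(source, style_config="pep8")
print(formatted)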
5 changes: 3 additions & 2 deletions consts.py
@@ -15,7 +15,8 @@
 TILED_COLUMNS = np.arange(TOTAL_DISKS) % WIDTH
 
 ROW_EDGE_DISTANCE = np.min([TILED_ROWS, np.flip(TILED_ROWS, axis=0)], axis=0)
-COLUMN_EDGE_DISTANCE = np.min([TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0)
+COLUMN_EDGE_DISTANCE = np.min(
+    [TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0)
 ODDS = TILED_ROWS % 2
 
 FOURS = []
@@ -69,7 +70,7 @@
 for colour in range(COLOURS):
     for row in range(HEIGHT):
         disks_in_column = row ^ (row + 1)
-        yellow_disks = 2 ** (row + 3) if colour == YELLOW else 0
+        yellow_disks = 2**(row + 3) if colour == YELLOW else 0
         row_hash = disks_in_column | yellow_disks
         for column in range(WIDTH):
             row_column_hash = row_hash << (9 * column)
72 changes: 38 additions & 34 deletions network.py
@@ -16,30 +16,31 @@ def __init__(self, scope, use_symmetry):
             [1, 2, HEIGHT, WIDTH],
         )
 
-        self.disks = tf.placeholder(
-            tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="disks"
-        )
+        self.disks = tf.placeholder(tf.float32,
+                                    shape=[None, 2, HEIGHT, WIDTH],
+                                    name="disks")
 
-        self.empty = tf.placeholder(
-            tf.float32, shape=[None, HEIGHT, WIDTH], name="empty"
-        )
+        self.empty = tf.placeholder(tf.float32,
+                                    shape=[None, HEIGHT, WIDTH],
+                                    name="empty")
         empty = tf.expand_dims(self.empty, axis=1)
 
-        self.legal_moves = tf.placeholder(
-            tf.float32, shape=[None, HEIGHT, WIDTH], name="legal_moves"
-        )
+        self.legal_moves = tf.placeholder(tf.float32,
+                                          shape=[None, HEIGHT, WIDTH],
+                                          name="legal_moves")
         legal_moves = tf.expand_dims(self.legal_moves, axis=1)
 
-        self.threats = tf.placeholder(
-            tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="threats"
-        )
+        self.threats = tf.placeholder(tf.float32,
+                                      shape=[None, 2, HEIGHT, WIDTH],
+                                      name="threats")
 
         constant_features = np.array(
             [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE],
             dtype=np.float32,
         ).reshape([1, 4, HEIGHT, WIDTH])
         batch_size = tf.shape(self.turn)[0]
-        tiled_constant_features = tf.tile(constant_features, [batch_size, 1, 1, 1])
+        tiled_constant_features = tf.tile(constant_features,
+                                          [batch_size, 1, 1, 1])
 
         feature_planes = tf.concat(
             [
@@ -55,11 +56,12 @@
 
         if use_symmetry:
             # Interleave horizontally flipped position
-            feature_planes_shape = [-1] + feature_planes.shape.as_list()[1:]
+            feature_planes_shape = [-1
+                                    ] + feature_planes.shape.as_list()[1:]
             flipped = tf.reverse(feature_planes, axis=[3])
             feature_planes = tf.reshape(
-                tf.stack([feature_planes, flipped], axis=1), feature_planes_shape
-            )
+                tf.stack([feature_planes, flipped], axis=1),
+                feature_planes_shape)
 
         with tf.name_scope("conv_layers"):
             if self.gpu_available():
@@ -106,9 +108,9 @@
                 name="final_conv",
             )
             disk_bias = tf.get_variable("disk_bias", shape=[TOTAL_DISKS])
-            self.conv_output = tf.add(
-                tf.contrib.layers.flatten(final_conv), disk_bias, name="conv_output"
-            )
+            self.conv_output = tf.add(tf.contrib.layers.flatten(final_conv),
+                                      disk_bias,
+                                      name="conv_output")
 
         self.conv_layers = [conv1, conv2, conv3, self.conv_output]
 
@@ -119,7 +121,8 @@ def gpu_available(self):
     @property
     def variables(self):
         # Add '/' to stop network-1 containing network-10 variables
-        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + "/")
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
+                                 self.scope + "/")
 
     def assign(self, other):
         return [
@@ -135,12 +138,11 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False):
 
         with tf.name_scope("policy"):
             self.temperature = tf.placeholder_with_default(
-                temperature, (), name="temperature"
-            )
+                temperature, (), name="temperature")
 
-            disk_logits = tf.divide(
-                self.conv_output, self.temperature, name="disk_logits"
-            )
+            disk_logits = tf.divide(self.conv_output,
+                                    self.temperature,
+                                    name="disk_logits")
 
             if use_symmetry:
                 # Calculate average of actual and horizontally flipped position
@@ -151,7 +153,9 @@
                 )
                 disk_logits = tf.reshape(
                     tf.reduce_mean(
-                        tf.concat([normal, tf.reverse(flipped, axis=[3])], axis=1),
+                        tf.concat(
+                            [normal, tf.reverse(flipped, axis=[3])],
+                            axis=1),
                         axis=1,
                     ),
                     [-1, TOTAL_DISKS],
@@ -161,10 +165,8 @@
             # - Legal moves have positive logits
             # - Illegal moves have -ILLEGAL_PENALTY logits
             legal_moves = tf.contrib.layers.flatten(self.legal_moves)
-            legal_disk_logits = (
-                tf.nn.relu(disk_logits) * legal_moves
-                + (legal_moves - 1) * ILLEGAL_PENALTY
-            )
+            legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves +
+                                 (legal_moves - 1) * ILLEGAL_PENALTY)
 
             self.policy = tf.nn.softmax(legal_disk_logits, name="policy")
             self.sample_move = tf.squeeze(
@@ -203,10 +205,12 @@ def __init__(self, scope, use_symmetry=False):
 
         if use_symmetry:
             # Calculate average of actual and horizontally flipped position
-            self.value = tf.reduce_mean(
-                tf.reshape(value, [-1, 2]), axis=1, name="value"
-            )
+            self.value = tf.reduce_mean(tf.reshape(value, [-1, 2]),
+                                        axis=1,
+                                        name="value")
         else:
             self.value = tf.squeeze(value, axis=1, name="value")
 
-        self.value_layers = self.conv_layers + [fully_connected, self.value]
+        self.value_layers = self.conv_layers + [
+            fully_connected, self.value
+        ]
96 changes: 46 additions & 50 deletions policy_training.py
@@ -23,42 +23,42 @@ def __init__(self, config):
         self.config = config
         self.run_dir = util.run_directory(config)
 
-        self.session = tf.Session(
-            config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
-        )
+        self.session = tf.Session(config=tf.ConfigProto(
+            gpu_options=tf.GPUOptions(allow_growth=True)))
 
         self.policy_network = PolicyNetwork("policy")
         self.policy_player = PolicyPlayer(self.policy_network, self.session)
-        util.restore_or_initialize_network(
-            self.session, self.run_dir, self.policy_network
-        )
+        util.restore_or_initialize_network(self.session, self.run_dir,
+                                           self.policy_network)
 
         # Train ops
         self.create_train_op(self.policy_network)
         self.writer = tf.summary.FileWriter(self.run_dir)
-        util.restore_or_initialize_scope(
-            self.session, self.run_dir, self.training_scope.name
-        )
+        util.restore_or_initialize_scope(self.session, self.run_dir,
+                                         self.training_scope.name)
 
         self.opponents = Opponents(
-            [RandomPlayer(), RandomThreatPlayer(), MaxThreatPlayer()]
-        )
+            [RandomPlayer(),
+             RandomThreatPlayer(),
+             MaxThreatPlayer()])
         self.opponents.restore_networks(self.session, self.run_dir)
 
     def create_train_op(self, policy_network):
         with tf.variable_scope("policy_training") as self.training_scope:
             self.move = tf.placeholder(tf.int32, shape=[None], name="move")
-            self.result = tf.placeholder(tf.float32, shape=[None], name="result")
+            self.result = tf.placeholder(tf.float32,
+                                         shape=[None],
+                                         name="result")
 
             policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH])
             move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1)
             turn = util.turn_win(policy_network.turn)
             move_probability = tf.reduce_sum(policy * move, axis=[1, 2])
 
-            result_loss = -tf.reduce_mean(tf.log(move_probability) * turn * self.result)
+            result_loss = -tf.reduce_mean(
+                tf.log(move_probability) * turn * self.result)
             entropy_regularisation = -config.entropy * tf.reduce_mean(
-                policy_network.entropy
-            )
+                policy_network.entropy)
             loss = result_loss + entropy_regularisation
 
             optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
@@ -95,17 +95,20 @@ def save(self):
 
     def play_games(self, opponent):
         # Create games
-        games = incomplete_games = [Game() for _ in range(self.config.batch_size)]
+        games = incomplete_games = [
+            Game() for _ in range(self.config.batch_size)
+        ]
 
         # Let opponent play first in half of the games
-        self.play_move(games[0 : len(games) // 2], opponent)
+        self.play_move(games[0:len(games) // 2], opponent)
         player = self.policy_player
 
         while incomplete_games:
             self.play_move(incomplete_games, player)
             player = self.policy_player if player != self.policy_player else opponent
             incomplete_games = [
-                game for game in incomplete_games if not game.position.gameover()
+                game for game in incomplete_games
+                if not game.position.gameover()
             ]
 
         return games
@@ -154,7 +157,8 @@ def train_games(self, opponent, games):
 
     def process_results(self, opponent, games, step, summary):
         win_rate = np.mean([game.policy_player_score for game in games])
-        average_moves = sum(len(game.moves) for game in games) / self.config.batch_size
+        average_moves = sum(len(game.moves)
+                            for game in games) / self.config.batch_size
         opponent_summary = tf.Summary()
         opponent_summary.value.add(
             tag=self.training_scope.name + "/" + opponent.name + "/win_rate",
@@ -170,16 +174,13 @@
 
         self.opponents.update_win_rate(opponent, win_rate)
 
-        print(
-            "Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves"
-            % (
-                step,
-                opponent.name,
-                win_rate,
-                self.opponents.win_rates[opponent],
-                average_moves,
-            )
-        )
+        print("Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves" % (
+            step,
+            opponent.name,
+            win_rate,
+            self.opponents.win_rates[opponent],
+            average_moves,
+        ))
 
     def create_new_opponent(self, name):
         # Create clone of policy_player
@@ -210,7 +211,8 @@ def decrease_win_rates(self):
 
     def update_win_rate(self, opponent, win_rate):
         # Win rate is a moving average
-        self.win_rates[opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1
+        self.win_rates[
+            opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1
 
     def all_beaten(self):
         result = True
@@ -221,32 +223,25 @@
     def choose_opponent(self):
         # More difficult opponents are chosen more often
        win_rates = np.maximum(list(self.win_rates.values()), 0.1)
-        probs = (1 / win_rates ** 2) - 1
+        probs = (1 / win_rates**2) - 1
         normalised_probs = probs / probs.sum()
-        return np.random.choice(list(self.win_rates.keys()), p=normalised_probs)
+        return np.random.choice(list(self.win_rates.keys()),
+                                p=normalised_probs)
 
     def next_network_name(self):
-        network_opponents = len(
-            [
-                opponent
-                for opponent in self.win_rates.keys()
-                if type(opponent) == PolicyPlayer
-            ]
-        )
+        network_opponents = len([
+            opponent for opponent in self.win_rates.keys()
+            if type(opponent) == PolicyPlayer
+        ])
         return "network-%d" % (network_opponents + 1)
 
     def save_opponent_stats(self, run_dir):
         with open(os.path.join(run_dir, "opponents"), "w") as f:
-            f.write(
-                "\n".join(
-                    [
-                        opponent.name + " " + str(win_rate)
-                        for opponent, win_rate in sorted(
-                            self.win_rates.items(), key=lambda x: x[1]
-                        )
-                    ]
-                )
-            )
+            f.write("\n".join([
+                opponent.name + " " + str(win_rate)
+                for opponent, win_rate in sorted(self.win_rates.items(),
+                                                 key=lambda x: x[1])
+            ]))
 
     def restore_networks(self, session, run_dir):
         opponents_file = os.path.join(run_dir, "opponents")
@@ -291,7 +286,8 @@ def move(self, move, policy_player_turn=False):
         self.positions.append(self.position)
         if self.position.gameover():
             self.result = self.position.result
-            self.policy_player_score = float(policy_player_turn) if self.result else 0.5
+            self.policy_player_score = float(
+                policy_player_turn) if self.result else 0.5
 
 
 def main(_):
