diff --git a/consts.py b/consts.py
index 4b6b9cd..54a5522 100644
--- a/consts.py
+++ b/consts.py
@@ -15,7 +15,8 @@ TILED_COLUMNS = np.arange(TOTAL_DISKS) % WIDTH
 
 ROW_EDGE_DISTANCE = np.min([TILED_ROWS, np.flip(TILED_ROWS, axis=0)], axis=0)
-COLUMN_EDGE_DISTANCE = np.min([TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0)
+COLUMN_EDGE_DISTANCE = np.min(
+    [TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0)
 ODDS = TILED_ROWS % 2
 
 FOURS = []
 
@@ -69,7 +70,7 @@ for colour in range(COLOURS):
     for row in range(HEIGHT):
         disks_in_column = row ^ (row + 1)
-        yellow_disks = 2 ** (row + 3) if colour == YELLOW else 0
+        yellow_disks = 2**(row + 3) if colour == YELLOW else 0
         row_hash = disks_in_column | yellow_disks
         for column in range(WIDTH):
             row_column_hash = row_hash << (9 * column)
diff --git a/network.py b/network.py
index c027ad6..7f2e1e4 100644
--- a/network.py
+++ b/network.py
@@ -16,30 +16,31 @@ def __init__(self, scope, use_symmetry):
             [1, 2, HEIGHT, WIDTH],
         )
 
-        self.disks = tf.placeholder(
-            tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="disks"
-        )
+        self.disks = tf.placeholder(tf.float32,
+                                    shape=[None, 2, HEIGHT, WIDTH],
+                                    name="disks")
 
-        self.empty = tf.placeholder(
-            tf.float32, shape=[None, HEIGHT, WIDTH], name="empty"
-        )
+        self.empty = tf.placeholder(tf.float32,
+                                    shape=[None, HEIGHT, WIDTH],
+                                    name="empty")
         empty = tf.expand_dims(self.empty, axis=1)
 
-        self.legal_moves = tf.placeholder(
-            tf.float32, shape=[None, HEIGHT, WIDTH], name="legal_moves"
-        )
+        self.legal_moves = tf.placeholder(tf.float32,
+                                          shape=[None, HEIGHT, WIDTH],
+                                          name="legal_moves")
         legal_moves = tf.expand_dims(self.legal_moves, axis=1)
 
-        self.threats = tf.placeholder(
-            tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="threats"
-        )
+        self.threats = tf.placeholder(tf.float32,
+                                      shape=[None, 2, HEIGHT, WIDTH],
+                                      name="threats")
 
         constant_features = np.array(
             [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE],
             dtype=np.float32,
         ).reshape([1, 4, HEIGHT, WIDTH])
         batch_size = tf.shape(self.turn)[0]
-        tiled_constant_features = tf.tile(constant_features, [batch_size, 1, 1, 1])
+        tiled_constant_features = tf.tile(constant_features,
+                                          [batch_size, 1, 1, 1])
 
         feature_planes = tf.concat(
             [
@@ -55,11 +56,12 @@ def __init__(self, scope, use_symmetry):
 
         if use_symmetry:
             # Interleave horizontally flipped position
-            feature_planes_shape = [-1] + feature_planes.shape.as_list()[1:]
+            feature_planes_shape = [-1
+                                    ] + feature_planes.shape.as_list()[1:]
             flipped = tf.reverse(feature_planes, axis=[3])
             feature_planes = tf.reshape(
-                tf.stack([feature_planes, flipped], axis=1), feature_planes_shape
-            )
+                tf.stack([feature_planes, flipped], axis=1),
+                feature_planes_shape)
 
         with tf.name_scope("conv_layers"):
             if self.gpu_available():
@@ -106,9 +108,9 @@ def __init__(self, scope, use_symmetry):
                 name="final_conv",
             )
             disk_bias = tf.get_variable("disk_bias", shape=[TOTAL_DISKS])
-            self.conv_output = tf.add(
-                tf.contrib.layers.flatten(final_conv), disk_bias, name="conv_output"
-            )
+            self.conv_output = tf.add(tf.contrib.layers.flatten(final_conv),
+                                      disk_bias,
+                                      name="conv_output")
 
         self.conv_layers = [conv1, conv2, conv3, self.conv_output]
 
@@ -119,7 +121,8 @@ def gpu_available(self):
     @property
     def variables(self):
         # Add '/' to stop network-1 containing network-10 variables
-        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + "/")
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
+                                 self.scope + "/")
 
     def assign(self, other):
         return [
@@ -135,12 +138,11 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False):
 
         with tf.name_scope("policy"):
             self.temperature = tf.placeholder_with_default(
-                temperature, (), name="temperature"
-            )
+                temperature, (), name="temperature")
 
-            disk_logits = tf.divide(
-                self.conv_output, self.temperature, name="disk_logits"
-            )
+            disk_logits = tf.divide(self.conv_output,
+                                    self.temperature,
+                                    name="disk_logits")
 
             if use_symmetry:
                 # Calculate average of actual and horizontally flipped position
@@ -151,7 +153,9 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False):
                 )
                 disk_logits = tf.reshape(
                     tf.reduce_mean(
-                        tf.concat([normal, tf.reverse(flipped, axis=[3])], axis=1),
+                        tf.concat(
+                            [normal, tf.reverse(flipped, axis=[3])],
+                            axis=1),
                         axis=1,
                     ),
                     [-1, TOTAL_DISKS],
@@ -161,10 +165,8 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False):
             # - Legal moves have positive logits
            # - Illegal moves have -ILLEGAL_PENALTY logits
             legal_moves = tf.contrib.layers.flatten(self.legal_moves)
-            legal_disk_logits = (
-                tf.nn.relu(disk_logits) * legal_moves
-                + (legal_moves - 1) * ILLEGAL_PENALTY
-            )
+            legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves +
+                                 (legal_moves - 1) * ILLEGAL_PENALTY)
 
             self.policy = tf.nn.softmax(legal_disk_logits, name="policy")
             self.sample_move = tf.squeeze(
@@ -203,10 +205,12 @@ def __init__(self, scope, use_symmetry=False):
 
             if use_symmetry:
                 # Calculate average of actual and horizontally flipped position
-                self.value = tf.reduce_mean(
-                    tf.reshape(value, [-1, 2]), axis=1, name="value"
-                )
+                self.value = tf.reduce_mean(tf.reshape(value, [-1, 2]),
+                                            axis=1,
+                                            name="value")
             else:
                 self.value = tf.squeeze(value, axis=1, name="value")
 
-        self.value_layers = self.conv_layers + [fully_connected, self.value]
+        self.value_layers = self.conv_layers + [
+            fully_connected, self.value
+        ]
diff --git a/policy_training.py b/policy_training.py
index 6ef2f14..0b18d2d 100644
--- a/policy_training.py
+++ b/policy_training.py
@@ -23,42 +23,42 @@ def __init__(self, config):
         self.config = config
         self.run_dir = util.run_directory(config)
 
-        self.session = tf.Session(
-            config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
-        )
+        self.session = tf.Session(config=tf.ConfigProto(
+            gpu_options=tf.GPUOptions(allow_growth=True)))
 
         self.policy_network = PolicyNetwork("policy")
         self.policy_player = PolicyPlayer(self.policy_network, self.session)
-        util.restore_or_initialize_network(
-            self.session, self.run_dir, self.policy_network
-        )
+        util.restore_or_initialize_network(self.session, self.run_dir,
+                                           self.policy_network)
 
         # Train ops
         self.create_train_op(self.policy_network)
 
         self.writer = tf.summary.FileWriter(self.run_dir)
-        util.restore_or_initialize_scope(
-            self.session, self.run_dir, self.training_scope.name
-        )
+        util.restore_or_initialize_scope(self.session, self.run_dir,
+                                         self.training_scope.name)
 
         self.opponents = Opponents(
-            [RandomPlayer(), RandomThreatPlayer(), MaxThreatPlayer()]
-        )
+            [RandomPlayer(),
+             RandomThreatPlayer(),
+             MaxThreatPlayer()])
         self.opponents.restore_networks(self.session, self.run_dir)
 
     def create_train_op(self, policy_network):
         with tf.variable_scope("policy_training") as self.training_scope:
             self.move = tf.placeholder(tf.int32, shape=[None], name="move")
-            self.result = tf.placeholder(tf.float32, shape=[None], name="result")
+            self.result = tf.placeholder(tf.float32,
+                                         shape=[None],
+                                         name="result")
 
             policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH])
             move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1)
             turn = util.turn_win(policy_network.turn)
             move_probability = tf.reduce_sum(policy * move, axis=[1, 2])
-            result_loss = -tf.reduce_mean(tf.log(move_probability) * turn * self.result)
+            result_loss = -tf.reduce_mean(
+                tf.log(move_probability) * turn * self.result)
 
             entropy_regularisation = -config.entropy * tf.reduce_mean(
-                policy_network.entropy
-            )
+                policy_network.entropy)
             loss = result_loss + entropy_regularisation
             optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
@@ -95,17 +95,20 @@ def save(self):
 
     def play_games(self, opponent):
         # Create games
-        games = incomplete_games = [Game() for _ in range(self.config.batch_size)]
+        games = incomplete_games = [
+            Game() for _ in range(self.config.batch_size)
+        ]
 
         # Let opponent play first in half of the games
-        self.play_move(games[0 : len(games) // 2], opponent)
+        self.play_move(games[0:len(games) // 2], opponent)
 
         player = self.policy_player
         while incomplete_games:
             self.play_move(incomplete_games, player)
             player = self.policy_player if player != self.policy_player else opponent
             incomplete_games = [
-                game for game in incomplete_games if not game.position.gameover()
+                game for game in incomplete_games
+                if not game.position.gameover()
             ]
 
         return games
@@ -154,7 +157,8 @@ def train_games(self, opponent, games):
     def process_results(self, opponent, games, step, summary):
         win_rate = np.mean([game.policy_player_score for game in games])
-        average_moves = sum(len(game.moves) for game in games) / self.config.batch_size
+        average_moves = sum(len(game.moves)
+                            for game in games) / self.config.batch_size
 
         opponent_summary = tf.Summary()
         opponent_summary.value.add(
             tag=self.training_scope.name + "/" + opponent.name + "/win_rate",
@@ -170,16 +174,13 @@ def process_results(self, opponent, games, step, summary):
 
         self.opponents.update_win_rate(opponent, win_rate)
 
-        print(
-            "Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves"
-            % (
-                step,
-                opponent.name,
-                win_rate,
-                self.opponents.win_rates[opponent],
-                average_moves,
-            )
-        )
+        print("Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves" % (
+            step,
+            opponent.name,
+            win_rate,
+            self.opponents.win_rates[opponent],
+            average_moves,
+        ))
 
     def create_new_opponent(self, name):
         # Create clone of policy_player
@@ -210,7 +211,8 @@ def decrease_win_rates(self):
 
     def update_win_rate(self, opponent, win_rate):
         # Win rate is a moving average
-        self.win_rates[opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1
+        self.win_rates[
+            opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1
 
     def all_beaten(self):
         result = True
@@ -221,32 +223,25 @@ def choose_opponent(self):
         # More difficult opponents are chosen more often
         win_rates = np.maximum(list(self.win_rates.values()), 0.1)
-        probs = (1 / win_rates ** 2) - 1
+        probs = (1 / win_rates**2) - 1
         normalised_probs = probs / probs.sum()
-        return np.random.choice(list(self.win_rates.keys()), p=normalised_probs)
+        return np.random.choice(list(self.win_rates.keys()),
+                                p=normalised_probs)
 
     def next_network_name(self):
-        network_opponents = len(
-            [
-                opponent
-                for opponent in self.win_rates.keys()
-                if type(opponent) == PolicyPlayer
-            ]
-        )
+        network_opponents = len([
+            opponent for opponent in self.win_rates.keys()
+            if type(opponent) == PolicyPlayer
+        ])
         return "network-%d" % (network_opponents + 1)
 
     def save_opponent_stats(self, run_dir):
         with open(os.path.join(run_dir, "opponents"), "w") as f:
-            f.write(
-                "\n".join(
-                    [
-                        opponent.name + " " + str(win_rate)
-                        for opponent, win_rate in sorted(
-                            self.win_rates.items(), key=lambda x: x[1]
-                        )
-                    ]
-                )
-            )
+            f.write("\n".join([
+                opponent.name + " " + str(win_rate)
+                for opponent, win_rate in sorted(self.win_rates.items(),
+                                                 key=lambda x: x[1])
+            ]))
 
     def restore_networks(self, session, run_dir):
         opponents_file = os.path.join(run_dir, "opponents")
 
@@ -291,7 +286,8 @@ def move(self, move, policy_player_turn=False):
         self.positions.append(self.position)
         if self.position.gameover():
             self.result = self.position.result
-            self.policy_player_score = float(policy_player_turn) if self.result else 0.5
+            self.policy_player_score = float(
+                policy_player_turn) if self.result else 0.5
 
 
 def main(_):