ddpg.py

"""
ddpg.py
"""
__author__ = "giorgio@ac.upc.edu"
__credits__ = "https://github.com/yanpanlau"

# import simulator as environment env()
from Environment import OmnetBalancerEnv
from Environment import OmnetLinkweightEnv
from Environment import vector_to_file
import numpy as np
import tensorflow as tf
import sys
import json

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU
from helper import setup_exp, setup_run, parser, pretty, scale


def playGame(DDPG_config, train_indicator=1):    #1 means Train, 0 means simply Run
    # SETUP STARTS HERE
    if train_indicator > 0:
        folder = setup_run(DDPG_config)
    elif train_indicator == 0:
        folder = DDPG_config['EXPERIMENT']

    if DDPG_config['RSEED'] == 0:
        DDPG_config['RSEED'] = None
    np.random.seed(DDPG_config['RSEED'])

    ACTIVE_NODES = DDPG_config['ACTIVE_NODES']

    # Generate an environment
    if DDPG_config['ENV'] == 'balancing':
        env = OmnetBalancerEnv(DDPG_config, folder)
    elif DDPG_config['ENV'] == 'label':
        env = OmnetLinkweightEnv(DDPG_config, folder)

    action_dim, state_dim = env.a_dim, env.s_dim

    MU = DDPG_config['MU']
    THETA = DDPG_config['THETA']
    SIGMA = DDPG_config['SIGMA']

    ou = OU(action_dim, MU, THETA, SIGMA)       #Ornstein-Uhlenbeck Process

    BUFFER_SIZE = DDPG_config['BUFFER_SIZE']
    BATCH_SIZE = DDPG_config['BATCH_SIZE']
    GAMMA = DDPG_config['GAMMA']
    EXPLORE = DDPG_config['EXPLORE']
    EPISODE_COUNT = DDPG_config['EPISODE_COUNT']
    MAX_STEPS = DDPG_config['MAX_STEPS']
    # explore 是 0.8
    # explore 计算出 最多实际会explore多少step  ： EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE
    if EXPLORE <= 1:
        EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE
    # SETUP ENDS HERE

    reward = 0
    done = False
    wise = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, DDPG_config)
    critic = CriticNetwork(sess, state_dim, action_dim, DDPG_config)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    ltm = ['a_h0', 'a_h1', 'a_V', 'c_w1', 'c_a1', 'c_h1', 'c_h3', 'c_V']
    layers_to_mind = {}
    L2 = {}

    for k in ltm:
        layers_to_mind[k] = 0
        L2[k] = 0

    vector_to_file(ltm, folder + 'weightsL2' + 'Log.csv', 'w')

    #Now load the weight
    try:
        actor.model.load_weights(folder + "actormodel.h5")
        critic.model.load_weights(folder + "criticmodel.h5")
        actor.target_model.load_weights(folder + "actormodel.h5")
        critic.target_model.load_weights(folder + "criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("OMNeT++ Experiment Start.")
    # initial state of simulator
    s_t = env.reset()
    loss = 0
    for i in range(EPISODE_COUNT):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0
        for j in range(MAX_STEPS):
            # 这个怎么用？？？
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            # noise_t = [1, action_dim] 全 0
            noise_t = np.zeros([1, action_dim])

            # 输入 状态， 拉成一维， 输出是 [1,action_dim]
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            # 这里会更新noise_t，大概是下面这样的。。
            # -0.019	-0.037	0.003	-0.004	-0.026	0.049	0.012	-0.001	-0.016	0.024	0.08	-0.022	-0.027	-0.009	0.054	0.026	-0.15	-0.019	-0.034	-0.019	-0.037
            if train_indicator and epsilon > 0 and (step % 1000) // 100 != 9:
                noise_t[0] = epsilon * ou.evolve()

            a = a_t_original[0]
            n = noise_t[0]
            # 满足条件(condition)，输出x，不满足输出y。
            # 把 0<a+n<1 的位置置 a+n ,别的置a-n,然后再 clip 一下
            a_t[0] = np.where((a + n > 0) & (a + n < 1), a + n, a - n).clip(min=0, max=1)

            # execute action
            # 这里 done 一直返回的都是 0 ？？？
            s_t1, r_t, done = env.step(a_t[0])

            # s_t, a_t, r_t, s_t1
            # 状态S， S下执行A， 获得的R， 进入状态S1
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer

            scale = lambda x: x
            #Do the batch update
            # 取得一组，每个是 experience = (state, action, reward, new_state, done)
            batch = buff.getBatch(BATCH_SIZE)
            states = scale(np.asarray([e[0] for e in batch]))
            actions = scale(np.asarray([e[1] for e in batch]))
            rewards = scale(np.asarray([e[2] for e in batch]))
            new_states = scale(np.asarray([e[3] for e in batch]))
            dones = np.asarray([e[4] for e in batch])

            y_t = np.zeros([len(batch), action_dim])
            # 输入 <state, action>, 输出 q
            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            if train_indicator and len(batch) >= BATCH_SIZE:
                # 获取 grad， 给 actor 训练
                loss = critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # does this give an output like train_on_batch above? NO
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
                with open(folder + 'lossLog.csv', 'a') as file:
                    file.write(pretty(loss) + '\n')

            total_reward += r_t
            s_t = s_t1

            for layer in actor.model.layers + critic.model.layers:
                if layer.name in layers_to_mind.keys():
                    L2[layer.name] = np.linalg.norm(np.ravel(layer.get_weights()[0])-layers_to_mind[layer.name])
#                     vector_to_file(np.ravel(layer.get_weights()[0]), folder + 'weights_' + layer.name + 'Log.csv', 'a')
                    layers_to_mind[layer.name] = np.ravel(layer.get_weights()[0])
#             if max(L2.values()) <= 0.02:
#                 wise = True

            if train_indicator and len(batch) >= BATCH_SIZE:
                vector_to_file([L2[x] for x in ltm], folder + 'weightsL2' + 'Log.csv', 'a')

            # 把 action 写进log，这里action用的linear的激活，然后step的时候，才softmax
            vector_to_file(a_t_original[0], folder + 'actionLog.csv', 'a')
            # 把noise_t 写进 log,
            vector_to_file(noise_t[0], folder + 'noiseLog.csv', 'a')

            if 'PRINT' in DDPG_config.keys() and DDPG_config['PRINT']:
                print("Episode", "%5d" % i, "Step", "%5d" % step, "Reward", "%.6f" % r_t)
                print("Epsilon", "%.6f" % max(epsilon, 0))

                att_ = np.split(a_t[0], ACTIVE_NODES)
                for _ in range(ACTIVE_NODES):
                    att_[_] = np.insert(att_[_], _, -1)
                att_ = np.concatenate(att_)
                print("Action\n", att_.reshape(ACTIVE_NODES, ACTIVE_NODES))
                print(max(L2, key=L2.get), pretty(max(L2.values())))

            step += 1
            if done or wise:
                break

        # 每隔2 episode 保存一次模型
        if np.mod((i+1), 2) == 0:   # writes at every 2nd episode
            if (train_indicator):
                actor.model.save_weights(folder + "actormodel.h5", overwrite=True)
                actor.model.save_weights(folder + "actormodel" + str(step) + ".h5")
                with open(folder + "actormodel.json", "w") as outfile:
                    outfile.write(actor.model.to_json(indent=4) + '\n')

                critic.model.save_weights(folder + "criticmodel.h5", overwrite=True)
                critic.model.save_weights(folder + "criticmodel" + str(step) + ".h5")
                with open(folder + "criticmodel.json", "w") as outfile:
                    outfile.write(critic.model.to_json(indent=4) + '\n')

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down
    print("Finish.")


if __name__ == "__main__":
    # VANILLA
    if len(sys.argv) == 1:
        with open('DDPG.json') as jconfig:
            DDPG_config = json.load(jconfig)
        DDPG_config['EXPERIMENT'] = setup_exp()
        playGame(DDPG_config, train_indicator=1)
    # PLAY
    elif len(sys.argv) == 3:
        # WATCH OUT: it appends to *Log.csv files
        if sys.argv[1] == 'play':
            with open(sys.argv[2] + '/' + 'DDPG.json') as jconfig:
                DDPG_config = json.load(jconfig)
            # here remove double slash at end if present
            experiment = sys.argv[2] if sys.argv[2][-1] == '/' else sys.argv[2] + '/'
            DDPG_config['EXPERIMENT'] = experiment
            playGame(DDPG_config, train_indicator=0)
    # PLAY WITH FILE TRAFFIC
    elif len(sys.argv) == 4:
        # WATCH OUT: it appends to *Log.csv files
        if sys.argv[1] == 'play':
            with open(sys.argv[2] + '/' + 'DDPG.json') as jconfig:
                DDPG_config = json.load(jconfig)
            # here remove double slash at end if present
            experiment = sys.argv[2] if sys.argv[2][-1] == '/' else sys.argv[2] + '/'
            DDPG_config['EXPERIMENT'] = experiment
#             DDPG_config['EPISODE_COUNT'] = 1
#             DDPG_config['MAX_STEPS'] = 1
            if DDPG_config['TRAFFIC'] == 'DIR:':
                DDPG_config['TRAFFIC'] += sys.argv[3]
            playGame(DDPG_config, train_indicator=0)