mjx-project · nissymori · Sep 30, 2022 · Sep 16, 2022 · Sep 16, 2022 · Sep 20, 2022
diff --git a/.gitignore b/.gitignore
@@ -25,6 +25,7 @@ dist
 workspace/suphx-reward-shaping/resources/*
 workspace/suphx-reward-shaping/trained_model/*
 workspace/suphx-reward-shaping/result/*
+workspace/suphx-reward-shaping/check/*
 .DS_Store
 .vscode/
 .python_version
diff --git a/workspace/suphx-reward-shaping/README.md b/workspace/suphx-reward-shaping/README.md
@@ -1,4 +1,11 @@
-## Suphnx-like reward shaping
+## Reward shaping
+To treat a single round as an episode in RL, the final game reward must be attributed to each round appropriately.
+We call this reward shaping. We prepare 8 NNs (NN_0, ..., NN_7), one per round, and the learning procedure is as follows:
+
+- train NN_7: input: features at the beginning of round 7, target: game reward.
+- train NN_6: input: features at the beginning of round 6, target: prediction by NN_7 on round 7.
+- ... 
+
 
 ## How to train the model
 

diff --git a/workspace/suphx-reward-shaping/tests/test_train_helper.py b/workspace/suphx-reward-shaping/tests/test_train_helper.py
@@ -6,13 +6,13 @@
 import optax
 
 sys.path.append("../")
-from train_helper import initializa_params, loss, net, plot_result, save_params, train
+from train_helper import initializa_params, load_params, loss, net, train
 from utils import to_data
 
 layer_sizes = [3, 4, 5, 4]
 feature_size = 19
 seed = jax.random.PRNGKey(42)
-save_dir = os.path.join(os.pardir, "trained_model/test_param.pickle")
+save_dir = os.path.join(os.pardir, "result/test_param.pickle")
 result_dir = os.path.join(os.pardir, "result")
 
 mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")
@@ -25,42 +25,32 @@ def test_initialize_params():
 
 def test_train():
     params = initializa_params(layer_sizes, feature_size, seed)
-    features, scores = to_data(mjxprotp_dir)
+    features, targets, scores = to_data(mjxprotp_dir)
     optimizer = optax.adam(0.05)
-    params, train_log, test_log = train(
-        params, optimizer, features, scores, features, scores, epochs=1, batch_size=1
+    params, _, _, _ = train(
+        params, optimizer, features, scores, features, scores, scores, epochs=1, batch_size=1
     )
     assert len(params) == 4
 
 
-def test_save_model():
+def test_net():
     params = initializa_params(layer_sizes, feature_size, seed)
-    features, scores = to_data(mjxprotp_dir)
-    optimizer = optax.adam(0.05)
-    params = train(params, optimizer, features, scores, features, scores, epochs=1, batch_size=1)
-    save_params(params, save_dir)
-
+    features, targets, scores = to_data(mjxprotp_dir)
+    print(net(features[0], params), scores.shape)
 
-def test_plot_result():
-    params = initializa_params(layer_sizes, feature_size, seed)
-    features, scores = to_data(mjxprotp_dir)
-    optimizer = optax.adam(0.05)
-    params = train(params, optimizer, features, scores, features, scores, epochs=1, batch_size=1)
-    plot_result(params, features, scores, result_dir)
 
-
-def test_net():
+def test_loss():
     params = initializa_params(layer_sizes, feature_size, seed)
-    features, scores = to_data(mjxprotp_dir)
-    print(net(features[0], params), features, params)
+    features, targets, scores = to_data(mjxprotp_dir)
+    print(loss(params, features, targets))
 
 
-def test_loss():
+def test_to_data():
     params = initializa_params(layer_sizes, feature_size, seed)
-    features, scores = to_data(mjxprotp_dir)
-    print(loss(params, features, scores))
+    features, targets, scores = to_data(mjxprotp_dir, params=params, round_candidate=7)
+    print(features.shape, scores.shape, targets.shape)
 
 
 if __name__ == "__main__":
-    test_net()
-    test_loss()
+    test_train()
+    test_to_data()
diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py
@@ -1,25 +1,40 @@
 import json
 import os
 import sys
+from concurrent.futures import process
 
+import numpy as np
 from google.protobuf import json_format
 
 sys.path.append("../../../")
 import mjxproto
 
 sys.path.append("../")
-from utils import _calc_wind, _preprocess_scores, to_data, to_final_game_reward
+from train_helper import initializa_params
+from utils import (
+    _calc_wind,
+    _preprocess_score,
+    _preprocess_score_inv,
+    to_data,
+    to_final_game_reward,
+)
 
 mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")
 
 
 def test_preprocess():
-    scores = [0, 100000, 200000, 300000]
-    print(_preprocess_scores(scores, 1))
-    assert _preprocess_scores(scores, 0) == [0, 3, 2, 1]
-    assert _preprocess_scores(scores, 1) == [1, 0, 3, 2]
-    assert _preprocess_scores(scores, 2) == [2, 1, 0, 3]
-    assert _preprocess_scores(scores, 3) == [3, 2, 1, 0]
+    assert _preprocess_score(90) == 1
+    assert _preprocess_score(-135) == 0
+
+
+def test_preprocess_inv():
+    """
+    Check that, with a logistic function as the activation function, the value converts back to the original score.
+    """
+    assert _preprocess_score_inv(_preprocess_score(90)) == 90
+    assert -0.0001 <= _preprocess_score_inv(_preprocess_score(0)) <= 0.0001
+    assert 44.99999 <= _preprocess_score_inv(_preprocess_score(45)) <= 45.0001
+    assert -135.00005 <= _preprocess_score_inv(_preprocess_score(-135)) <= -134.99999
 
 
 def test_calc_wind():
@@ -28,22 +43,28 @@ def test_calc_wind():
 
 
 def test_to_final_game_reward():
-    _dir = os.path.join(mjxprotp_dir, os.listdir(mjxprotp_dir)[0])
-    with open(_dir, "r") as f:
-        lines = f.readlines()
-        _dicts = [json.loads(round) for round in lines]
-        states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts]
-        assert to_final_game_reward(states) == [0.9, 0.0, -1.35, 0.45]
+    for i in range(4):
+        scores = [
+            [0.9, 0.0, -1.35, 0.45],
+            [0.0, -1.35, 0.45, 0.9],
+            [0.9, -1.35, 0.0, 0.45],
+            [-1.35, 0.9, 0.0, 0.45],
+        ]
+        _dir = os.path.join(mjxprotp_dir, os.listdir(mjxprotp_dir)[i])
+        with open(_dir, "r") as f:
+            lines = f.readlines()
+            _dicts = [json.loads(round) for round in lines]
+            states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts]
+            print(states[-1].round_terminal.final_score.tens)
+            assert to_final_game_reward(states) == scores[i]
 
 
 def test_to_data():
     num_resources = len(os.listdir(mjxprotp_dir))
-    features, scores = to_data(mjxprotp_dir)
+    features, target, scores = to_data(mjxprotp_dir, round_candidate=7)
     assert features.shape == (num_resources, 19)
     assert scores.shape == (num_resources, 4)
 
 
 if __name__ == "__main__":
     test_to_data()
-    test_to_final_game_reward()
-    test_calc_wind()