From 261fb7c131ea2f52813a1bb43c8c6d8a084eb3e0 Mon Sep 17 00:00:00 2001 From: nissymori Date: Fri, 16 Sep 2022 20:16:18 +0900 Subject: [PATCH 01/11] fix --- workspace/suphx-reward-shaping/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 9c51c644..7d4fb678 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -31,10 +31,10 @@ def to_data( lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - features = to_feature(states, round_candidates=round_candidates) - features.append(features) + feature = to_feature(states, round_candidates=round_candidates) + features.append(feature) if use_model: - scores.append(model(jnp.array(features))) + scores.append(model(jnp.array(feature))) else: scores.append(to_final_game_reward(states)) features_array: jnp.ndarray = jnp.array(features) From d5fb63a2cb31f8c82f47976680380fa11d87e67d Mon Sep 17 00:00:00 2001 From: nissymori Date: Fri, 16 Sep 2022 21:28:00 +0900 Subject: [PATCH 02/11] pass test --- .../tests/test_train_helper.py | 21 +++-- workspace/suphx-reward-shaping/train.py | 76 +++++++++++++------ .../suphx-reward-shaping/train_helper.py | 30 ++++---- workspace/suphx-reward-shaping/utils.py | 26 +++++-- 4 files changed, 102 insertions(+), 51 deletions(-) diff --git a/workspace/suphx-reward-shaping/tests/test_train_helper.py b/workspace/suphx-reward-shaping/tests/test_train_helper.py index c7260aa4..4088811d 100644 --- a/workspace/suphx-reward-shaping/tests/test_train_helper.py +++ b/workspace/suphx-reward-shaping/tests/test_train_helper.py @@ -6,13 +6,13 @@ import optax sys.path.append("../") -from train_helper import initializa_params, loss, net, plot_result, save_params, train +from train_helper import initializa_params, load_params, loss, net, plot_result, save_params, train from utils import to_data layer_sizes = [3, 4, 5, 4] feature_size = 19 seed = jax.random.PRNGKey(42) -save_dir = os.path.join(os.pardir, "trained_model/test_param.pickle") +save_dir = os.path.join(os.pardir, "result/test_param.pickle") result_dir = os.path.join(os.pardir, "result") mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources") @@ -33,26 +33,32 @@ def test_train(): assert len(params) == 4 -def test_save_model(): +def test_save_and_load(): params = initializa_params(layer_sizes, feature_size, seed) features, scores = to_data(mjxprotp_dir) optimizer = optax.adam(0.05) - params = train(params, optimizer, features, scores, features, scores, epochs=1, batch_size=1) + params, _, _ = train( + params, optimizer, features, scores, features, scores, epochs=1, batch_size=1 + ) save_params(params, save_dir) + params = load_params(save_dir) + net(features, params) def test_plot_result(): params = initializa_params(layer_sizes, feature_size, seed) features, scores = to_data(mjxprotp_dir) optimizer = optax.adam(0.05) - params = train(params, optimizer, features, scores, features, scores, epochs=1, batch_size=1) + params, _, _ = train( + params, optimizer, features, scores, features, scores, epochs=1, batch_size=1 + ) plot_result(params, features, scores, result_dir) def test_net(): params = initializa_params(layer_sizes, feature_size, seed) features, scores = to_data(mjxprotp_dir) - print(net(features[0], params), features, params) + print(net(features[0], params), scores.shape) def test_loss(): @@ -62,5 
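The one-line fix in this first patch matters because the old code appended the growing `features` list to itself rather than the per-game `feature`, so `jnp.array(features)` could never produce a rectangular `(n_games, n_features)` array. A minimal illustration (the values are made up):

```python
import jax.numpy as jnp

feature = [0.25, 0.25, 0.25, 0.25]   # one game's feature vector (illustrative)

features = []
features.append(features)            # old line: the list now contains itself,
                                     # so it can never become a numeric 2-D array

features = []
features.append(feature)             # fixed line: collect one row per game
print(jnp.array(features).shape)     # (1, 4)
```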
+68,4 @@ def test_loss(): if __name__ == "__main__": - test_net() - test_loss() + test_save_and_load() diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index 30b44242..dbf2b9d4 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -10,42 +10,74 @@ from train_helper import initializa_params, plot_result, save_params, train from utils import to_data -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("lr", help="Enter learning rate", type=float) - parser.add_argument("epochs", help="Enter epochs", type=int) - parser.add_argument("batch_size", help="Enter batch_size", type=int) - parser.add_argument("is_round_one_hot", nargs="?", default="0") - parser.add_argument("--use_saved_data", nargs="?", default="0") - parser.add_argument("--round_candidates", type=int, default=None) - parser.add_argument("--data_path", default="resources/mjxproto") - parser.add_argument("--result_path", default="result") +""" +局ごとにデータとモデルを用意するので +result/ + features1.npy, ..., features7.npy + labels1.npy, ..., labels7.npy + params1.npy, ..., params7.pickle +となることになる. +""" - args = parser.parse_args() - mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) +def save(opt, params, result_dir): + if opt.target_round: + save_dir = os.path.join(result_dir, "params" + str(opt.target_round) + ".pickle") + save_params(params, save_dir) + else: + save_dir = os.path.join(result_dir, "params" + ".pickle") + save_params(params, save_dir) - result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) +def set_dataset(opt, mjxproto_dir: str, result_dir: str): if args.use_saved_data == "0": - X, Y = to_data(mjxprotp_dir, round_candidates=[args.round_candidates]) - if args.round_candidates: - jnp.save(os.path.join(result_dir, "features" + str(args.round_candidates)), X) - jnp.save(os.path.join(result_dir, "labels" + str(args.round_candidates)), Y) + if opt.target_round != 7: + params = jnp.load( + os.path.join(result_dir, "params" + str(opt.target_round) + ".pickle") + ) + X, Y = to_data( + mjxproto_dir, round_candidates=[opt.target_round], params=params, use_model=True + ) + else: + X, Y = to_data(mjxproto_dir, round_candidates=[opt.target_round]) + if opt.target_round: + jnp.save(os.path.join(result_dir, "features" + str(opt.target_round)), X) + jnp.save(os.path.join(result_dir, "labels" + str(opt.target_round)), Y) else: jnp.save(os.path.join(result_dir, "features"), X) jnp.save(os.path.join(result_dir, "labels"), Y) else: - if args.round_candidates: + if opt.target_round: X: jnp.ndarray = jnp.load( - os.path.join(result_dir, "features" + str(args.round_candidates) + ".npy") + os.path.join(result_dir, "features" + str(opt.traget_round) + ".npy") ) Y: jnp.ndarray = jnp.load( - os.path.join(result_dir, "labels" + str(args.round_candidates) + ".npy") + os.path.join(result_dir, "labels" + str(opt.target_round) + ".npy") ) else: X: jnp.ndarray = jnp.load(os.path.join(result_dir, "features.npy")) Y: jnp.ndarray = jnp.load(os.path.join(result_dir, "labels.npy")) + return X, Y + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("lr", help="Enter learning rate", type=float) + parser.add_argument("epochs", help="Enter epochs", type=int) + parser.add_argument("batch_size", help="Enter batch_size", type=int) + parser.add_argument("is_round_one_hot", nargs="?", default="0") + parser.add_argument("--use_saved_data", 
nargs="?", default="0") + parser.add_argument("--data_path", default="resources/mjxproto") + parser.add_argument("--result_path", default="result") + parser.add_argument("--target_round", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. + + args = parser.parse_args() + + mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) + + result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) + + X, Y = set_dataset(args, mjxproto_dir, result_dir) train_x = X[: math.floor(len(X) * 0.8)] train_y = Y[: math.floor(len(X) * 0.8)] @@ -71,6 +103,6 @@ plt.legend() plt.savefig(os.path.join(result_dir, "log/leaning_curve.png")) - save_params(params, result_dir) + save(args, params, result_dir) - plot_result(params, X, Y, result_dir, round_candidates=[args.round_candidates]) + plot_result(params, X, Y, result_dir, round_candidates=[args.target_round]) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index 437de3a5..d087e504 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -1,5 +1,6 @@ import os import pickle +import sys from cProfile import label from re import I from typing import Dict, List, Optional @@ -16,6 +17,9 @@ from jax import numpy as jnp from jax import value_and_grad, vmap +sys.path.append(".") +from utils import _calc_curr_pos, _calc_wind, _create_data_for_plot, _remaining_oya, _to_one_hot + def initializa_params(layer_sizes: List[int], features: int, seed) -> Dict: """ @@ -59,7 +63,7 @@ def net(x: jnp.ndarray, params: optax.Params) -> jnp.ndarray: def loss(params: optax.Params, batched_x: jnp.ndarray, batched_y: jnp.ndarray) -> jnp.ndarray: preds = net(batched_x, params) - loss_value = optax.l2_loss(preds, batched_y) + loss_value = optax.l2_loss(preds, batched_y).mean(axis=-1) return loss_value.mean() @@ -142,10 +146,16 @@ def step(params, opt_state, batch, labels): def save_params(params: optax.Params, save_dir): - with open(save_dir + "params.pickle", "wb") as f: + with open(save_dir, "wb") as f: pickle.dump(params, f) +def load_params(save_dir): + with open(save_dir, "rb") as f: + params = pickle.load(f) + return params + + def plot_result( params: optax.Params, X, Y, result_dir, is_round_one_hot=False, round_candidates=None ): @@ -158,23 +168,11 @@ def plot_result( log_pred = [] for j in range(60): x = jnp.array(_create_data_for_plot(j * 1000, i, is_round_one_hot)) - pred = net(x, params) + pred = net(x, params) # (1, 4) log_score.append(j * 1000) - log_pred.append(pred * 100) + log_pred.append(pred[0] * 100) axes[0].plot(log_score, log_pred, label="round_" + str(i)) axes[1].plot(log_score, log_pred, ".", label="round_" + str(i)) plt.legend() save_dir = os.path.join(result_dir, "prediction_at_round" + str(i) + ".png") plt.savefig(save_dir) - - -def _create_data_for_plot(score, round, is_round_one_hot) -> List: - scores = [score / 100000] + [(100000 - score) / 300000] * 3 - wind = [1, 0, 0, 0] - oya = [1, 0, 0, 0] - if is_round_one_hot: - rounds = [0] * 8 - rounds[round] = 1 - return scores + wind + oya + rounds + [0, 0] - else: - return scores + wind + oya + [round / 7, 0, 0] diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 7d4fb678..ef8cf5a6 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -6,10 +6,8 @@ import jax import jax.numpy as jnp -import numpy as np from google.protobuf import json_format 
-sys.path.append("../../") sys.path.append("../../../") import mjxproto @@ -17,7 +15,7 @@ def to_data( - mjxprotp_dir: str, round_candidates: Optional[List[int]] = None, model=None, use_model=False + mjxprotp_dir: str, round_candidates: Optional[List[int]] = None, params=None, use_model=False ) -> Tuple[jnp.ndarray, jnp.ndarray]: """ jsonが入っているディレクトリを引数としてjax.numpyのデータセットを作る. @@ -31,10 +29,15 @@ def to_data( lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - feature = to_feature(states, round_candidates=round_candidates) + feature: List = to_feature(states, round_candidates=round_candidates) features.append(feature) if use_model: - scores.append(model(jnp.array(feature))) + x = jnp.array(features) + for i, param in enumerate(params.values()): + x = jnp.dot(x, param) + if i + 1 < len(params.values()): + x = jax.nn.relu(x) + scores.append(x) else: scores.append(to_final_game_reward(states)) features_array: jnp.ndarray = jnp.array(features) @@ -141,3 +144,16 @@ def to_final_game_reward(states: List[mjxproto.State]) -> List: sorted_scores = sorted(final_scores, reverse=True) ranks = [sorted_scores.index(final_scores[i]) for i in range(4)] return [game_rewards[i] / 100 for i in ranks] + + +def _create_data_for_plot(score, round, is_round_one_hot) -> List: + scores = [score / 100000] + [(100000 - score) / 300000] * 3 + wind = _to_one_hot(4, _calc_wind(0, round)) + oya: List[int] = _to_one_hot(4, _calc_curr_pos(0, round)) + remainning_oya = _remaining_oya(round) + if is_round_one_hot: + rounds = [0] * 8 + rounds[round] = 1 + return scores + wind + oya + remainning_oya + rounds + [0, 0] + else: + return scores + wind + oya + remainning_oya + [round / 7, 0, 0] From 795b6836866bd37575090ebae66062d493f03d46 Mon Sep 17 00:00:00 2001 From: nissymori Date: Tue, 20 Sep 2022 22:17:21 +0900 Subject: [PATCH 03/11] add --- .../tests/test_train_helper.py | 8 +++++- .../suphx-reward-shaping/tests/test_utils.py | 28 +++++++++++++------ workspace/suphx-reward-shaping/train.py | 16 +++++++---- .../suphx-reward-shaping/train_helper.py | 20 ++++++++++--- workspace/suphx-reward-shaping/utils.py | 27 +++++++++++------- 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/workspace/suphx-reward-shaping/tests/test_train_helper.py b/workspace/suphx-reward-shaping/tests/test_train_helper.py index 4088811d..f724b18e 100644 --- a/workspace/suphx-reward-shaping/tests/test_train_helper.py +++ b/workspace/suphx-reward-shaping/tests/test_train_helper.py @@ -67,5 +67,11 @@ def test_loss(): print(loss(params, features, scores)) +def test_to_data(): + params = initializa_params(layer_sizes, feature_size, seed) + features, scores = to_data(mjxprotp_dir, use_model=True, params=params) + print(features.shape, scores.shape) + + if __name__ == "__main__": - test_save_and_load() + test_to_data() diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py index e61a98d6..45e7ccf4 100644 --- a/workspace/suphx-reward-shaping/tests/test_utils.py +++ b/workspace/suphx-reward-shaping/tests/test_utils.py @@ -8,6 +8,7 @@ import mjxproto sys.path.append("../") +from train_helper import initializa_params from utils import _calc_wind, _preprocess_scores, to_data, to_final_game_reward mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources") @@ -28,12 +29,20 @@ def test_calc_wind(): def test_to_final_game_reward(): - _dir = os.path.join(mjxprotp_dir, 
os.listdir(mjxprotp_dir)[0]) - with open(_dir, "r") as f: - lines = f.readlines() - _dicts = [json.loads(round) for round in lines] - states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - assert to_final_game_reward(states) == [0.9, 0.0, -1.35, 0.45] + for i in range(4): + scores = [ + [0.9, 0.0, -1.35, 0.45], + [0.0, -1.35, 0.45, 0.9], + [0.9, -1.35, 0.0, 0.45], + [-1.35, 0.9, 0.0, 0.45], + ] + _dir = os.path.join(mjxprotp_dir, os.listdir(mjxprotp_dir)[i]) + with open(_dir, "r") as f: + lines = f.readlines() + _dicts = [json.loads(round) for round in lines] + states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] + print(states[-1].round_terminal.final_score.tens) + assert to_final_game_reward(states) == scores[i] def test_to_data(): @@ -43,7 +52,10 @@ def test_to_data(): assert scores.shape == (num_resources, 4) +def test_to_data(): + num_resources = len(os.listdir(mjxprotp_dir)) + features, scores = to_data(mjxprotp_dir) + + if __name__ == "__main__": - test_to_data() test_to_final_game_reward() - test_calc_wind() diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index dbf2b9d4..520e2a15 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -33,10 +33,14 @@ def set_dataset(opt, mjxproto_dir: str, result_dir: str): if args.use_saved_data == "0": if opt.target_round != 7: params = jnp.load( - os.path.join(result_dir, "params" + str(opt.target_round) + ".pickle") + os.path.join(result_dir, "params" + str(opt.target_round + 1) + ".pickle"), + allow_pickle=True, ) X, Y = to_data( - mjxproto_dir, round_candidates=[opt.target_round], params=params, use_model=True + mjxproto_dir, + round_candidates=[opt.target_round], + params=params, + use_model=True, ) else: X, Y = to_data(mjxproto_dir, round_candidates=[opt.target_round]) @@ -49,7 +53,7 @@ def set_dataset(opt, mjxproto_dir: str, result_dir: str): else: if opt.target_round: X: jnp.ndarray = jnp.load( - os.path.join(result_dir, "features" + str(opt.traget_round) + ".npy") + os.path.join(result_dir, "features" + str(opt.target_round) + ".npy") ) Y: jnp.ndarray = jnp.load( os.path.join(result_dir, "labels" + str(opt.target_round) + ".npy") @@ -101,8 +105,8 @@ def set_dataset(opt, mjxproto_dir: str, result_dir: str): plt.plot(train_log, label="train") plt.plot(test_log, label="val") plt.legend() - plt.savefig(os.path.join(result_dir, "log/leaning_curve.png")) + plt.savefig(os.path.join(result_dir, "log/leaning_curve" + str(args.target_round) + ".png")) save(args, params, result_dir) - - plot_result(params, X, Y, result_dir, round_candidates=[args.target_round]) + for i in range(4): + plot_result(params, X, Y, result_dir, i, round_candidates=[args.target_round]) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index d087e504..41c98d03 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -157,7 +157,13 @@ def load_params(save_dir): def plot_result( - params: optax.Params, X, Y, result_dir, is_round_one_hot=False, round_candidates=None + params: optax.Params, + X, + Y, + result_dir, + target: int, + is_round_one_hot=False, + round_candidates=None, ): fig = plt.figure(figsize=(10, 5)) axes = fig.subplots(1, 2) @@ -167,12 +173,18 @@ def plot_result( log_score = [] log_pred = [] for j in range(60): - x = jnp.array(_create_data_for_plot(j * 1000, i, is_round_one_hot)) + x = jnp.array(_create_data_for_plot(j * 
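The expected vectors in `test_to_final_game_reward` follow directly from `to_final_game_reward`: sort the four final scores, take each seat's rank, and map ranks to the fixed game rewards [90, 45, 0, -135] divided by 100. A worked example with a made-up score vector:

```python
game_rewards = [90, 45, 0, -135]

# Hypothetical final scores (tens) for seats 0..3; seat 0 finished 1st, seat 2 last.
final_scores = [48000, 25000, 7000, 20000]

sorted_scores = sorted(final_scores, reverse=True)       # [48000, 25000, 20000, 7000]
ranks = [sorted_scores.index(s) for s in final_scores]   # [0, 1, 3, 2]
rewards = [game_rewards[r] / 100 for r in ranks]         # [0.9, 0.45, -1.35, 0.0]
print(rewards)
```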
1000, i, is_round_one_hot, target)) pred = net(x, params) # (1, 4) log_score.append(j * 1000) - log_pred.append(pred[0] * 100) + log_pred.append(pred[target] * 100) axes[0].plot(log_score, log_pred, label="round_" + str(i)) + axes[0].set_title("pos=" + str(target)) + axes[0].hlines([90, 45, 0, -135], 0, 60000, "red") axes[1].plot(log_score, log_pred, ".", label="round_" + str(i)) + axes[1].set_title("pos=" + str(target)) + axes[1].hlines([90, 45, 0, -135], 0, 60000, "red") plt.legend() - save_dir = os.path.join(result_dir, "prediction_at_round" + str(i) + ".png") + save_dir = os.path.join( + result_dir, "prediction_at_round" + str(i) + "pos=" + str(target) + ".png" + ) plt.savefig(save_dir) diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index ef8cf5a6..8c453744 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -8,6 +8,7 @@ import jax.numpy as jnp from google.protobuf import json_format +sys.path.append("../../") sys.path.append("../../../") import mjxproto @@ -29,19 +30,24 @@ def to_data( lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] + if round_candidates: + if len(states) <= 7 - min(round_candidates): + continue feature: List = to_feature(states, round_candidates=round_candidates) features.append(feature) if use_model: - x = jnp.array(features) - for i, param in enumerate(params.values()): - x = jnp.dot(x, param) - if i + 1 < len(params.values()): - x = jax.nn.relu(x) - scores.append(x) + continue else: scores.append(to_final_game_reward(states)) - features_array: jnp.ndarray = jnp.array(features) scores_array: jnp.ndarray = jnp.array(scores) + features_array: jnp.ndarray = jnp.array(features) + if use_model: + x = features_array + for i, param in enumerate(params.values()): + x = jnp.dot(x, param) + if i + 1 < len(params.values()): + x = jax.nn.relu(x) + scores_array = x return features_array, scores_array @@ -53,7 +59,7 @@ def _select_one_round( """ if candidates: if min(candidates) > len(states) - 1: # 候補のと対応する局がない場合, 一番近いものを返す. 
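When a trained model is supplied, `to_data` now labels earlier rounds by running the saved parameter dict by hand: each value is used as the weight matrix of one bias-free linear layer, with ReLU between layers and a linear output. A standalone sketch of that loop (layer shapes are illustrative):

```python
import jax
import jax.numpy as jnp

def apply_params(x: jnp.ndarray, params: dict) -> jnp.ndarray:
    """Treat each dict value as one linear layer's weights, with ReLU in between."""
    for i, w in enumerate(params.values()):
        x = jnp.dot(x, w)
        if i + 1 < len(params):
            x = jax.nn.relu(x)   # hidden layers only; the last layer stays linear
    return x

key = jax.random.PRNGKey(0)
params = {
    "w0": jax.random.normal(key, (19, 32)),
    "w1": jax.random.normal(key, (32, 4)),
}
batch = jnp.ones((8, 19))                   # 8 games x 19 features
print(apply_params(batch, params).shape)    # (8, 4): one predicted value per seat
```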
- return states[len(states) - 1] + return states[len(states) - (7 - min(candidates))] idx = random.choice(candidates) return states[idx] else: @@ -146,8 +152,9 @@ def to_final_game_reward(states: List[mjxproto.State]) -> List: return [game_rewards[i] / 100 for i in ranks] -def _create_data_for_plot(score, round, is_round_one_hot) -> List: - scores = [score / 100000] + [(100000 - score) / 300000] * 3 +def _create_data_for_plot(score: int, round: int, is_round_one_hot, target: int) -> List: + scores = [(100000 - score) / 300000] * 4 + scores[target] = score / 100000 wind = _to_one_hot(4, _calc_wind(0, round)) oya: List[int] = _to_one_hot(4, _calc_curr_pos(0, round)) remainning_oya = _remaining_oya(round) From 8bdbbb426f1e9fa268acfee70e21b5c363b0dd6e Mon Sep 17 00:00:00 2001 From: nissymori Date: Tue, 20 Sep 2022 22:41:08 +0900 Subject: [PATCH 04/11] implement --- .../suphx-reward-shaping/tests/test_utils.py | 5 --- workspace/suphx-reward-shaping/utils.py | 36 +++++++++++-------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py index 45e7ccf4..3397b0e4 100644 --- a/workspace/suphx-reward-shaping/tests/test_utils.py +++ b/workspace/suphx-reward-shaping/tests/test_utils.py @@ -52,10 +52,5 @@ def test_to_data(): assert scores.shape == (num_resources, 4) -def test_to_data(): - num_resources = len(os.listdir(mjxprotp_dir)) - features, scores = to_data(mjxprotp_dir) - - if __name__ == "__main__": test_to_final_game_reward() diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 8c453744..33fcce7b 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -30,10 +30,10 @@ def to_data( lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - if round_candidates: - if len(states) <= 7 - min(round_candidates): - continue - feature: List = to_feature(states, round_candidates=round_candidates) + state = _select_one_round(states, round_candidates) + if not state: # 該当する局がない場合飛ばす. + continue + feature: List = to_feature(state, round_candidates=round_candidates) features.append(feature) if use_model: continue @@ -51,16 +51,25 @@ def to_data( return features_array, scores_array +def _filter_round(states: List[mjxproto.State], candidates: List[int]) -> List[int]: + indices = [] + for idx, state in enumerate(states): + if state.public_observation.init_score.round in candidates: + indices.append(idx) + return indices + + def _select_one_round( states: List[mjxproto.State], candidates: Optional[List[int]] = None -) -> mjxproto.State: +) -> Optional[mjxproto.State]: """ データセットに本質的で無い相関が生まれることを防ぐために一半荘につき1ペアのみを使う. """ if candidates: - if min(candidates) > len(states) - 1: # 候補のと対応する局がない場合, 一番近いものを返す. - return states[len(states) - (7 - min(candidates))] - idx = random.choice(candidates) + indices = _filter_round(states, candidates) + if len(indices) == 0: # 該当する局がない場合は飛ばす. 
+ return None + idx = random.choice(indices) return states[idx] else: idx: int = random.randint(0, len(states) - 1) @@ -114,18 +123,17 @@ def _remaining_oya(round: int): # 局終了時の残りの親の数 def to_feature( - states: List[mjxproto.State], + state: mjxproto.State, is_round_one_hot=False, round_candidates: Optional[List[int]] = None, ) -> List: """ 特徴量 = [4playerの点数, 起家の風:one-hot, 親:one-hot, 残りの親の数, 局, 本場, 詰み棒] """ - state = _select_one_round(states, candidates=round_candidates) - scores: List = [i / 100000 for i in state.round_terminal.final_score.tens] - honba: int = state.round_terminal.final_score.honba - tsumibo: int = state.round_terminal.final_score.riichi - round: int = _clip_round(state.round_terminal.final_score.round) + scores: List = [i / 100000 for i in state.public_observation.init_score.tens] + honba: int = state.public_observation.init_score.honba + tsumibo: int = state.public_observation.init_score.riichi + round: int = _clip_round(state.public_observation.init_score.round) wind: List[int] = _to_one_hot(4, _calc_wind(0, round)) # 起家の風のみを入力 oya: List[int] = _to_one_hot(4, _calc_curr_pos(0, round)) remainning_oya = _remaining_oya(round) From b0b5ed9dc39c7f7f08f3e34f72897104a509f867 Mon Sep 17 00:00:00 2001 From: nissymori Date: Tue, 20 Sep 2022 22:57:09 +0900 Subject: [PATCH 05/11] pass test --- .../suphx-reward-shaping/tests/test_utils.py | 3 +- workspace/suphx-reward-shaping/train.py | 109 ++++++++++++------ .../suphx-reward-shaping/train_helper.py | 2 - workspace/suphx-reward-shaping/utils.py | 3 +- 4 files changed, 75 insertions(+), 42 deletions(-) diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py index 3397b0e4..dddae4c7 100644 --- a/workspace/suphx-reward-shaping/tests/test_utils.py +++ b/workspace/suphx-reward-shaping/tests/test_utils.py @@ -47,10 +47,11 @@ def test_to_final_game_reward(): def test_to_data(): num_resources = len(os.listdir(mjxprotp_dir)) - features, scores = to_data(mjxprotp_dir) + features, scores = to_data(mjxprotp_dir, round_candidates=[7]) assert features.shape == (num_resources, 19) assert scores.shape == (num_resources, 4) if __name__ == "__main__": + test_to_data() test_to_final_game_reward() diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index 520e2a15..a0004492 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -20,43 +20,43 @@ """ -def save(opt, params, result_dir): - if opt.target_round: - save_dir = os.path.join(result_dir, "params" + str(opt.target_round) + ".pickle") +def save(target_round, params, result_dir): + if target_round: + save_dir = os.path.join(result_dir, "params" + str(target_round) + ".pickle") save_params(params, save_dir) else: save_dir = os.path.join(result_dir, "params" + ".pickle") save_params(params, save_dir) -def set_dataset(opt, mjxproto_dir: str, result_dir: str): - if args.use_saved_data == "0": - if opt.target_round != 7: +def set_dataset(target_round, mjxproto_dir: str, result_dir: str, use_saved_data): + if use_saved_data == "0": + if target_round != 7: params = jnp.load( - os.path.join(result_dir, "params" + str(opt.target_round + 1) + ".pickle"), + os.path.join(result_dir, "params" + str(target_round + 1) + ".pickle"), allow_pickle=True, ) X, Y = to_data( mjxproto_dir, - round_candidates=[opt.target_round], + round_candidates=[target_round], params=params, use_model=True, ) else: - X, Y = to_data(mjxproto_dir, round_candidates=[opt.target_round]) - if 
opt.target_round: - jnp.save(os.path.join(result_dir, "features" + str(opt.target_round)), X) - jnp.save(os.path.join(result_dir, "labels" + str(opt.target_round)), Y) + X, Y = to_data(mjxproto_dir, round_candidates=[target_round]) + if target_round: + jnp.save(os.path.join(result_dir, "features" + str(target_round)), X) + jnp.save(os.path.join(result_dir, "labels" + str(target_round)), Y) else: jnp.save(os.path.join(result_dir, "features"), X) jnp.save(os.path.join(result_dir, "labels"), Y) else: - if opt.target_round: + if target_round: X: jnp.ndarray = jnp.load( - os.path.join(result_dir, "features" + str(opt.target_round) + ".npy") + os.path.join(result_dir, "features" + str(target_round) + ".npy") ) Y: jnp.ndarray = jnp.load( - os.path.join(result_dir, "labels" + str(opt.target_round) + ".npy") + os.path.join(result_dir, "labels" + str(target_round) + ".npy") ) else: X: jnp.ndarray = jnp.load(os.path.join(result_dir, "features.npy")) @@ -74,39 +74,72 @@ def set_dataset(opt, mjxproto_dir: str, result_dir: str): parser.add_argument("--data_path", default="resources/mjxproto") parser.add_argument("--result_path", default="result") parser.add_argument("--target_round", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. + parser.add_argument("--at_once", type=int, default=0) args = parser.parse_args() - mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) + if args.at_once == 0: + X, Y = set_dataset(args.target_round, mjxproto_dir, result_dir, args.use_saved_data) - X, Y = set_dataset(args, mjxproto_dir, result_dir) + train_x = X[: math.floor(len(X) * 0.8)] + train_y = Y[: math.floor(len(X) * 0.8)] + test_x = X[math.floor(len(X) * 0.8) :] + test_y = Y[math.floor(len(X) * 0.8) :] - train_x = X[: math.floor(len(X) * 0.8)] - train_y = Y[: math.floor(len(X) * 0.8)] - test_x = X[math.floor(len(X) * 0.8) :] - test_y = Y[math.floor(len(X) * 0.8) :] + layer_size = [32, 32, 4] + seed = jax.random.PRNGKey(42) - layer_size = [32, 32, 4] - seed = jax.random.PRNGKey(42) + if args.is_round_one_hot == "0": + params = initializa_params(layer_size, 19, seed) + else: + params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. - if args.is_round_one_hot == "0": - params = initializa_params(layer_size, 19, seed) - else: - params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. 
+ optimizer = optax.adam(learning_rate=args.lr) + + params, train_log, test_log = train( + params, optimizer, train_x, train_y, test_x, test_y, args.epochs, args.batch_size + ) + + plt.plot(train_log, label="train") + plt.plot(test_log, label="val") + plt.legend() + plt.savefig( + os.path.join(result_dir, "log/leaning_curve" + str(args.target_round) + ".png") + ) - optimizer = optax.adam(learning_rate=args.lr) + save(args.target_round, params, result_dir) + for i in range(4): + plot_result(params, result_dir, i, round_candidates=[args.target_round]) + else: # 8局分一気に学習する + for target_round in range(8): + X, Y = set_dataset(target_round, mjxproto_dir, result_dir, args.use_saved_data) - params, train_log, test_log = train( - params, optimizer, train_x, train_y, test_x, test_y, args.epochs, args.batch_size - ) + train_x = X[: math.floor(len(X) * 0.8)] + train_y = Y[: math.floor(len(X) * 0.8)] + test_x = X[math.floor(len(X) * 0.8) :] + test_y = Y[math.floor(len(X) * 0.8) :] + + layer_size = [32, 32, 4] + seed = jax.random.PRNGKey(42) + + if args.is_round_one_hot == "0": + params = initializa_params(layer_size, 19, seed) + else: + params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. + + optimizer = optax.adam(learning_rate=args.lr) + + params, train_log, test_log = train( + params, optimizer, train_x, train_y, test_x, test_y, args.epochs, args.batch_size + ) - plt.plot(train_log, label="train") - plt.plot(test_log, label="val") - plt.legend() - plt.savefig(os.path.join(result_dir, "log/leaning_curve" + str(args.target_round) + ".png")) + plt.plot(train_log, label="train") + plt.plot(test_log, label="val") + plt.legend() + plt.savefig(os.path.join(result_dir, "log/leaning_curve" + str(target_round) + ".png")) - save(args, params, result_dir) - for i in range(4): - plot_result(params, X, Y, result_dir, i, round_candidates=[args.target_round]) + save(target_round, params, result_dir) + for i in range(4): + plot_result(params, result_dir, i, round_candidates=[target_round]) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index 41c98d03..ea8fc9a3 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -158,8 +158,6 @@ def load_params(save_dir): def plot_result( params: optax.Params, - X, - Y, result_dir, target: int, is_round_one_hot=False, diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 33fcce7b..8344a164 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -39,7 +39,6 @@ def to_data( continue else: scores.append(to_final_game_reward(states)) - scores_array: jnp.ndarray = jnp.array(scores) features_array: jnp.ndarray = jnp.array(features) if use_model: x = features_array @@ -48,6 +47,8 @@ def to_data( if i + 1 < len(params.values()): x = jax.nn.relu(x) scores_array = x + else: + scores_array: jnp.ndarray = jnp.array(scores) return features_array, scores_array From 7a6586f4b6ba165b61e796f58ea5e46a27e12564 Mon Sep 17 00:00:00 2001 From: nissymori Date: Wed, 21 Sep 2022 00:26:45 +0900 Subject: [PATCH 06/11] fix readme --- workspace/suphx-reward-shaping/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workspace/suphx-reward-shaping/README.md b/workspace/suphx-reward-shaping/README.md index 53ef36ea..59d6f5c2 100644 --- a/workspace/suphx-reward-shaping/README.md +++ b/workspace/suphx-reward-shaping/README.md @@ -1,4 +1,11 @@ 
-## Suphnx-like reward shaping +## Reward shaping +In order to handle a round as an episode in RL, it is important to align the game reward to each round appropriately. +We call it reward shaping. we will prepare 8 NN (NN_0, ...NN_7) for each round and leaning procedure is as follows + +- train NN_7: input: features at the begining of round 7, target: game reward. +- train NN_6: input: features at the begining of round 7, target: prediction by NN_7 on the data. +- ... + ## How to train the model From cd447ea0c930a0ec62a787a54c263e50d5362dbc Mon Sep 17 00:00:00 2001 From: nissymori Date: Thu, 22 Sep 2022 00:30:39 +0900 Subject: [PATCH 07/11] fix --- workspace/suphx-reward-shaping/README.md | 2 +- .../tests/test_train_helper.py | 2 +- .../suphx-reward-shaping/tests/test_utils.py | 2 +- workspace/suphx-reward-shaping/train.py | 15 ++++--- .../suphx-reward-shaping/train_helper.py | 41 +++++++++---------- workspace/suphx-reward-shaping/utils.py | 36 +++++++++------- 6 files changed, 50 insertions(+), 48 deletions(-) diff --git a/workspace/suphx-reward-shaping/README.md b/workspace/suphx-reward-shaping/README.md index 59d6f5c2..bc31e0b8 100644 --- a/workspace/suphx-reward-shaping/README.md +++ b/workspace/suphx-reward-shaping/README.md @@ -3,7 +3,7 @@ In order to handle a round as an episode in RL, it is important to align the gam We call it reward shaping. we will prepare 8 NN (NN_0, ...NN_7) for each round and leaning procedure is as follows - train NN_7: input: features at the begining of round 7, target: game reward. -- train NN_6: input: features at the begining of round 7, target: prediction by NN_7 on the data. +- train NN_6: input: features at the begining of round 7, target: prediction by NN_7 on the round7. - ... diff --git a/workspace/suphx-reward-shaping/tests/test_train_helper.py b/workspace/suphx-reward-shaping/tests/test_train_helper.py index f724b18e..1ac254df 100644 --- a/workspace/suphx-reward-shaping/tests/test_train_helper.py +++ b/workspace/suphx-reward-shaping/tests/test_train_helper.py @@ -69,7 +69,7 @@ def test_loss(): def test_to_data(): params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir, use_model=True, params=params) + features, scores = to_data(mjxprotp_dir, params=params) print(features.shape, scores.shape) diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py index dddae4c7..8088a6ee 100644 --- a/workspace/suphx-reward-shaping/tests/test_utils.py +++ b/workspace/suphx-reward-shaping/tests/test_utils.py @@ -47,7 +47,7 @@ def test_to_final_game_reward(): def test_to_data(): num_resources = len(os.listdir(mjxprotp_dir)) - features, scores = to_data(mjxprotp_dir, round_candidates=[7]) + features, scores = to_data(mjxprotp_dir, round_candidate=7) assert features.shape == (num_resources, 19) assert scores.shape == (num_resources, 4) diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index a0004492..9ef631e7 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -31,19 +31,18 @@ def save(target_round, params, result_dir): def set_dataset(target_round, mjxproto_dir: str, result_dir: str, use_saved_data): if use_saved_data == "0": - if target_round != 7: + if target_round != 7: # 南四局以外 params = jnp.load( os.path.join(result_dir, "params" + str(target_round + 1) + ".pickle"), allow_pickle=True, ) X, Y = to_data( mjxproto_dir, - round_candidates=[target_round], + 
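The README procedure, together with the `set_dataset` change that loads `params{round+1}.pickle`, amounts to a backward sweep over rounds: the round-7 model regresses on the final game reward, and every earlier model regresses on its successor's predictions. A runnable schematic of that sweep, with `build_dataset` and `fit` as hypothetical stand-ins for `utils.to_data` and `train_helper.train`:

```python
import jax
import jax.numpy as jnp

def build_dataset(rnd: int, next_params):
    """Stand-in for utils.to_data: fake features and targets for round `rnd`."""
    key = jax.random.PRNGKey(rnd)
    X = jax.random.normal(key, (16, 19))                    # 16 games x 19 features
    if next_params is None:                                  # last round: final game reward
        Y = jnp.tile(jnp.array([0.9, 0.45, 0.0, -1.35]), (16, 1))
    else:                                                    # earlier rounds: bootstrap target
        # predictions of the round rnd+1 model (applied to X here for brevity;
        # the patch evaluates it on the following round's features)
        Y = X @ next_params
    return X, Y

def fit(X, Y):
    """Stand-in for train_helper.train: closed-form ridge fit of one linear layer."""
    lam = 1e-3 * jnp.eye(X.shape[1])
    return jnp.linalg.solve(X.T @ X + lam, X.T @ Y)          # (19, 4) weight matrix

trained, next_params = {}, None
for rnd in range(7, -1, -1):                                 # 7, 6, ..., 0
    X, Y = build_dataset(rnd, next_params)
    trained[rnd] = fit(X, Y)
    next_params = trained[rnd]                               # target generator for round rnd-1
print({r: p.shape for r, p in trained.items()})
```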
round_candidate=target_round, params=params, - use_model=True, ) - else: - X, Y = to_data(mjxproto_dir, round_candidates=[target_round]) + else: # 南四局の時. + X, Y = to_data(mjxproto_dir, round_candidate=target_round) if target_round: jnp.save(os.path.join(result_dir, "features" + str(target_round)), X) jnp.save(os.path.join(result_dir, "labels" + str(target_round)), Y) @@ -111,9 +110,9 @@ def set_dataset(target_round, mjxproto_dir: str, result_dir: str, use_saved_data save(args.target_round, params, result_dir) for i in range(4): - plot_result(params, result_dir, i, round_candidates=[args.target_round]) + plot_result(params, result_dir, i, round_candidate=args.target_round) else: # 8局分一気に学習する - for target_round in range(8): + for target_round in range(7, -1, -1): X, Y = set_dataset(target_round, mjxproto_dir, result_dir, args.use_saved_data) train_x = X[: math.floor(len(X) * 0.8)] @@ -142,4 +141,4 @@ def set_dataset(target_round, mjxproto_dir: str, result_dir: str, use_saved_data save(target_round, params, result_dir) for i in range(4): - plot_result(params, result_dir, i, round_candidates=[target_round]) + plot_result(params, result_dir, i, round_candidate=target_round) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index ea8fc9a3..23386d0a 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -160,29 +160,26 @@ def plot_result( params: optax.Params, result_dir, target: int, + round_candidate=7, is_round_one_hot=False, - round_candidates=None, ): fig = plt.figure(figsize=(10, 5)) axes = fig.subplots(1, 2) - if not round_candidates: - round_candidates = [i for i in range(8)] - for i in round_candidates: # 通常の局数分 - log_score = [] - log_pred = [] - for j in range(60): - x = jnp.array(_create_data_for_plot(j * 1000, i, is_round_one_hot, target)) - pred = net(x, params) # (1, 4) - log_score.append(j * 1000) - log_pred.append(pred[target] * 100) - axes[0].plot(log_score, log_pred, label="round_" + str(i)) - axes[0].set_title("pos=" + str(target)) - axes[0].hlines([90, 45, 0, -135], 0, 60000, "red") - axes[1].plot(log_score, log_pred, ".", label="round_" + str(i)) - axes[1].set_title("pos=" + str(target)) - axes[1].hlines([90, 45, 0, -135], 0, 60000, "red") - plt.legend() - save_dir = os.path.join( - result_dir, "prediction_at_round" + str(i) + "pos=" + str(target) + ".png" - ) - plt.savefig(save_dir) + log_score = [] + log_pred = [] + for j in range(60): + x = jnp.array(_create_data_for_plot(j * 1000, round_candidate, is_round_one_hot, target)) + pred = net(x, params) # (1, 4) + log_score.append(j * 1000) + log_pred.append(pred[target] * 100) + axes[0].plot(log_score, log_pred, label="round_" + str(round_candidate)) + axes[0].set_title("pos=" + str(target)) + axes[0].hlines([90, 45, 0, -135], 0, 60000, "red") + axes[1].plot(log_score, log_pred, ".", label="round_" + str(round_candidate)) + axes[1].set_title("pos=" + str(target)) + axes[1].hlines([90, 45, 0, -135], 0, 60000, "red") + plt.legend() + save_dir = os.path.join( + result_dir, "prediction_at_round" + str(round_candidate) + "pos=" + str(target) + ".png" + ) + plt.savefig(save_dir) diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 8344a164..d14ecc9b 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -15,14 +15,13 @@ game_rewards = [90, 45, 0, -135] -def to_data( - mjxprotp_dir: str, round_candidates: 
Optional[List[int]] = None, params=None, use_model=False -) -> Tuple[jnp.ndarray, jnp.ndarray]: +def to_data(mjxprotp_dir: str, round_candidate=7, params=None) -> Tuple[jnp.ndarray, jnp.ndarray]: """ jsonが入っているディレクトリを引数としてjax.numpyのデータセットを作る. """ features: List = [] scores: List = [] + next_features = [] for _json in os.listdir(mjxprotp_dir): _json = os.path.join(mjxprotp_dir, _json) assert ".json" in _json @@ -30,18 +29,25 @@ def to_data( lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - state = _select_one_round(states, round_candidates) + state = _select_one_round(states, round_candidate) + if params: + assert 0 <= round_candidate <= 6 + next_state = _select_one_round(states, round_candidate + 1) + if not next_state: + continue + next_features.append(to_feature(next_state, round_candidate)) if not state: # 該当する局がない場合飛ばす. continue - feature: List = to_feature(state, round_candidates=round_candidates) + feature: List = to_feature(state, round_candidate=round_candidate) features.append(feature) - if use_model: + if params: continue else: scores.append(to_final_game_reward(states)) features_array: jnp.ndarray = jnp.array(features) - if use_model: - x = features_array + if params: + assert next_features + x = jnp.array(next_features) for i, param in enumerate(params.values()): x = jnp.dot(x, param) if i + 1 < len(params.values()): @@ -52,22 +58,22 @@ def to_data( return features_array, scores_array -def _filter_round(states: List[mjxproto.State], candidates: List[int]) -> List[int]: +def _filter_round(states: List[mjxproto.State], candidate: int) -> List[int]: indices = [] for idx, state in enumerate(states): - if state.public_observation.init_score.round in candidates: + if state.public_observation.init_score.round == candidate: indices.append(idx) return indices def _select_one_round( - states: List[mjxproto.State], candidates: Optional[List[int]] = None + states: List[mjxproto.State], candidate: Optional[int] = None ) -> Optional[mjxproto.State]: """ データセットに本質的で無い相関が生まれることを防ぐために一半荘につき1ペアのみを使う. """ - if candidates: - indices = _filter_round(states, candidates) + if candidate: + indices = _filter_round(states, candidate) if len(indices) == 0: # 該当する局がない場合は飛ばす. return None idx = random.choice(indices) @@ -100,7 +106,7 @@ def _to_one_hot(total_num: int, idx: int) -> List[int]: def _clip_round(round: int, lim=7) -> int: """ - 天鳳ではてんほうでは最長西4局まで行われるが何四局以降はサドンデスなので同一視. + 天鳳ではでは最長西4局まで行われるが何四局以降はサドンデスなので同一視. 
""" if round < 7: return round @@ -126,7 +132,7 @@ def _remaining_oya(round: int): # 局終了時の残りの親の数 def to_feature( state: mjxproto.State, is_round_one_hot=False, - round_candidates: Optional[List[int]] = None, + round_candidate: Optional[int] = None, ) -> List: """ 特徴量 = [4playerの点数, 起家の風:one-hot, 親:one-hot, 残りの親の数, 局, 本場, 詰み棒] From ec422162092bfc29108522a01563f2b124b9dad8 Mon Sep 17 00:00:00 2001 From: nissymori Date: Sat, 24 Sep 2022 02:55:14 +0900 Subject: [PATCH 08/11] fix --- .../tests/test_train_helper.py | 41 +-- .../suphx-reward-shaping/tests/test_utils.py | 31 ++- workspace/suphx-reward-shaping/train.py | 246 +++++++++++------- .../suphx-reward-shaping/train_helper.py | 113 ++++---- workspace/suphx-reward-shaping/utils.py | 55 ++-- 5 files changed, 271 insertions(+), 215 deletions(-) diff --git a/workspace/suphx-reward-shaping/tests/test_train_helper.py b/workspace/suphx-reward-shaping/tests/test_train_helper.py index 1ac254df..01ed1976 100644 --- a/workspace/suphx-reward-shaping/tests/test_train_helper.py +++ b/workspace/suphx-reward-shaping/tests/test_train_helper.py @@ -6,7 +6,7 @@ import optax sys.path.append("../") -from train_helper import initializa_params, load_params, loss, net, plot_result, save_params, train +from train_helper import initializa_params, load_params, loss, net, train from utils import to_data layer_sizes = [3, 4, 5, 4] @@ -25,53 +25,32 @@ def test_initialize_params(): def test_train(): params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir) + features, targets, scores = to_data(mjxprotp_dir) optimizer = optax.adam(0.05) - params, train_log, test_log = train( - params, optimizer, features, scores, features, scores, epochs=1, batch_size=1 + params, _, _, _ = train( + params, optimizer, features, scores, features, scores, scores, epochs=1, batch_size=1 ) assert len(params) == 4 -def test_save_and_load(): - params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir) - optimizer = optax.adam(0.05) - params, _, _ = train( - params, optimizer, features, scores, features, scores, epochs=1, batch_size=1 - ) - save_params(params, save_dir) - params = load_params(save_dir) - net(features, params) - - -def test_plot_result(): - params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir) - optimizer = optax.adam(0.05) - params, _, _ = train( - params, optimizer, features, scores, features, scores, epochs=1, batch_size=1 - ) - plot_result(params, features, scores, result_dir) - - def test_net(): params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir) + features, targets, scores = to_data(mjxprotp_dir) print(net(features[0], params), scores.shape) def test_loss(): params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir) - print(loss(params, features, scores)) + features, targets, scores = to_data(mjxprotp_dir) + print(loss(params, features, targets)) def test_to_data(): params = initializa_params(layer_sizes, feature_size, seed) - features, scores = to_data(mjxprotp_dir, params=params) - print(features.shape, scores.shape) + features, targets, scores = to_data(mjxprotp_dir, params=params, round_candidate=7) + print(features.shape, scores.shape, targets.shape) if __name__ == "__main__": + test_train() test_to_data() diff --git a/workspace/suphx-reward-shaping/tests/test_utils.py b/workspace/suphx-reward-shaping/tests/test_utils.py index 
8088a6ee..bd3c9fc1 100644 --- a/workspace/suphx-reward-shaping/tests/test_utils.py +++ b/workspace/suphx-reward-shaping/tests/test_utils.py @@ -1,7 +1,9 @@ import json import os import sys +from concurrent.futures import process +import numpy as np from google.protobuf import json_format sys.path.append("../../../") @@ -9,18 +11,30 @@ sys.path.append("../") from train_helper import initializa_params -from utils import _calc_wind, _preprocess_scores, to_data, to_final_game_reward +from utils import ( + _calc_wind, + _preprocess_score, + _preprocess_score_inv, + to_data, + to_final_game_reward, +) mjxprotp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources") def test_preprocess(): - scores = [0, 100000, 200000, 300000] - print(_preprocess_scores(scores, 1)) - assert _preprocess_scores(scores, 0) == [0, 3, 2, 1] - assert _preprocess_scores(scores, 1) == [1, 0, 3, 2] - assert _preprocess_scores(scores, 2) == [2, 1, 0, 3] - assert _preprocess_scores(scores, 3) == [3, 2, 1, 0] + assert _preprocess_score(90) == 1 + assert _preprocess_score(-135) == 0 + + +def test_preprocess_inv(): + """ + activation functionをlogistic関数にして, 元のスコアにうまく変換できるか + """ + assert _preprocess_score_inv(_preprocess_score(90)) == 90 + assert -0.0001 <= _preprocess_score_inv(_preprocess_score(0)) <= 0.0001 + assert 44.99999 <= _preprocess_score_inv(_preprocess_score(45)) <= 45.0001 + assert -135.00005 <= _preprocess_score_inv(_preprocess_score(-135)) <= -134.99999 def test_calc_wind(): @@ -47,11 +61,10 @@ def test_to_final_game_reward(): def test_to_data(): num_resources = len(os.listdir(mjxprotp_dir)) - features, scores = to_data(mjxprotp_dir, round_candidate=7) + features, target, scores = to_data(mjxprotp_dir, round_candidate=7) assert features.shape == (num_resources, 19) assert scores.shape == (num_resources, 4) if __name__ == "__main__": test_to_data() - test_to_final_game_reward() diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index 9ef631e7..1dedd264 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -2,13 +2,14 @@ import math import os import pickle +from typing import List, Optional import jax import jax.numpy as jnp import matplotlib.pyplot as plt import optax -from train_helper import initializa_params, plot_result, save_params, train -from utils import to_data +from train_helper import _preds_fig, _score_pred_pair, initializa_params, net, save_pickle, train +from utils import _create_data_for_plot, to_data """ 局ごとにデータとモデルを用意するので @@ -20,125 +21,178 @@ """ -def save(target_round, params, result_dir): - if target_round: - save_dir = os.path.join(result_dir, "params" + str(target_round) + ".pickle") - save_params(params, save_dir) +def file_name(type, opt) -> str: + file_name = "" + if type == "params": + file_name = "params/params" + if type == "preds": + file_name = "preds/pred" + elif type == "features": + file_name = "datasets/features" + elif type == "labesl": + file_name = "datasets/labels" + elif type == "fin_scores": + file_name = "datasets/fin_scores" + elif type == "learning_curve": + file_name = "logs/learning_curve" + elif type == "abs_loss_curve": + file_name = "logs/abs_loss_curve" + elif type == "abs_loss": + file_name = "logs/abs_loss" + elif type == "train_loss": + file_name = "logs/train_loss" + elif type == "test_loss": + file_name = "logs/test_loss" + assert file_name != "" + if opt.use_logistic: + file_name += "use_logistic" else: - save_dir = os.path.join(result_dir, "params" 
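The new `_preprocess_score` tests only constrain the endpoints (90 maps to 1, -135 maps to 0) and require `_preprocess_score_inv` to undo it within a small tolerance; the simplest map consistent with that is an affine rescale of the reward range [-135, 90] onto [0, 1]. A sketch under that assumption:

```python
def preprocess_score(score: float) -> float:
    """Rescale a game reward in [-135, 90] onto [0, 1] (assumed form)."""
    return (score + 135.0) / 225.0

def preprocess_score_inv(x: float) -> float:
    """Map a value in [0, 1] back to the original reward scale."""
    return 225.0 * x - 135.0

assert preprocess_score(90) == 1.0
assert preprocess_score(-135) == 0.0
# Round trips agree up to float error, matching the tolerances in test_preprocess_inv.
assert abs(preprocess_score_inv(preprocess_score(45)) - 45) < 1e-4
assert abs(preprocess_score_inv(preprocess_score(0))) < 1e-4
```

Squashing the targets into (0, 1) is what lets the `use_logistic` output of `net` and the regression targets share a common scale.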
+ ".pickle") - save_params(params, save_dir) - + file_name += "no_logistic" + if opt.round_wise: + assert opt.target_round >= 0 + file_name += str(opt.target_round) + return file_name + + +def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 + if opt.use_saved_data: + X: jnp.ndarray = jnp.load( + os.path.join( + result_dir, file_name("features", opt.target_round, opt.use_logistic) + ".npy" + ) + ) + Y: jnp.ndarray = jnp.load( + os.path.join( + result_dir, file_name("labels", opt.target_round, opt.use_logistic) + ".npy" + ) + ) + fin_scores: jnp.ndarray = jnp.load( + os.path.join(result_dir, file_name("fin_scores", opt.target_round) + ".npy") + ) -def set_dataset(target_round, mjxproto_dir: str, result_dir: str, use_saved_data): - if use_saved_data == "0": - if target_round != 7: # 南四局以外 + else: + if opt.target_round != 7: # 南四局以外は一つ後の局のモデルを使う. params = jnp.load( - os.path.join(result_dir, "params" + str(target_round + 1) + ".pickle"), + os.path.join( + result_dir, + file_name("params", opt.target_round, opt.use_logistic) + ".pickle", + ), allow_pickle=True, ) - X, Y = to_data( + X, Y, fin_scores = to_data( mjxproto_dir, - round_candidate=target_round, + round_candidate=opt.target_round, params=params, ) else: # 南四局の時. - X, Y = to_data(mjxproto_dir, round_candidate=target_round) - if target_round: - jnp.save(os.path.join(result_dir, "features" + str(target_round)), X) - jnp.save(os.path.join(result_dir, "labels" + str(target_round)), Y) - else: - jnp.save(os.path.join(result_dir, "features"), X) - jnp.save(os.path.join(result_dir, "labels"), Y) + X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=opt.target_round) + jnp.save(os.path.join(result_dir, file_name("features", opt)), X) + jnp.save(os.path.join(result_dir, file_name("labels", opt)), Y) + jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), Y) + return X, Y, fin_scores + + +def set_dataset_whole(mjxprotp_dir: str, result_dir: str, opt): # suphnx用 + if opt.use_saved_data: + X: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("features") + ".npy")) + Y: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("labels") + ".npy")) + fin_scores: jnp.ndarray = jnp.load( + os.path.join(result_dir, file_name("fin_scores") + ".npy") + ) else: - if target_round: - X: jnp.ndarray = jnp.load( - os.path.join(result_dir, "features" + str(target_round) + ".npy") - ) - Y: jnp.ndarray = jnp.load( - os.path.join(result_dir, "labels" + str(target_round) + ".npy") - ) - else: - X: jnp.ndarray = jnp.load(os.path.join(result_dir, "features.npy")) - Y: jnp.ndarray = jnp.load(os.path.join(result_dir, "labels.npy")) - return X, Y + X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=None) + jnp.save(os.path.join(result_dir, file_name("features", opt)), X) + jnp.save(os.path.join(result_dir, file_name("labels", opt)), Y) + jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), Y) + return X, Y, fin_scores -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("lr", help="Enter learning rate", type=float) - parser.add_argument("epochs", help="Enter epochs", type=int) - parser.add_argument("batch_size", help="Enter batch_size", type=int) - parser.add_argument("is_round_one_hot", nargs="?", default="0") - parser.add_argument("--use_saved_data", nargs="?", default="0") - parser.add_argument("--data_path", default="resources/mjxproto") - parser.add_argument("--result_path", default="result") - parser.add_argument("--target_round", type=int) # 対象となる局 e.g 
3の時は東4局のデータのみ使う. - parser.add_argument("--at_once", type=int, default=0) +def run_training(X, Y, scores, opt): + train_x = X[: math.floor(len(X) * 0.8)] + train_y = Y[: math.floor(len(X) * 0.8)] + test_x = X[math.floor(len(X) * 0.8) :] + test_y = Y[math.floor(len(X) * 0.8) :] + test_scores = scores[math.floor(len(X) * 0.8) :] - args = parser.parse_args() - mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) + assert len(Y) == len(test_scores) - result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) - if args.at_once == 0: - X, Y = set_dataset(args.target_round, mjxproto_dir, result_dir, args.use_saved_data) + layer_size = [32, 32, 4] + seed = jax.random.PRNGKey(42) - train_x = X[: math.floor(len(X) * 0.8)] - train_y = Y[: math.floor(len(X) * 0.8)] - test_x = X[math.floor(len(X) * 0.8) :] - test_y = Y[math.floor(len(X) * 0.8) :] + if opt.is_round_one_hot: + params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. + else: + params = initializa_params(layer_size, 19, seed) - layer_size = [32, 32, 4] - seed = jax.random.PRNGKey(42) + optimizer = optax.adam(learning_rate=opt.lr) - if args.is_round_one_hot == "0": - params = initializa_params(layer_size, 19, seed) - else: - params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. + params, train_log, test_log, test_abs_log = train( + params, + optimizer, + train_x, + train_y, + test_x, + test_y, + scores, + opt.epochs, + opt.batch_size, + use_logistic=opt.use_logistic, + ) + return params, train_log, test_log, test_abs_log - optimizer = optax.adam(learning_rate=args.lr) - params, train_log, test_log = train( - params, optimizer, train_x, train_y, test_x, test_y, args.epochs, args.batch_size - ) +def plot_learning_log(train_log, test_log, test_abs_log, opt, result_dir): + fig = plt.figure() + plt.plot(train_log, label="train") + plt.plot(test_log, label="val") + plt.legend() + fig.savefig(os.path.join(result_dir, file_name("learning_curve", opt) + ".png")) + fig = plt.figure() + plt.plot(test_abs_log, label="val") + plt.legend() + fig.savefig(os.path.join(result_dir, file_name("abs_loss_curve", opt) + ".png")) - plt.plot(train_log, label="train") - plt.plot(test_log, label="val") - plt.legend() - plt.savefig( - os.path.join(result_dir, "log/leaning_curve" + str(args.target_round) + ".png") - ) - save(args.target_round, params, result_dir) - for i in range(4): - plot_result(params, result_dir, i, round_candidate=args.target_round) - else: # 8局分一気に学習する - for target_round in range(7, -1, -1): - X, Y = set_dataset(target_round, mjxproto_dir, result_dir, args.use_saved_data) +def plot_result(params: optax.Params, target: int, opt, result_dir): + scores, preds = _score_pred_pair( + params, target, opt.target_round, opt.is_round_one_hot, opt.use_logistic + ) + fig = _preds_fig(scores, preds, target, opt.round_candidate) + fig.save(os.path.join(result_dir, file_name("preds", opt))) - train_x = X[: math.floor(len(X) * 0.8)] - train_y = Y[: math.floor(len(X) * 0.8)] - test_x = X[math.floor(len(X) * 0.8) :] - test_y = Y[math.floor(len(X) * 0.8) :] - layer_size = [32, 32, 4] - seed = jax.random.PRNGKey(42) +def save_learning_log(train_log, test_log, test_abs_log, opt, result_dir): + save_pickle(train_log, os.path.join(result_dir, file_name("train_loss", opt))) + save_pickle(test_log, os.path.join(result_dir, file_name("test_loss", opt))) + save_pickle(test_abs_log, os.path.join(result_dir, file_name("abs_loss", opt))) - if args.is_round_one_hot 
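The `file_name` helper added in this patch builds result paths by concatenating a per-type prefix, a `use_logistic`/`no_logistic` tag, and, for round-wise runs, the target round, with no separators. A quick trace with a hypothetical option object (only two of the kinds are shown):

```python
from types import SimpleNamespace

def result_name(kind: str, opt) -> str:
    """Mirror of train.file_name for the two kinds used below."""
    prefix = {"params": "params/params", "features": "datasets/features"}[kind]
    prefix += "use_logistic" if opt.use_logistic else "no_logistic"
    if opt.round_wise:
        prefix += str(opt.target_round)
    return prefix

opt = SimpleNamespace(use_logistic=1, round_wise=1, target_round=7)
print(result_name("params", opt))    # params/paramsuse_logistic7
print(result_name("features", opt))  # datasets/featuresuse_logistic7
```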
== "0": - params = initializa_params(layer_size, 19, seed) - else: - params = initializa_params(layer_size, 26, seed) # featureでroundがone-hotになっている. - optimizer = optax.adam(learning_rate=args.lr) +def save_params(params, opt, result_dir): + save_pickle(params, os.path.join(result_dir, file_name("params", opt))) - params, train_log, test_log = train( - params, optimizer, train_x, train_y, test_x, test_y, args.epochs, args.batch_size - ) - plt.plot(train_log, label="train") - plt.plot(test_log, label="val") - plt.legend() - plt.savefig(os.path.join(result_dir, "log/leaning_curve" + str(target_round) + ".png")) +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("lr", help="Enter learning rate", type=float) + parser.add_argument("epochs", help="Enter epochs", type=int) + parser.add_argument("batch_size", help="Enter batch_size", type=int) + parser.add_argument("is_round_one_hot", type=int, default=0) + parser.add_argument("--use_saved_data", type=int, default=0) + parser.add_argument("--data_path", default="resources/mjxproto") + parser.add_argument("--result_path", default="result") + parser.add_argument("--target_round", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. + parser.add_argument("--at_once", type=int, default=0) + parser.add_argument("--max_round", type=int, default=7) + parser.add_argument("--round_wise", type=int, default=0) # roundごとにNNを作るか(TD or suphx) + parser.add_argument("--use_logistic", type=int, default=0) # logistic関数を使うかどうか - save(target_round, params, result_dir) - for i in range(4): - plot_result(params, result_dir, i, round_candidate=target_round) + args = parser.parse_args() + mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) + result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) + X, Y, scores = set_dataset_round_wise(mjxproto_dir, result_dir, args) + params, train_log, test_log, test_abs_log = run_training(X, Y, scores, args) + save_params(params, args, result_dir) + save_learning_log(train_log, test_log, test_abs_log, args, result_dir) + plot_learning_log(train_log, test_log, test_abs_log, args, result_dir) + for i in range(4): + plot_result(params, i, args, result_dir) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index 23386d0a..673e60c1 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -18,7 +18,15 @@ from jax import value_and_grad, vmap sys.path.append(".") -from utils import _calc_curr_pos, _calc_wind, _create_data_for_plot, _remaining_oya, _to_one_hot +from utils import ( + _calc_curr_pos, + _calc_wind, + _create_data_for_plot, + _preprocess_score, + _preprocess_score_inv, + _remaining_oya, + _to_one_hot, +) def initializa_params(layer_sizes: List[int], features: int, seed) -> Dict: @@ -53,44 +61,40 @@ def relu(x: jnp.ndarray) -> jnp.ndarray: return jnp.maximum(0, x) -def net(x: jnp.ndarray, params: optax.Params) -> jnp.ndarray: +def net(x: jnp.ndarray, params: optax.Params, use_logistic=False) -> jnp.ndarray: for i, param in enumerate(params.values()): x = jnp.dot(x, param) if i + 1 < len(params.values()): x = jax.nn.relu(x) + if use_logistic: + x = jnp.exp(x) / (1 + jnp.exp(x)) return x -def loss(params: optax.Params, batched_x: jnp.ndarray, batched_y: jnp.ndarray) -> jnp.ndarray: - preds = net(batched_x, params) +def loss( + params: optax.Params, batched_x: jnp.ndarray, batched_y: jnp.ndarray, use_logistic=False +) -> 
jnp.ndarray: + preds = net(batched_x, params, use_logistic=use_logistic) loss_value = optax.l2_loss(preds, batched_y).mean(axis=-1) return loss_value.mean() -def train_one_step(params: optax.Params, opt_state, batched_dataset, optimizer, epoch): - @jax.jit - def step(params: optax.Params, opt_state, batch, labels): - loss_value, grads = jax.value_and_grad(loss)(params, batch, labels) - updates, opt_state = optimizer.update(grads, opt_state, params) - params = optax.apply_updates(params, updates) - return params, opt_state, loss_value - +def evaluate(params: optax.Params, batched_dataset, use_logistic=False) -> float: cum_loss = 0 for batched_x, batched_y in batched_dataset: - params, opt_state, loss_value = step( - params, opt_state, batched_x.numpy(), batched_y.numpy(), optimizer - ) - cum_loss += loss_value - if epoch % 100 == 0: # print MSE every 100 epochs - pred = net(batched_x[0].numpy(), params) - print(f"step {epoch}, pred {pred}, actual {batched_y[0]}") - return params, cum_loss / len(batched_dataset) + cum_loss += loss(params, batched_x.numpy(), batched_y.numpy(), use_logistic=use_logistic) + return cum_loss / len(batched_dataset) -def evaluate_one_step(params: optax.Params, batched_dataset) -> float: +def evaluate_abs( + params: optax.Params, batched_dataset, use_logistic=False +) -> float: # 前処理する前のスケールでの絶対誤差 cum_loss = 0 for batched_x, batched_y in batched_dataset: - cum_loss += loss(params, batched_x.numpy(), batched_y.numpy()) + cum_loss += jnp.abs( + _preprocess_score_inv(net(batched_x.numpy(), params, use_logistic=use_logistic)) + - batched_y.numpy() + ).mean() return cum_loss / len(batched_dataset) @@ -101,10 +105,12 @@ def train( Y_train: jnp.ndarray, X_test: jnp.ndarray, Y_test: jnp.ndarray, + Score_test: jnp.ndarray, epochs: int, batch_size: int, buffer_size=3, -) -> optax.Params: + use_logistic=False, +): """ 学習用の関数. 線形層を前提としており, バッチ処理やシャッフルのためにtensorflowを使っている. 
""" @@ -114,13 +120,16 @@ def train( ) dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)) batched_dataset_test = dataset_test.batch(batch_size, drop_remainder=True) + dataset_abs_test = tf.data.Dataset.from_tensor_slices((X_test, Score_test)) + batched_dataset_abs_test = dataset_abs_test.batch(batch_size, drop_remainder=True) opt_state = optimizer.init(params) - train_log, test_log = [], [] + train_log, test_log, test_abs_log = [], [], [] - @jax.jit - def step(params, opt_state, batch, labels): - loss_value, grads = jax.value_and_grad(loss)(params, batch, labels) + def step(params, opt_state, batch, labels, use_logistic=None): + loss_value, grads = jax.value_and_grad(loss)( + params, batch, labels, use_logistic=use_logistic + ) updates, opt_state = optimizer.update(grads, opt_state, params) params = optax.apply_updates(params, updates) return params, opt_state, loss_value @@ -129,25 +138,28 @@ def step(params, opt_state, batch, labels): cum_loss = 0 for batched_x, batched_y in batched_dataset_train: params, opt_state, loss_value = step( - params, opt_state, batched_x.numpy(), batched_y.numpy() + params, opt_state, batched_x.numpy(), batched_y.numpy(), use_logistic=use_logistic ) cum_loss += loss_value if i % 100 == 0: # print MSE every 100 epochs pred = net(batched_x[0].numpy(), params) print(f"step {i}, loss: {loss_value}, pred {pred}, actual {batched_y[0]}") mean_train_loss = cum_loss / len(batched_dataset_train) - - mean_test_loss = evaluate_one_step(params, batched_dataset_test) + mean_test_loss = evaluate(params, batched_dataset_test, use_logistic=use_logistic) + mean_abs_test_loss = evaluate_abs( + params, batched_dataset_abs_test, use_logistic=use_logistic + ) # record mean of train loss and test loss per epoch train_log.append(float(np.array(mean_train_loss).item(0))) test_log.append(float(np.array(mean_test_loss).item(0))) - return params, train_log, test_log + test_abs_log.append(float(np.array(mean_abs_test_loss).item(0))) + return params, train_log, test_log, test_abs_log -def save_params(params: optax.Params, save_dir): +def save_pickle(obs, save_dir): with open(save_dir, "wb") as f: - pickle.dump(params, f) + pickle.dump(obs, f) def load_params(save_dir): @@ -156,30 +168,25 @@ def load_params(save_dir): return params -def plot_result( - params: optax.Params, - result_dir, - target: int, - round_candidate=7, - is_round_one_hot=False, -): - fig = plt.figure(figsize=(10, 5)) - axes = fig.subplots(1, 2) - log_score = [] - log_pred = [] +def _score_pred_pair(params, target: int, round_candidate: int, is_round_one_hot, use_logistic): + scores = [] + preds = [] for j in range(60): x = jnp.array(_create_data_for_plot(j * 1000, round_candidate, is_round_one_hot, target)) - pred = net(x, params) # (1, 4) - log_score.append(j * 1000) - log_pred.append(pred[target] * 100) - axes[0].plot(log_score, log_pred, label="round_" + str(round_candidate)) + pred = net(x, params, use_logistic=use_logistic) # (1, 4) + scores.append(j * 1000) + preds.append(pred[target] * 100) + return scores, preds + + +def _preds_fig(scores, preds, target, round_candidate): + fig = plt.figure(figsize=(10, 5)) + axes = fig.subplots(1, 2) + axes[0].plot(scores, preds, label="round_" + str(round_candidate)) axes[0].set_title("pos=" + str(target)) axes[0].hlines([90, 45, 0, -135], 0, 60000, "red") - axes[1].plot(log_score, log_pred, ".", label="round_" + str(round_candidate)) + axes[1].plot(scores, preds, ".", label="round_" + str(round_candidate)) axes[1].set_title("pos=" + str(target)) 
axes[1].hlines([90, 45, 0, -135], 0, 60000, "red") plt.legend() - save_dir = os.path.join( - result_dir, "prediction_at_round" + str(round_candidate) + "pos=" + str(target) + ".png" - ) - plt.savefig(save_dir) + return fig diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index d14ecc9b..6beb1c4f 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -15,7 +15,9 @@ game_rewards = [90, 45, 0, -135] -def to_data(mjxprotp_dir: str, round_candidate=7, params=None) -> Tuple[jnp.ndarray, jnp.ndarray]: +def to_data( + mjxprotp_dir: str, round_candidate=None, params=None +) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]: """ jsonが入っているディレクトリを引数としてjax.numpyのデータセットを作る. """ @@ -29,22 +31,20 @@ def to_data(mjxprotp_dir: str, round_candidate=7, params=None) -> Tuple[jnp.ndar lines = f.readlines() _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] - state = _select_one_round(states, round_candidate) + state = _select_one_round(states, candidate=round_candidate) if params: - assert 0 <= round_candidate <= 6 - next_state = _select_one_round(states, round_candidate + 1) + assert (round_candidate != None) and (0 <= round_candidate <= 7) + next_state = _select_one_round(states, candidate=round_candidate + 1) if not next_state: continue - next_features.append(to_feature(next_state, round_candidate)) + next_features.append(to_feature(next_state, round_candidate=round_candidate)) if not state: # 該当する局がない場合飛ばす. continue feature: List = to_feature(state, round_candidate=round_candidate) features.append(feature) - if params: - continue - else: - scores.append(to_final_game_reward(states)) + scores.append(to_final_game_reward(states)) features_array: jnp.ndarray = jnp.array(features) + scores_array: jnp.ndarray = jnp.array(scores) if params: assert next_features x = jnp.array(next_features) @@ -52,10 +52,10 @@ def to_data(mjxprotp_dir: str, round_candidate=7, params=None) -> Tuple[jnp.ndar x = jnp.dot(x, param) if i + 1 < len(params.values()): x = jax.nn.relu(x) - scores_array = x + targets_array: jnp.ndarray = jnp.clip(x, -1.35, 0.9) else: - scores_array: jnp.ndarray = jnp.array(scores) - return features_array, scores_array + targets_array: jnp.ndarray = jnp.array(scores) + return (features_array, targets_array, scores_array) def _filter_round(states: List[mjxproto.State], candidate: int) -> List[int]: @@ -72,7 +72,7 @@ def _select_one_round( """ データセットに本質的で無い相関が生まれることを防ぐために一半荘につき1ペアのみを使う. """ - if candidate: + if candidate != None: indices = _filter_round(states, candidate) if len(indices) == 0: # 該当する局がない場合は飛ばす. return None @@ -114,17 +114,6 @@ def _clip_round(round: int, lim=7) -> int: return 7 -def _preprocess_scores(scores, target: int) -> List: - """ - 局終了時の点数を100000で割って自家, 下家, 対面, 上家の順に並び替える. - """ - _self: int = scores[target] / 100000 - _left: int = scores[target - 1] / 100000 - _front: int = scores[target - 2] / 100000 - _right: int = scores[target - 3] / 100000 - return [_self, _left, _front, _right] - - def _remaining_oya(round: int): # 局終了時の残りの親の数 return [2 - (round // 4 + ((round % 4) >= i)) for i in range(4)] @@ -156,6 +145,20 @@ def to_feature( return feature +def _preprocess_score(score): + """ + 局終了時の点数を100000で割って自家, 下家, 対面, 上家の順に並び替える. 
+ """ + return (score + 135) / 225 + + +def _preprocess_score_inv(processed_score): + """ + 変換したtargetを用て学習したNNの出力を元々のscoreの範囲に変換 + """ + return 225 * (processed_score) - 135 + + def to_final_game_reward(states: List[mjxproto.State]) -> List: """ 順位点. 起家から順番に. 4次元. @@ -168,8 +171,8 @@ def to_final_game_reward(states: List[mjxproto.State]) -> List: def _create_data_for_plot(score: int, round: int, is_round_one_hot, target: int) -> List: - scores = [(100000 - score) / 300000] * 4 - scores[target] = score / 100000 + scores = [_preprocess_score((100000 - score) / 3)] * 4 + scores[target] = _preprocess_score(score) wind = _to_one_hot(4, _calc_wind(0, round)) oya: List[int] = _to_one_hot(4, _calc_curr_pos(0, round)) remainning_oya = _remaining_oya(round) From ba4147832007eaaa59ff1b13bf58afde7b5f58f2 Mon Sep 17 00:00:00 2001 From: nissymori Date: Thu, 29 Sep 2022 13:34:41 +0900 Subject: [PATCH 09/11] fix --- workspace/suphx-reward-shaping/train.py | 78 ++++++++++--------- workspace/suphx-reward-shaping/train.sh | 24 ++++++ .../suphx-reward-shaping/train_helper.py | 6 +- workspace/suphx-reward-shaping/utils.py | 23 +++--- 4 files changed, 82 insertions(+), 49 deletions(-) create mode 100755 workspace/suphx-reward-shaping/train.sh diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index 1dedd264..b1fdbbea 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -21,7 +21,7 @@ """ -def file_name(type, opt) -> str: +def file_name(type, opt, slide_round=False) -> str: file_name = "" if type == "params": file_name = "params/params" @@ -29,7 +29,7 @@ def file_name(type, opt) -> str: file_name = "preds/pred" elif type == "features": file_name = "datasets/features" - elif type == "labesl": + elif type == "labels": file_name = "datasets/labels" elif type == "fin_scores": file_name = "datasets/fin_scores" @@ -45,29 +45,24 @@ def file_name(type, opt) -> str: file_name = "logs/test_loss" assert file_name != "" if opt.use_logistic: - file_name += "use_logistic" + file_name += "_use_logistic_" else: - file_name += "no_logistic" + file_name += "_no_logistic_" if opt.round_wise: assert opt.target_round >= 0 - file_name += str(opt.target_round) + if slide_round: + file_name += str(opt.target_round + 1) + else: + file_name += str(opt.target_round) return file_name def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 - if opt.use_saved_data: - X: jnp.ndarray = jnp.load( - os.path.join( - result_dir, file_name("features", opt.target_round, opt.use_logistic) + ".npy" - ) - ) - Y: jnp.ndarray = jnp.load( - os.path.join( - result_dir, file_name("labels", opt.target_round, opt.use_logistic) + ".npy" - ) - ) + if opt.use_saved_data != 0: + X: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("features", opt) + ".npy")) + Y: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("labels", opt) + ".npy")) fin_scores: jnp.ndarray = jnp.load( - os.path.join(result_dir, file_name("fin_scores", opt.target_round) + ".npy") + os.path.join(result_dir, file_name("fin_scores", opt) + ".npy") ) else: @@ -75,7 +70,7 @@ def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 params = jnp.load( os.path.join( result_dir, - file_name("params", opt.target_round, opt.use_logistic) + ".pickle", + file_name("params", opt, slide_round=True) + ".pickle", ), allow_pickle=True, ) @@ -92,10 +87,10 @@ def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 return X, Y, fin_scores -def 
set_dataset_whole(mjxprotp_dir: str, result_dir: str, opt): # suphnx用 +def set_dataset_whole(mjxproto_dir: str, result_dir: str, opt): # suphnx用 if opt.use_saved_data: - X: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("features") + ".npy")) - Y: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("labels") + ".npy")) + X: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("features", opt) + ".npy")) + Y: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("labels", opt) + ".npy")) fin_scores: jnp.ndarray = jnp.load( os.path.join(result_dir, file_name("fin_scores") + ".npy") ) @@ -114,7 +109,7 @@ def run_training(X, Y, scores, opt): test_y = Y[math.floor(len(X) * 0.8) :] test_scores = scores[math.floor(len(X) * 0.8) :] - assert len(Y) == len(test_scores) + assert len(test_y) == len(test_scores) layer_size = [32, 32, 4] seed = jax.random.PRNGKey(42) @@ -133,7 +128,7 @@ def run_training(X, Y, scores, opt): train_y, test_x, test_y, - scores, + test_scores, opt.epochs, opt.batch_size, use_logistic=opt.use_logistic, @@ -151,14 +146,16 @@ def plot_learning_log(train_log, test_log, test_abs_log, opt, result_dir): plt.plot(test_abs_log, label="val") plt.legend() fig.savefig(os.path.join(result_dir, file_name("abs_loss_curve", opt) + ".png")) + plt.close() -def plot_result(params: optax.Params, target: int, opt, result_dir): +def plot_result(params: optax.Params, target_pos: int, target_round: int, opt, result_dir): scores, preds = _score_pred_pair( - params, target, opt.target_round, opt.is_round_one_hot, opt.use_logistic + params, target_pos, target_round, opt.is_round_one_hot, opt.use_logistic ) - fig = _preds_fig(scores, preds, target, opt.round_candidate) - fig.save(os.path.join(result_dir, file_name("preds", opt))) + fig = _preds_fig(scores, preds, target_pos, target_round) + fig.savefig(os.path.join(result_dir, file_name("preds", opt))) + plt.close() def save_learning_log(train_log, test_log, test_abs_log, opt, result_dir): @@ -168,7 +165,7 @@ def save_learning_log(train_log, test_log, test_abs_log, opt, result_dir): def save_params(params, opt, result_dir): - save_pickle(params, os.path.join(result_dir, file_name("params", opt))) + save_pickle(params, os.path.join(result_dir, file_name("params", opt) + ".pickle")) if __name__ == "__main__": @@ -176,23 +173,34 @@ def save_params(params, opt, result_dir): parser.add_argument("lr", help="Enter learning rate", type=float) parser.add_argument("epochs", help="Enter epochs", type=int) parser.add_argument("batch_size", help="Enter batch_size", type=int) - parser.add_argument("is_round_one_hot", type=int, default=0) + parser.add_argument("--is_round_one_hot", nargs="?", type=int, default=0) parser.add_argument("--use_saved_data", type=int, default=0) parser.add_argument("--data_path", default="resources/mjxproto") parser.add_argument("--result_path", default="result") - parser.add_argument("--target_round", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. - parser.add_argument("--at_once", type=int, default=0) - parser.add_argument("--max_round", type=int, default=7) + parser.add_argument("--target_round", nargs="?", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. 
parser.add_argument("--round_wise", type=int, default=0) # roundごとにNNを作るか(TD or suphx) parser.add_argument("--use_logistic", type=int, default=0) # logistic関数を使うかどうか args = parser.parse_args() mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) - X, Y, scores = set_dataset_round_wise(mjxproto_dir, result_dir, args) + print( + "start_training, round_wise: {}, use_logistic: {}".format( + args.round_wise, args.use_logistic + ) + ) + if args.round_wise: + X, Y, scores = set_dataset_round_wise(mjxproto_dir, result_dir, args) + else: + X, Y, scores = set_dataset_whole(mjxproto_dir, result_dir, args) params, train_log, test_log, test_abs_log = run_training(X, Y, scores, args) save_params(params, args, result_dir) save_learning_log(train_log, test_log, test_abs_log, args, result_dir) plot_learning_log(train_log, test_log, test_abs_log, args, result_dir) - for i in range(4): - plot_result(params, i, args, result_dir) + if args.round_wise == 0: + for round in range(8): + for i in range(4): + plot_result(params, i, round, args, result_dir) + else: + for i in range(4): + plot_result(params, i, args.target_round, args, result_dir) diff --git a/workspace/suphx-reward-shaping/train.sh b/workspace/suphx-reward-shaping/train.sh new file mode 100755 index 00000000..ac2dce10 --- /dev/null +++ b/workspace/suphx-reward-shaping/train.sh @@ -0,0 +1,24 @@ +export result_path="result" +export data_path="resources/mjxproto" +export batch_size=128 +export lr=0.001 +export epochs=20 + + +# suphx no logistic +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=0 --use_saved_data=0 + +# suphx use logistic +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=1 --use_saved_data=0 + +# TD no logistic +for round in 7 6 5 4 3 2 1 0 +do +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=0 --target_round=$round --use_saved_data=0 +done + +# TD logistic +for round in 7 6 5 4 3 2 1 0 +do +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=1 --target_round=$round --use_saved_data=0 +done diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index 673e60c1..05f9eef9 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -108,7 +108,7 @@ def train( Score_test: jnp.ndarray, epochs: int, batch_size: int, - buffer_size=3, + buffer_size=1, use_logistic=False, ): """ @@ -123,7 +123,6 @@ def train( dataset_abs_test = tf.data.Dataset.from_tensor_slices((X_test, Score_test)) batched_dataset_abs_test = dataset_abs_test.batch(batch_size, drop_remainder=True) opt_state = optimizer.init(params) - train_log, test_log, test_abs_log = [], [], [] def step(params, opt_state, batch, labels, use_logistic=None): @@ -149,7 +148,6 @@ def step(params, opt_state, batch, labels, use_logistic=None): mean_abs_test_loss = evaluate_abs( params, batched_dataset_abs_test, use_logistic=use_logistic ) - # record mean of train loss and test loss per epoch train_log.append(float(np.array(mean_train_loss).item(0))) test_log.append(float(np.array(mean_test_loss).item(0))) @@ -175,7 +173,7 @@ def _score_pred_pair(params, target: int, round_candidate: int, 
is_round_one_hot x = jnp.array(_create_data_for_plot(j * 1000, round_candidate, is_round_one_hot, target)) pred = net(x, params, use_logistic=use_logistic) # (1, 4) scores.append(j * 1000) - preds.append(pred[target] * 100) + preds.append(pred[target] * 255 - 135) return scores, preds diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 6beb1c4f..53d7645b 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -22,13 +22,15 @@ def to_data( jsonが入っているディレクトリを引数としてjax.numpyのデータセットを作る. """ features: List = [] + targets: List = [] scores: List = [] next_features = [] for _json in os.listdir(mjxprotp_dir): _json = os.path.join(mjxprotp_dir, _json) - assert ".json" in _json - with open(_json, "r") as f: + with open(_json, errors="ignore") as f: lines = f.readlines() + if len(lines) == 0: + continue _dicts = [json.loads(round) for round in lines] states = [json_format.ParseDict(d, mjxproto.State()) for d in _dicts] state = _select_one_round(states, candidate=round_candidate) @@ -42,7 +44,8 @@ def to_data( continue feature: List = to_feature(state, round_candidate=round_candidate) features.append(feature) - scores.append(to_final_game_reward(states)) + targets.append(to_final_game_reward(states)) + scores.append(list(map(_preprocess_score_inv, to_final_game_reward(states)))) features_array: jnp.ndarray = jnp.array(features) scores_array: jnp.ndarray = jnp.array(scores) if params: @@ -52,9 +55,9 @@ def to_data( x = jnp.dot(x, param) if i + 1 < len(params.values()): x = jax.nn.relu(x) - targets_array: jnp.ndarray = jnp.clip(x, -1.35, 0.9) + targets_array: jnp.ndarray = x else: - targets_array: jnp.ndarray = jnp.array(scores) + targets_array: jnp.ndarray = jnp.array(targets) return (features_array, targets_array, scores_array) @@ -79,8 +82,8 @@ def _select_one_round( idx = random.choice(indices) return states[idx] else: - idx: int = random.randint(0, len(states) - 1) - return states[idx] + state: mjxproto.State = random.choice(states) + return state def _calc_curr_pos(init_pos: int, round: int) -> int: @@ -167,12 +170,12 @@ def to_final_game_reward(states: List[mjxproto.State]) -> List: final_scores = final_state.round_terminal.final_score.tens sorted_scores = sorted(final_scores, reverse=True) ranks = [sorted_scores.index(final_scores[i]) for i in range(4)] - return [game_rewards[i] / 100 for i in ranks] + return [(game_rewards[i] + 135) / 225 for i in ranks] def _create_data_for_plot(score: int, round: int, is_round_one_hot, target: int) -> List: - scores = [_preprocess_score((100000 - score) / 3)] * 4 - scores[target] = _preprocess_score(score) + scores = [(100000 - score) / 300000] * 4 + scores[target] = score / 100000 wind = _to_one_hot(4, _calc_wind(0, round)) oya: List[int] = _to_one_hot(4, _calc_curr_pos(0, round)) remainning_oya = _remaining_oya(round) From 863000c8978b9d49c0fdc41f1ec156d97abeef6c Mon Sep 17 00:00:00 2001 From: nissymori Date: Fri, 30 Sep 2022 20:31:06 +0900 Subject: [PATCH 10/11] fix --- workspace/suphx-reward-shaping/train.py | 88 ++++++++++++++----- workspace/suphx-reward-shaping/train.sh | 25 +++--- .../suphx-reward-shaping/train_helper.py | 23 ++--- workspace/suphx-reward-shaping/utils.py | 10 ++- 4 files changed, 104 insertions(+), 42 deletions(-) diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index b1fdbbea..97200b0f 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -48,8 +48,7 
@@ def file_name(type, opt, slide_round=False) -> str: file_name += "_use_logistic_" else: file_name += "_no_logistic_" - if opt.round_wise: - assert opt.target_round >= 0 + if opt.round_wise and opt.target_round != None: if slide_round: file_name += str(opt.target_round + 1) else: @@ -78,6 +77,7 @@ def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 mjxproto_dir, round_candidate=opt.target_round, params=params, + use_logistic=opt.use_logistic, ) else: # 南四局の時. X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=opt.target_round) @@ -92,7 +92,7 @@ def set_dataset_whole(mjxproto_dir: str, result_dir: str, opt): # suphnx用 X: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("features", opt) + ".npy")) Y: jnp.ndarray = jnp.load(os.path.join(result_dir, file_name("labels", opt) + ".npy")) fin_scores: jnp.ndarray = jnp.load( - os.path.join(result_dir, file_name("fin_scores") + ".npy") + os.path.join(result_dir, file_name("fin_scores", opt) + ".npy") ) else: X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=None) @@ -180,27 +180,75 @@ def save_params(params, opt, result_dir): parser.add_argument("--target_round", nargs="?", type=int) # 対象となる局 e.g 3の時は東4局のデータのみ使う. parser.add_argument("--round_wise", type=int, default=0) # roundごとにNNを作るか(TD or suphx) parser.add_argument("--use_logistic", type=int, default=0) # logistic関数を使うかどうか + parser.add_argument("--train", type=int, default=1) args = parser.parse_args() mjxproto_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.data_path) result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.result_path) - print( - "start_training, round_wise: {}, use_logistic: {}".format( - args.round_wise, args.use_logistic + if args.train: + print( + "start_training, round_wise: {}, use_logistic: {}, target_round: {}".format( + args.round_wise, args.use_logistic, args.target_round + ) ) - ) - if args.round_wise: - X, Y, scores = set_dataset_round_wise(mjxproto_dir, result_dir, args) - else: - X, Y, scores = set_dataset_whole(mjxproto_dir, result_dir, args) - params, train_log, test_log, test_abs_log = run_training(X, Y, scores, args) - save_params(params, args, result_dir) - save_learning_log(train_log, test_log, test_abs_log, args, result_dir) - plot_learning_log(train_log, test_log, test_abs_log, args, result_dir) - if args.round_wise == 0: - for round in range(8): + if args.round_wise: + X, Y, scores = set_dataset_round_wise(mjxproto_dir, result_dir, args) + else: + X, Y, scores = set_dataset_whole(mjxproto_dir, result_dir, args) + params, train_log, test_log, test_abs_log = run_training(X, Y, scores, args) + save_params(params, args, result_dir) + save_learning_log(train_log, test_log, test_abs_log, args, result_dir) + plot_learning_log(train_log, test_log, test_abs_log, args, result_dir) + if args.round_wise == 0: + for round in range(8): + for i in range(4): + plot_result(params, i, round, args, result_dir) + else: for i in range(4): - plot_result(params, i, round, args, result_dir) + plot_result(params, i, args.target_round, args, result_dir) + else: - for i in range(4): - plot_result(params, i, args.target_round, args, result_dir) + assert not args.target_round # target_roundが指定されていないことを確認する. 
+ params_list: List = ( + [ + jnp.load( + os.path.join( + result_dir, file_name("params", args) + str(target_round) + ".pickle" + ), + allow_pickle=True, + ) + for target_round in range(8) + ] + if args.round_wise + else [ + jnp.load( + os.path.join(result_dir, file_name("params", args) + ".pickle"), + allow_pickle=True, + ) + ] + * 8 + ) + for target in range(4): + fig = plt.figure(figsize=(10, 5)) + axes = fig.subplots(1, 2) + for round_candidate in range(8): + log_score, log_pred = _score_pred_pair( + params_list[round_candidate], + target, + round_candidate, + args.is_round_one_hot, + args.use_logistic, + ) + axes[0].plot(log_score, log_pred, label="round" + str(round_candidate)) + axes[0].set_title("pos" + str(target)) + axes[0].hlines([90, 45, 0, -135], 0, 60000, "red") + axes[1].plot(log_score, log_pred, ".", label="round_" + str(round_candidate)) + axes[1].set_title("pos" + str(target)) + axes[1].hlines([90, 45, 0, -135], 0, 60000, "red") + plt.legend() + _type = "TD" if args.round_wise else "suphx" + save_dir = os.path.join( + result_dir, + file_name("preds", args) + "pos=" + str(target) + _type + ".png", + ) + plt.savefig(save_dir) diff --git a/workspace/suphx-reward-shaping/train.sh b/workspace/suphx-reward-shaping/train.sh index ac2dce10..42e54b73 100755 --- a/workspace/suphx-reward-shaping/train.sh +++ b/workspace/suphx-reward-shaping/train.sh @@ -6,19 +6,24 @@ export epochs=20 # suphx no logistic -python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=0 --use_saved_data=0 +#python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=0 --use_saved_data=0 # suphx use logistic -python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=1 --use_saved_data=0 +#python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=1 --use_saved_data=0 # TD no logistic -for round in 7 6 5 4 3 2 1 0 -do -python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=0 --target_round=$round --use_saved_data=0 -done +#for round in 7 6 5 4 3 2 1 0 +#do +#python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=0 --target_round=$round --use_saved_data=0 +#done # TD logistic -for round in 7 6 5 4 3 2 1 0 -do -python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=1 --target_round=$round --use_saved_data=0 -done +#for round in 7 6 5 4 3 2 1 0 +#do +#python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=1 --target_round=$round --use_saved_data=1 +#done + +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=0 --use_saved_data=1 --train=0 +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=0 --use_logistic=1 --use_saved_data=1 --train=0 +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=0 --use_saved_data=1 --train=0 +python train.py $lr $epochs $batch_size --data_path=$data_path --result_path=$result_path --round_wise=1 --use_logistic=1 --use_saved_data=1 --train=0 diff --git a/workspace/suphx-reward-shaping/train_helper.py 
b/workspace/suphx-reward-shaping/train_helper.py index 05f9eef9..56653613 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -66,8 +66,8 @@ def net(x: jnp.ndarray, params: optax.Params, use_logistic=False) -> jnp.ndarray x = jnp.dot(x, param) if i + 1 < len(params.values()): x = jax.nn.relu(x) - if use_logistic: - x = jnp.exp(x) / (1 + jnp.exp(x)) + if use_logistic: + x = 1 / (1 + jnp.exp(-x)) return x @@ -110,6 +110,7 @@ def train( batch_size: int, buffer_size=1, use_logistic=False, + min_delta=0.001, ): """ 学習用の関数. 線形層を前提としており, バッチ処理やシャッフルのためにtensorflowを使っている. @@ -120,8 +121,6 @@ def train( ) dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)) batched_dataset_test = dataset_test.batch(batch_size, drop_remainder=True) - dataset_abs_test = tf.data.Dataset.from_tensor_slices((X_test, Score_test)) - batched_dataset_abs_test = dataset_abs_test.batch(batch_size, drop_remainder=True) opt_state = optimizer.init(params) train_log, test_log, test_abs_log = [], [], [] @@ -141,17 +140,21 @@ def step(params, opt_state, batch, labels, use_logistic=None): ) cum_loss += loss_value if i % 100 == 0: # print MSE every 100 epochs - pred = net(batched_x[0].numpy(), params) + pred = net(batched_x[0].numpy(), params, use_logistic=use_logistic) print(f"step {i}, loss: {loss_value}, pred {pred}, actual {batched_y[0]}") mean_train_loss = cum_loss / len(batched_dataset_train) mean_test_loss = evaluate(params, batched_dataset_test, use_logistic=use_logistic) - mean_abs_test_loss = evaluate_abs( - params, batched_dataset_abs_test, use_logistic=use_logistic - ) + diff = test_log[-1] - mean_test_loss # record mean of train loss and test loss per epoch train_log.append(float(np.array(mean_train_loss).item(0))) test_log.append(float(np.array(mean_test_loss).item(0))) - test_abs_log.append(float(np.array(mean_abs_test_loss).item(0))) + + # Early stoping + if diff < 0: + break + else: + if diff < min_delta: + break return params, train_log, test_log, test_abs_log @@ -173,7 +176,7 @@ def _score_pred_pair(params, target: int, round_candidate: int, is_round_one_hot x = jnp.array(_create_data_for_plot(j * 1000, round_candidate, is_round_one_hot, target)) pred = net(x, params, use_logistic=use_logistic) # (1, 4) scores.append(j * 1000) - preds.append(pred[target] * 255 - 135) + preds.append(pred[target] * 225 - 135) return scores, preds diff --git a/workspace/suphx-reward-shaping/utils.py b/workspace/suphx-reward-shaping/utils.py index 53d7645b..eac0bc5d 100644 --- a/workspace/suphx-reward-shaping/utils.py +++ b/workspace/suphx-reward-shaping/utils.py @@ -16,7 +16,10 @@ def to_data( - mjxprotp_dir: str, round_candidate=None, params=None + mjxprotp_dir: str, + round_candidate=None, + params=None, + use_logistic=False, ) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]: """ jsonが入っているディレクトリを引数としてjax.numpyのデータセットを作る. 
@@ -55,7 +58,10 @@ def to_data( x = jnp.dot(x, param) if i + 1 < len(params.values()): x = jax.nn.relu(x) - targets_array: jnp.ndarray = x + if use_logistic: + targets_array: jnp.ndarray = jnp.exp(x) / (1 + jnp.exp(x)) + else: + targets_array: jnp.ndarray = x else: targets_array: jnp.ndarray = jnp.array(targets) return (features_array, targets_array, scores_array) From 68e7e52e8572c84e17e76b399bc17083476b70d9 Mon Sep 17 00:00:00 2001 From: nissymori Date: Fri, 30 Sep 2022 21:53:23 +0900 Subject: [PATCH 11/11] fix --- .gitignore | 1 + workspace/suphx-reward-shaping/train.py | 61 +++++++++++++++++-- .../suphx-reward-shaping/train_helper.py | 14 +---- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 751f4b68..b7d41f6a 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ dist workspace/suphx-reward-shaping/resources/* workspace/suphx-reward-shaping/trained_model/* workspace/suphx-reward-shaping/result/* +workspace/suphx-reward-shaping/check/* .DS_Store .vscode/ .python_version diff --git a/workspace/suphx-reward-shaping/train.py b/workspace/suphx-reward-shaping/train.py index 97200b0f..ba6c0516 100644 --- a/workspace/suphx-reward-shaping/train.py +++ b/workspace/suphx-reward-shaping/train.py @@ -1,15 +1,17 @@ import argparse import math import os -import pickle +from statistics import mean from typing import List, Optional import jax import jax.numpy as jnp import matplotlib.pyplot as plt +import numpy as np import optax +import tensorflow as tf from train_helper import _preds_fig, _score_pred_pair, initializa_params, net, save_pickle, train -from utils import _create_data_for_plot, to_data +from utils import _preprocess_score_inv, to_data """ 局ごとにデータとモデルを用意するので @@ -83,7 +85,7 @@ def set_dataset_round_wise(mjxproto_dir: str, result_dir: str, opt): # TD用 X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=opt.target_round) jnp.save(os.path.join(result_dir, file_name("features", opt)), X) jnp.save(os.path.join(result_dir, file_name("labels", opt)), Y) - jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), Y) + jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), fin_scores) return X, Y, fin_scores @@ -98,7 +100,7 @@ def set_dataset_whole(mjxproto_dir: str, result_dir: str, opt): # suphnx用 X, Y, fin_scores = to_data(mjxproto_dir, round_candidate=None) jnp.save(os.path.join(result_dir, file_name("features", opt)), X) jnp.save(os.path.join(result_dir, file_name("labels", opt)), Y) - jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), Y) + jnp.save(os.path.join(result_dir, file_name("fin_scores", opt)), fin_scores) return X, Y, fin_scores @@ -168,6 +170,20 @@ def save_params(params, opt, result_dir): save_pickle(params, os.path.join(result_dir, file_name("params", opt) + ".pickle")) +def evaluate_abs( + params: optax.Params, X, score, batch_size, use_logistic=False +) -> float: # 前処理する前のスケールでの絶対誤差 + dataset = tf.data.Dataset.from_tensor_slices((X, score)) + batched_dataset = dataset.batch(batch_size, drop_remainder=True) + cum_loss = 0 + for batched_x, batched_y in batched_dataset: + cum_loss += jnp.abs( + _preprocess_score_inv(net(batched_x.numpy(), params, use_logistic=use_logistic)) + - batched_y.numpy() + ).mean() + return cum_loss / len(batched_dataset) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("lr", help="Enter learning rate", type=float) @@ -228,6 +244,7 @@ def save_params(params, opt, result_dir): ] * 8 ) + # plot pred for target in range(4): fig = 
plt.figure(figsize=(10, 5)) axes = fig.subplots(1, 2) @@ -252,3 +269,39 @@ def save_params(params, opt, result_dir): file_name("preds", args) + "pos=" + str(target) + _type + ".png", ) plt.savefig(save_dir) + # plot abs loss + fig = plt.figure(figsize=(10, 5)) + axes = fig.subplots(1, 2) + abs_losses: List = [] + for round_candidate in range(8): + X: jnp.ndarray = jnp.load( + os.path.join( + result_dir, file_name("features", args) + str(round_candidate) + ".npy" + ) + ) + fin_scores: jnp.ndarray = jnp.load( + os.path.join( + result_dir, file_name("fin_scores", args) + str(round_candidate) + ".npy" + ) + ) + abs_loss = evaluate_abs( + params_list[round_candidate], + X, + fin_scores, + args.batch_size, + use_logistic=args.use_logistic, + ) + print(round_candidate, abs_loss, fin_scores[:3]) + abs_losses.append(float(np.array(abs_loss).item(0))) + axes[0].plot(abs_losses) + axes[0].set_title("abs loss") + axes[0].hlines(mean(abs_losses), 0, 8, "red") + axes[1].plot(abs_losses, ".") + axes[1].hlines(mean(abs_losses), 0, 8, "red") + plt.legend() + _type = "TD" if args.round_wise else "suphx" + save_dir = os.path.join( + result_dir, + file_name("abs_loss", args) + _type + ".png", + ) + plt.savefig(save_dir) diff --git a/workspace/suphx-reward-shaping/train_helper.py b/workspace/suphx-reward-shaping/train_helper.py index 56653613..868b402c 100644 --- a/workspace/suphx-reward-shaping/train_helper.py +++ b/workspace/suphx-reward-shaping/train_helper.py @@ -86,18 +86,6 @@ def evaluate(params: optax.Params, batched_dataset, use_logistic=False) -> float return cum_loss / len(batched_dataset) -def evaluate_abs( - params: optax.Params, batched_dataset, use_logistic=False -) -> float: # 前処理する前のスケールでの絶対誤差 - cum_loss = 0 - for batched_x, batched_y in batched_dataset: - cum_loss += jnp.abs( - _preprocess_score_inv(net(batched_x.numpy(), params, use_logistic=use_logistic)) - - batched_y.numpy() - ).mean() - return cum_loss / len(batched_dataset) - - def train( params: optax.Params, optimizer: optax.GradientTransformation, @@ -144,7 +132,7 @@ def step(params, opt_state, batch, labels, use_logistic=None): print(f"step {i}, loss: {loss_value}, pred {pred}, actual {batched_y[0]}") mean_train_loss = cum_loss / len(batched_dataset_train) mean_test_loss = evaluate(params, batched_dataset_test, use_logistic=use_logistic) - diff = test_log[-1] - mean_test_loss + diff = test_log[-1] - float(np.array(mean_test_loss).item(0)) # record mean of train loss and test loss per epoch train_log.append(float(np.array(mean_train_loss).item(0))) test_log.append(float(np.array(mean_test_loss).item(0)))
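
Note on the score scaling used across these patches: below is a minimal, standalone sketch (not part of the patch series itself) of how the affine preprocessing added to utils.py and the optional logistic head in train_helper.py fit together. The constants and functions mirror game_rewards, _preprocess_score and _preprocess_score_inv from the diffs above; the logit values near the end are made up purely for illustration.

    import jax.numpy as jnp

    # Final-placement rewards, as in utils.py (game_rewards).
    GAME_REWARDS = jnp.array([90.0, 45.0, 0.0, -135.0])

    def preprocess_score(score):
        # Same affine map as _preprocess_score: [-135, 90] -> [0, 1].
        return (score + 135.0) / 225.0

    def preprocess_score_inv(processed):
        # Same inverse map as _preprocess_score_inv: [0, 1] -> [-135, 90].
        return 225.0 * processed - 135.0

    targets = preprocess_score(GAME_REWARDS)      # [1.0, 0.8, 0.6, 0.0]
    recovered = preprocess_score_inv(targets)     # [90., 45., 0., -135.]
    assert bool(jnp.allclose(recovered, GAME_REWARDS))

    # With --use_logistic=1, net() ends in a sigmoid, so predictions already
    # lie in (0, 1); mapping them back with the inverse recovers reward-scale
    # values, which is what evaluate_abs and the prediction plots report.
    logits = jnp.array([0.3, -0.1, 1.2, -2.0])    # hypothetical pre-activations
    preds = 1.0 / (1.0 + jnp.exp(-logits))        # logistic head from net()
    print(preprocess_score_inv(preds))            # predictions on the [-135, 90] scale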