From 73c03b15a429730dcc788e28ab3a50293e2574fb Mon Sep 17 00:00:00 2001 From: mosout Date: Sun, 7 May 2023 14:23:28 +0000 Subject: [PATCH 1/5] dtr exp --- .gitignore | 4 ++- configs/gpt2_pretrain.py | 12 +++++-- libai/remat/__init__.py | 2 ++ libai/remat/config.py | 7 +++++ libai/remat/trainer.py | 15 +++++++++ libai/utils/distributed.py | 59 ++++++++++++++++++++++++++--------- tools/run_remat.sh | 36 +++++++++++++++++++++ tools/train_remat.py | 64 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 180 insertions(+), 19 deletions(-) create mode 100644 libai/remat/__init__.py create mode 100644 libai/remat/config.py create mode 100644 libai/remat/trainer.py create mode 100755 tools/run_remat.sh create mode 100644 tools/train_remat.py diff --git a/.gitignore b/.gitignore index ab28bbee3..f0985c93f 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,6 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ +output +exp_data \ No newline at end of file diff --git a/configs/gpt2_pretrain.py b/configs/gpt2_pretrain.py index 313c814e1..8c53a56b2 100644 --- a/configs/gpt2_pretrain.py +++ b/configs/gpt2_pretrain.py @@ -7,9 +7,14 @@ from .common.models.graph import graph -vocab_file = "./data_test/gpt_data/gpt2-vocab.json" -merge_files = "./data_test/gpt_data/gpt2-merges.txt" -data_prefix = "./data_test/gpt_data/loss_compara_content_sentence" +# vocab_file = "./data_test/gpt_data/gpt2-vocab.json" +# merge_files = "./data_test/gpt_data/gpt2-merges.txt" +# data_prefix = "./data_test/gpt_data/loss_compara_content_sentence" +vocab_file = "/data/home/liupeihong/datasets/libai_dataset/gpt2-vocab.json" +merge_files = "/data/home/liupeihong/datasets/libai_dataset/gpt2-merges.txt" +data_prefix = ( + "/data/home/liupeihong/datasets/libai_dataset/loss_compara_content_sentence" +) tokenization.tokenizer.vocab_file = vocab_file tokenization.tokenizer.merges_file = merge_files @@ -42,3 +47,4 @@ train.evaluation.evaluator = LazyCall(PPLEvaluator)() train.output_dir = "./output/gpt2_output" +graph.enabled=False diff --git a/libai/remat/__init__.py b/libai/remat/__init__.py new file mode 100644 index 000000000..2fed58a7c --- /dev/null +++ b/libai/remat/__init__.py @@ -0,0 +1,2 @@ +from .config import remat_argument_parser +from .trainer import RematTrainer \ No newline at end of file diff --git a/libai/remat/config.py b/libai/remat/config.py new file mode 100644 index 000000000..7e7f068bb --- /dev/null +++ b/libai/remat/config.py @@ -0,0 +1,7 @@ +from libai.config import default_argument_parser + + +def remat_argument_parser(epilog=None): + parser = default_argument_parser(epilog=epilog) + parser.add_argument("--threshold", type=int) + return parser diff --git a/libai/remat/trainer.py b/libai/remat/trainer.py new file mode 100644 index 000000000..9cdb4c439 --- /dev/null +++ b/libai/remat/trainer.py @@ -0,0 +1,15 @@ +from libai.engine import DefaultTrainer +import oneflow as flow + + +def sync(): + flow.comm.barrier() + + +class RematTrainer(DefaultTrainer): + def __init__(self, cfg): + super().__init__(cfg) + + def after_step(self): + sync() + super().after_step() diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index e7914a0ad..59cc235b2 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -29,13 +29,14 @@ def _merge_devices(devices): num_gpus_per_node = get_world_size() // get_num_nodes() - node_devices = [node_id * num_gpus_per_node + device_id for node_id, device_id in devices] + node_devices = [ + node_id * num_gpus_per_node + 
device_id for node_id, device_id in devices + ] return node_devices class _DistributeUtil(object): def __init__(self, cfg): - self._init_distributed_env(cfg) self._init_parallel_size(cfg) self._init_placement_group(cfg) @@ -47,7 +48,10 @@ def _init_distributed_env(self, cfg): num_nodes = get_num_nodes() num_gpus_per_node = get_world_size() // num_nodes - if try_get_key(cfg, "num_gpus_per_node", default=num_gpus_per_node) != num_gpus_per_node: + if ( + try_get_key(cfg, "num_gpus_per_node", default=num_gpus_per_node) + != num_gpus_per_node + ): # This means key(num_gpus_per_node) saved in config is not equal # to environment variable. # Give user a warning about inconsistent reproduce environment. @@ -74,7 +78,6 @@ def _init_distributed_env(self, cfg): self._device_type = try_get_key(cfg, "device_type", default="cuda") def _init_parallel_size(self, cfg): - # tensor parallel size self._tensor_parallel_size = min(cfg.tensor_parallel_size, self.world_size) assert self.world_size % self._tensor_parallel_size == 0, ( @@ -121,7 +124,9 @@ def _init_parallel_size(self, cfg): if try_get_key(cfg, "pipeline_num_layers") is None: cfg.pipeline_num_layers = 10000 - self._model_parallel_size = self._pipeline_parallel_size * self._tensor_parallel_size + self._model_parallel_size = ( + self._pipeline_parallel_size * self._tensor_parallel_size + ) assert self.world_size % self._model_parallel_size == 0, ( f"world size ({self.world_size}) is not divisible by" @@ -152,21 +157,27 @@ def _init_placement_group(self, cfg): and cfg.pipeline_num_layers >= 8 and cfg.pipeline_num_layers % self._pipeline_parallel_size == 0 ): - temp_num_layers_per_stage = cfg.pipeline_num_layers // self._pipeline_parallel_size + temp_num_layers_per_stage = ( + cfg.pipeline_num_layers // self._pipeline_parallel_size + ) actual_pipeline_num_layers = cfg.pipeline_num_layers + min( self._pipeline_parallel_size - 1, temp_num_layers_per_stage ) else: actual_pipeline_num_layers = cfg.pipeline_num_layers - num_layers_per_stage = actual_pipeline_num_layers // self._pipeline_parallel_size + num_layers_per_stage = ( + actual_pipeline_num_layers // self._pipeline_parallel_size + ) stage_offset = actual_pipeline_num_layers % self._pipeline_parallel_size # stage_offset can make the later stages contain more layers when pipeline_num_layers # cannot be divided by pipeline_parallel_size. # This can make pipeline parallel more memory efficient. 
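        # For example (hypothetical numbers, not taken from this config): with
        # actual_pipeline_num_layers = 10 and pipeline_parallel_size = 4,
        # num_layers_per_stage = 2 and stage_offset = 2, so stages 0 and 1
        # are assigned 2 layers each while stages 2 and 3 hold 3 layers each.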
self._layer_stage_ids = [] - for i in range(0, actual_pipeline_num_layers - stage_offset, num_layers_per_stage): + for i in range( + 0, actual_pipeline_num_layers - stage_offset, num_layers_per_stage + ): stage_id = i // num_layers_per_stage if stage_id >= (self._pipeline_parallel_size - stage_offset): self._layer_stage_ids.append(stage_id) @@ -180,7 +191,9 @@ def _init_placement_group(self, cfg): self._layer_stage_ids = cfg.custom_pipeline_stage_id cfg.actual_pipeline_stage_id = self._layer_stage_ids - self._layer_ranks = [stages_devices[stage_id] for stage_id in self._layer_stage_ids] + self._layer_ranks = [ + stages_devices[stage_id] for stage_id in self._layer_stage_ids + ] def _init_parallel_hierarchy(self): if self.is_data_model_parallel(): @@ -228,7 +241,12 @@ def device_type(self): return self._device_type def set_device_type(self, device_type): - assert device_type in ["cpu", "cuda"], f"not supported for {device_type}" + assert device_type in [ + "cpu", + "cuda", + "cpu+remat", + "cuda+remat", + ], f"not supported for {device_type}" self._device_type = device_type def get_layer_ranks(self, layer_idx): @@ -316,6 +334,7 @@ def get_layer_placement(layer_idx, device_type=None): device_type = dist_util.device_type if device_type is None else device_type if not flow.cuda.is_available() and device_type == "cuda": device_type = "cpu" + return flow.placement.all("cuda+remat") return flow.placement( device_type, dist_util.get_layer_ranks(layer_idx), @@ -353,7 +372,9 @@ def get_hidden_sbp(): def get_data_parallel_rank(): dist_util = get_dist_util() - return (flow.env.get_rank() // dist_util.model_parallel_size) % dist_util.data_parallel_size + return ( + flow.env.get_rank() // dist_util.model_parallel_size + ) % dist_util.data_parallel_size def get_data_parallel_size(): @@ -429,10 +450,14 @@ def convert_to_distributed_default_setting(t): sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=get_layer_placement(0), ) + else: dist_util = get_dist_util() device_type = dist_util.device_type - return t.to_global(placement=flow.placement(device_type, ranks=t.placement.ranks)) + return t.to_global(flow.placement.all("cuda+remat"), flow.sbp.broadcast) + return t.to_global( + placement=flow.placement(device_type, ranks=t.placement.ranks) + ) def ttol(tensor, pure_local=False, ranks=None): @@ -443,7 +468,8 @@ def ttol(tensor, pure_local=False, ranks=None): tensor = tensor.to_global(placement=placement).to_local() else: tensor = tensor.to_global( - sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=placement + sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=placement, ).to_local() return tensor @@ -462,9 +488,12 @@ def tensor_to_rank0(tensor, device="cuda", to_local=False): assert device in ["cpu", "cuda"], f"not supported for device:{device}" if tensor.is_global: # Consider if it's 2d mesh, ranks should be [[0]] instead of [0] - placement = flow.placement(device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]]) + placement = flow.placement( + device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]] + ) tensor = tensor.to_global( - sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=placement + sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=placement, ) if to_local: tensor = ttol(tensor) diff --git a/tools/run_remat.sh b/tools/run_remat.sh new file mode 100755 index 000000000..0734a04bb --- /dev/null +++ b/tools/run_remat.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- 
"$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt2 +RESULT_DIR=./exp_data +array=(6500 7500) +# array=(3500 4000 4500 5000 5500 6000 6500 7500 8500 9500) +config_file=/data/home/liupeihong/codes/libai-normal/libai/config/configs/gpt2_pretrain.py + +METHOD=ours +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ + python3 -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + + +METHOD=dte-our-impl +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ + python3 -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dtr-no-free +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + python3 -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done \ No newline at end of file diff --git a/tools/train_remat.py b/tools/train_remat.py new file mode 100644 index 000000000..0cdd7b94b --- /dev/null +++ b/tools/train_remat.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2021 The OneFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import random +import sys + +import numpy as np +import oneflow as flow +import argparse +import sys + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from libai.config import LazyConfig +from libai.engine import DefaultTrainer, default_setup +from libai.remat import remat_argument_parser, RematTrainer +from libai.utils import distributed as dist + +logger = logging.getLogger("libai." 
+ __name__) + + +def main(args): + cfg = LazyConfig.load(args.config_file) + cfg = LazyConfig.apply_overrides(cfg, args.opts) + default_setup(cfg, args) + + seed_for_rank = cfg.train.seed + flow.env.get_rank() + flow.manual_seed(seed_for_rank) + flow.cuda.manual_seed(seed_for_rank) + np.random.seed(seed_for_rank) + random.seed(seed_for_rank) + + if args.fast_dev_run: + cfg.train.train_epoch = 0 + cfg.train.train_iter = 30 + cfg.train.evaluation.enabled = False + cfg.train.log_period = 1 + cfg.train.input_placement_device = "cuda+remat" + + trainer = RematTrainer(cfg) + return trainer.train() + + +if __name__ == "__main__": + dist.set_device_type("cuda+remat") + args = remat_argument_parser().parse_args() + flow.remat.set_budget(f"{args.threshold}MB") + flow.remat.set_small_pieces_optimization(False) + main(args) From bdd42340c60163ea7f57d5024dcacbf90cfcaa6d Mon Sep 17 00:00:00 2001 From: daquexian Date: Thu, 11 May 2023 13:57:27 +0800 Subject: [PATCH 2/5] refine --- libai/engine/default.py | 10 +- tools/json_to_pic.py | 356 ++++++++++++++++++++++++++++++++++++++++ tools/run_remat.sh | 16 +- tools/train_remat.py | 2 +- 4 files changed, 370 insertions(+), 14 deletions(-) create mode 100644 tools/json_to_pic.py diff --git a/libai/engine/default.py b/libai/engine/default.py index 14a107167..fe76ee9db 100644 --- a/libai/engine/default.py +++ b/libai/engine/default.py @@ -416,11 +416,11 @@ def build_hooks(self): ret = [ hooks.IterationTimer(), hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode - hooks.PeriodicCheckpointer( - self.checkpointer, - self.cfg.train.checkpointer.period, - max_to_keep=self.cfg.train.checkpointer.max_to_keep, - ), + # hooks.PeriodicCheckpointer( + # self.checkpointer, + # self.cfg.train.checkpointer.period, + # max_to_keep=self.cfg.train.checkpointer.max_to_keep, + # ), ] if self.cfg.train.evaluation.enabled: diff --git a/tools/json_to_pic.py b/tools/json_to_pic.py new file mode 100644 index 000000000..80eeeaa3c --- /dev/null +++ b/tools/json_to_pic.py @@ -0,0 +1,356 @@ +import json +import csv +import sys +import re +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import os + + +our_times = [] +nlr_times = [] +no_allo_times = [] +no_lr_times = [] + + +cwd = Path.cwd() + + +def get_theo_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + return float(j["theoretically time"]) + + +def get_real_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + return float(j["real time"]) + + +def avg(x): + return sum(x) / len(x) + + +def get_searching_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + j = list(map(lambda x: x[2], j["overhead"])) + if len(j) == 0: + return 0 + return sum(j) + + +def get_mem_frag_rate_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + mem_frag_rate = j["mem frag rate"] + if mem_frag_rate is None: + mem_frag_rate = np.nan + mem_frag_rate = float(mem_frag_rate) + # return % + return mem_frag_rate * 100 + + +def get_dataset_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + if 'real time' in j: + x = j["real time"] + if x != 0: + return x * 10**6 + x = j["dataset time"] + if x is not None: + x = float(x) + return x + + +def get_threshold_from_json_file(fn): + return int(fn[-9:][:4]) + if fn[:9] == 'overhead-': + fn = fn[:-5] + return int(fn.split('-')[-1]) + with open(fn) as f: + j = json.load(f) + t = float(j["threshold"][:-2]) + if t == 9900: + t = 10000 + return t + + +model_name = sys.argv[1] + + +def draw_one(ax, data, 
label, i, total, kind):
+    assert kind in ["step", "line", "bar"]
+    colors = {
+        'Coop (Ours)': 'tab:blue',
+        'Coop': 'tab:blue',
+        'DTE': 'tab:orange',
+        'DTR': 'tab:green',
+        'No op-guided allocation': 'tab:brown',
+        'No recomputable in-place': 'tab:purple',
+        'No layout-aware eviction': 'tab:red',
+    }
+    markers = ["o", "*", "D", "^", "s"]
+    zorder = 200 - i
+    length = len(data)
+    data = list(zip(*data))
+    if kind == "step":
+        x = ax.step(
+            *data,
+            where="post",
+            label=label,
+            linewidth=4,
+            marker=markers[i],
+            markevery=[True] + [False] * (length - 1),
+            ms=10,
+            zorder=zorder,
+            color=colors[label],
+        )
+    elif kind == "line":
+        ax.plot(
+            *data,
+            label=label,
+            linewidth=3,
+            marker=markers[i],
+            markevery=1,
+            ms=10,
+            zorder=zorder,
+        )
+    elif kind == "bar":
+        width = 0.25
+        x = np.arange(len(data[0]))
+        bars = ax.bar(
+            x + (i - total / 2 + 0.5) * width,
+            np.where(np.isnan(data[1]), 0.5, data[1]),
+            width=width * 0.88,
+            label=label,
+            zorder=zorder,
+        )
+        for j, bar in enumerate(bars):
+            if np.isnan(data[1][j]):
+                bar.set_color('gray')
+
+
+def draw_from_files_and_draw(
+    *,
+    xlabel,
+    ylabel,
+    get_y,
+    pic_name,
+    name_to_legend_and_fn_patterns,
+    ncols,
+    nrows,
+    pic_kind,
+    data_kind,
+    imgcat=False,
+):
+    assert len(name_to_legend_and_fn_patterns) == ncols * nrows
+    if data_kind in [DK_FRAG, DK_ABLA_FRAG]:
+        fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 6))
+    elif data_kind == DK_TIME:
+        fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(28, 9), sharex=True, sharey='row')
+    elif data_kind == DK_ABLA:
+        fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(28, 9), sharex=True, sharey='row')
+    elif data_kind == DK_OH:
+        fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 6), sharex=True, sharey='row')
+    else:
+        raise ValueError(f"unexpected data_kind: {data_kind}")
+    axs = axs.flatten()
+
+    for i, ((name, resolution), legend_and_fn_patterns) in enumerate(name_to_legend_and_fn_patterns.items()):
+        ax = axs[i]
+        _draw_from_files_and_draw_in_subplot(
+            ax=ax,
+            xlabel=xlabel,
+            ylabel=ylabel,
+            get_y=get_y,
+            legend_and_fn_pattern=legend_and_fn_patterns,
+            pic_kind=pic_kind,
+            data_kind=data_kind,
+            name=name,
+            resolution=resolution,
+            index=i,
+        )
+    left = 0.075 if data_kind in [DK_FRAG, DK_ABLA_FRAG] else 0.06
+    if data_kind == DK_OH:
+        left = 0
+    right = 1
+    top = 1
+    bottom = {DK_FRAG: 0.15, DK_ABLA: 0.18, DK_TIME: 0.11, DK_OH: 0.15}[data_kind]
+    subplot_x_center = (left + right) / 2
+    subplot_y_center = (top + bottom) / 2
+    if data_kind in [DK_TIME, DK_ABLA]:
+        fig.subplots_adjust(top=top, left=left, right=right, bottom=bottom, wspace=0.1, hspace=0.07)
+    elif data_kind in [DK_FRAG, DK_OH]:
+        fig.subplots_adjust(top=top, left=left, right=right, bottom=bottom, wspace=0.1)
+    handles, labels = axs[0].get_legend_handles_labels()  # type: ignore
+    fig.legend(handles, labels, loc='upper center', ncol=3 if data_kind == DK_FRAG else 4, bbox_to_anchor=(subplot_x_center, 0.01), handlelength=1.5)
+
+    if data_kind == DK_FRAG:
+        fig.supylabel(ylabel, y=subplot_y_center, fontsize=YLABEL_FONT_SIZE_FRAG, fontweight='bold')
+    else:
+        fig.supylabel(ylabel, y=subplot_y_center, fontweight='bold')
+    fig.supxlabel(xlabel, x=subplot_x_center, fontweight='bold')
+
+    plt.savefig(pic_name, bbox_inches="tight")
+    if imgcat:
+        os.system(f"imgcat {pic_name}")
+
+
+def _draw_from_files_and_draw_in_subplot(
+    *, ax, xlabel, ylabel, get_y, legend_and_fn_pattern, pic_kind, data_kind, name, resolution, index
+):
+    data = {}
+    threshold_set = set()
+    for label, (fn_pattern, predicate) in 
legend_and_fn_pattern.items(): + match = re.compile(fn_pattern).match + fns = list( + filter(match, (str(x.name) for x in cwd.iterdir())) + ) + fns = list(filter(lambda fn: predicate(int(match(fn).group(1))), fns)) + thresholds = list(map(get_threshold_from_json_file, fns)) + threshold_set = threshold_set.union(thresholds) + data[label] = list(zip(thresholds, list(map(get_y, fns)))) + max_threshold = max(threshold_set) + min_threshold = min(threshold_set) + if data_kind in [DK_TIME, DK_ABLA]: + base_time = min(min(x[1] for x in d) for d in data.values()) + # dtr_base_time = min(x[1] for x in data["DTR"]) + for label, d in data.items(): + for i in range(len(d)): + # if label == 'DTR': + # d[i] = (d[i][0], d[i][1] / dtr_base_time) + # else: + d[i] = (d[i][0], d[i][1] / base_time) + + if pic_kind == "bar": + for threshold in threshold_set: + for label, d in data.items(): + if threshold not in list(zip(*d))[0]: + d.append((threshold, np.nan)) + for label in data: + data[label].sort(key=lambda x: x[0]) + # print(data[label]) + data[label] = list(map(lambda x: (x[0] / max_threshold, x[1]), data[label])) + # pop max_threshold because it doesn't have mem frag + # pop min_threshold because most data is None + if pic_kind == "bar" and data_kind in [DK_FRAG, DK_OH]: + data[label].pop() + threshold_set.discard(max_threshold) + del data[label][0] + threshold_set.discard(min_threshold) + + # print(f"max_threshold: {max_threshold}") + + for i, (label, d) in enumerate(data.items()): + draw_one(ax, d, label, i, len(data), pic_kind) + + if name[:5] == 'GPT-2' or name[:4] == 'BERT': + ax.plot(0.77, 1.25, label='SAR', color='tab:brown', markersize=15, marker='*', linestyle='None') + + if pic_kind in ["step", "line"]: + if data_kind in [DK_TIME, DK_ABLA]: + ax.set_xticks(np.arange(0, 1.1, 0.2)) + ax.set_xticks(np.arange(0, 1.1, 0.1), minor=True) + ax.set_xlim(left=0.15, right=1.1) + + flag = True + if index >= 4: + flag = False + if data_kind == DK_ABLA: + flag = True + if flag: + ax.set_ylim(top=1.57) + ax.text(1.07, 1.51, name, fontsize=24, fontweight="bold", ha='right') + ax.text(1.07, 1.49, resolution, fontsize=18, ha='right', va='top') + else: + ax.set_ylim(top=1.39) + ax.text(1.07, 1.34, name, fontsize=24, fontweight="bold", ha='right') + ax.text(1.07, 1.32, resolution, fontsize=18, ha='right', va='top') + + ax.grid(which='both') + if pic_kind in ["bar"]: + x = list(map(lambda x: x / max_threshold, sorted(list(threshold_set)))) + ax.set_xticks(np.arange(len(x)), x) + ax.grid(axis="y") + ax.set_title(name, fontsize=21, fontweight='bold', pad=12) + + +DK_ABLA_FRAG = "abfrag" +DK_FRAG = "frag" +DK_OH = "oh" +DK_TIME = "time" +DK_ABLA = "ablation" +DK_FWS = "fws" +data_kind = sys.argv[3] if len(sys.argv) > 3 else DK_FRAG + +DEFAULT_FONT_SIZE = 22 +YLABEL_FONT_SIZE_FRAG = 20 if data_kind == DK_FRAG else None +plt.rcParams.update({"font.size": DEFAULT_FONT_SIZE}) + + +def divisible_by(n): + return lambda x: x % n == 0 + + +def unet_predicate(fn): + return divisible_by(1000)(fn) + +def gpt2_predicate(fn): + return divisible_by(850)(fn) + +def resnet50_predicate(fn): + return divisible_by(1000)(fn) + + +if data_kind == DK_TIME: + d = { + # ("GPT-2 (8)", "Sequence length 512"): ("bert-new2", unet_predicate), + # ("BERT Large (4)", "Sequence length 512"): ("bert-new2", unet_predicate), + # ("BiLSTM (2048)", "Input dimension 100,\nHidden dimension 256,\nSequence length 128"): ("lstm_text-new2", divisible_by(850)), + # # ("ResNet-152 (55)", "224x224"): ("resnet152-new2", unet_predicate), + # ("U-Net (5)", 
"460x608"): ("unet-new2", unet_predicate), + # ("Swin-T (40)", "224x224"): ("stn2", unet_predicate), + # ("ResNet-50 (115)", "224x224"): ("resnet50-new2", unet_predicate), + # ("Inception V3 (96)", "299x299"): ("inception_v3-new2", unet_predicate), + # ("DenseNet-121 (70)", "224x224"): ("densenet121-new2", unet_predicate), + + ("gpt", "32x32"): ("gpt2", gpt2_predicate), + ("gpt1", "32x32"): ("gpt2", gpt2_predicate), + ("gpt2", "32x32"): ("gpt2", gpt2_predicate), + ("gpt3", "32x32"): ("gpt2", gpt2_predicate), + ("gpt4", "32x32"): ("gpt2", gpt2_predicate), + ("gpt5", "32x32"): ("gpt2", gpt2_predicate), + ("gpt6", "32x32"): ("gpt2", gpt2_predicate), + ("gpt7", "32x32"): ("gpt2", gpt2_predicate), + } + name_to_legend_and_fn_patterns = {} + for k, v in d.items(): + if k == ("Swin-T (40)", "224x224"): + name_to_legend_and_fn_patterns[k] = { + "Coop (Ours)": (rf"{v[0]}-ours-(\d{{4,5}}).json", v[1]), + "DTE": (rf"{v[0]}-no-gp-(\d{{4,5}}).json", v[1]), + "DTR": (rf"{v[0]}-no-fbip-(\d{{4,5}}).json", v[1]), + } + continue + name_to_legend_and_fn_patterns[k] = { + "Coop (Ours)": (rf"{v[0]}-ours-(\d{{4,5}}).json", v[1]), + "DTE": (rf"{v[0]}-dte-our-impl-(\d{{4,5}}).json", v[1]), + "DTR": (rf"{v[0]}-dtr-no-free-(\d{{4,5}}).json", v[1]), + } + + draw_from_files_and_draw( + xlabel="Memory Ratio", + ylabel="Compute Overhead (x)", + get_y=get_dataset_time_from_json_file, + pic_name=f"compute-overhead-main.pdf", + name_to_legend_and_fn_patterns=name_to_legend_and_fn_patterns, + ncols=4, + nrows=2, + imgcat=True, + pic_kind=sys.argv[2], + data_kind=data_kind, + ) \ No newline at end of file diff --git a/tools/run_remat.sh b/tools/run_remat.sh index 0734a04bb..c1560eaed 100755 --- a/tools/run_remat.sh +++ b/tools/run_remat.sh @@ -5,32 +5,32 @@ set -ux SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) MODEL_NAME=gpt2 -RESULT_DIR=./exp_data -array=(6500 7500) +RESULT_DIR=./exp_data1 +# array=(4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500) +array=(3400 4250 5100 5950 6800 7650 8500) # array=(3500 4000 4500 5000 5500 6000 6500 7500 8500 9500) -config_file=/data/home/liupeihong/codes/libai-normal/libai/config/configs/gpt2_pretrain.py +config_file=/home/dev/files/repos/libai-normal/libai/config/configs/gpt2_pretrain.py METHOD=ours for threshold in "${array[@]}"; do CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ - python3 -m oneflow.distributed.launch \ + python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run done - METHOD=dte-our-impl for threshold in "${array[@]}"; do CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ - python3 -m oneflow.distributed.launch \ + python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run done METHOD=dtr-no-free for threshold in "${array[@]}"; do - CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 
\ - python3 -m oneflow.distributed.launch \ + CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run done \ No newline at end of file diff --git a/tools/train_remat.py b/tools/train_remat.py index 0cdd7b94b..a94f01ae3 100644 --- a/tools/train_remat.py +++ b/tools/train_remat.py @@ -47,7 +47,7 @@ def main(args): if args.fast_dev_run: cfg.train.train_epoch = 0 - cfg.train.train_iter = 30 + cfg.train.train_iter = 10 cfg.train.evaluation.enabled = False cfg.train.log_period = 1 cfg.train.input_placement_device = "cuda+remat" From a176b7e75ff5fbaea7c9d5bff6883856c1f784ed Mon Sep 17 00:00:00 2001 From: Mosout Date: Thu, 11 May 2023 06:03:37 +0000 Subject: [PATCH 3/5] refine --- configs/dtr_gpt2_pretrain.py | 48 ++++++++++++++++++++++++++++++++++++ configs/gpt2_pretrain.py | 12 +++------ 2 files changed, 51 insertions(+), 9 deletions(-) create mode 100644 configs/dtr_gpt2_pretrain.py diff --git a/configs/dtr_gpt2_pretrain.py b/configs/dtr_gpt2_pretrain.py new file mode 100644 index 000000000..71d44abe3 --- /dev/null +++ b/configs/dtr_gpt2_pretrain.py @@ -0,0 +1,48 @@ +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from .common.models.gpt import pretrain_model as model +from .common.train import train +from .common.optim import optim +from .common.data.gpt_dataset import dataloader, tokenization + +from .common.models.graph import graph + +__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset" +vocab_file = f"{__dataset_root}/gpt2-vocab.json" +merge_files = f"{__dataset_root}/gpt2-merges.txt" +data_prefix = ( + f"{__dataset_root}/loss_compara_content_sentence" +) + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix +dataloader.test[0].dataset.data_prefix = data_prefix +dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix + +# GPT-2 model config +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.num_attention_heads = 16 +model.cfg.hidden_size = 384 +model.cfg.ffn_hidden_size = 1536 +model.cfg.hidden_layers = 6 +model.cfg.max_seq_length = 1024 + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 1.5e-4 + +train.train_micro_batch_size = 4 +train.amp.enabled = False + +train.evaluation.evaluator = LazyCall(PPLEvaluator)() + +train.output_dir = "./output/gpt2_output" +graph.enabled = False diff --git a/configs/gpt2_pretrain.py b/configs/gpt2_pretrain.py index 8c53a56b2..313c814e1 100644 --- a/configs/gpt2_pretrain.py +++ b/configs/gpt2_pretrain.py @@ -7,14 +7,9 @@ from .common.models.graph import graph -# vocab_file = "./data_test/gpt_data/gpt2-vocab.json" -# merge_files = "./data_test/gpt_data/gpt2-merges.txt" -# data_prefix = "./data_test/gpt_data/loss_compara_content_sentence" -vocab_file = "/data/home/liupeihong/datasets/libai_dataset/gpt2-vocab.json" -merge_files = 
"/data/home/liupeihong/datasets/libai_dataset/gpt2-merges.txt" -data_prefix = ( - "/data/home/liupeihong/datasets/libai_dataset/loss_compara_content_sentence" -) +vocab_file = "./data_test/gpt_data/gpt2-vocab.json" +merge_files = "./data_test/gpt_data/gpt2-merges.txt" +data_prefix = "./data_test/gpt_data/loss_compara_content_sentence" tokenization.tokenizer.vocab_file = vocab_file tokenization.tokenizer.merges_file = merge_files @@ -47,4 +42,3 @@ train.evaluation.evaluator = LazyCall(PPLEvaluator)() train.output_dir = "./output/gpt2_output" -graph.enabled=False From 97ee979663468805f0c3f15f476c947bd75b4fab Mon Sep 17 00:00:00 2001 From: Mosout Date: Sat, 13 May 2023 17:12:54 +0000 Subject: [PATCH 4/5] refine --- .gitignore | 2 +- configs/dtr_gpt2_pretrain.py | 3 +- configs/dtr_gpt3_pretrain.py | 48 ++++++++++++++ libai/engine/default.py | 80 ++++++++++++++++++++--- tools/json_to_pic.py | 3 + tools/{run_remat.sh => run_remat_gpt2.sh} | 16 +++-- tools/run_remat_gpt3.sh | 35 ++++++++++ 7 files changed, 169 insertions(+), 18 deletions(-) create mode 100644 configs/dtr_gpt3_pretrain.py rename tools/{run_remat.sh => run_remat_gpt2.sh} (81%) create mode 100755 tools/run_remat_gpt3.sh diff --git a/.gitignore b/.gitignore index f0985c93f..574f83391 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,4 @@ dmypy.json # Pyre type checker .pyre/ output -exp_data \ No newline at end of file +exp_data* \ No newline at end of file diff --git a/configs/dtr_gpt2_pretrain.py b/configs/dtr_gpt2_pretrain.py index 71d44abe3..22b4f98cf 100644 --- a/configs/dtr_gpt2_pretrain.py +++ b/configs/dtr_gpt2_pretrain.py @@ -39,7 +39,8 @@ optim.lr = 1.5e-4 -train.train_micro_batch_size = 4 +train.train_micro_batch_size = 4 +# train.train_micro_batch_size = 42 for a800 train.amp.enabled = False train.evaluation.evaluator = LazyCall(PPLEvaluator)() diff --git a/configs/dtr_gpt3_pretrain.py b/configs/dtr_gpt3_pretrain.py new file mode 100644 index 000000000..35cb5c2c5 --- /dev/null +++ b/configs/dtr_gpt3_pretrain.py @@ -0,0 +1,48 @@ +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from .common.models.gpt import pretrain_model as model +from .common.train import train +from .common.optim import optim +from .common.data.gpt_dataset import dataloader, tokenization + +from .common.models.graph import graph + +__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset" +vocab_file = f"{__dataset_root}/gpt2-vocab.json" +merge_files = f"{__dataset_root}/gpt2-merges.txt" +data_prefix = ( + f"{__dataset_root}/loss_compara_content_sentence" +) + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix +dataloader.test[0].dataset.data_prefix = data_prefix +dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix + +# GPT-2 model config +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.num_attention_heads = 32 +model.cfg.hidden_size = 4096 +model.cfg.ffn_hidden_size = 4096*4 +model.cfg.hidden_layers = 32 +model.cfg.max_seq_length = 1024 + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 1.5e-4 + +train.train_micro_batch_size = 2 +train.amp.enabled = False + +train.evaluation.evaluator = LazyCall(PPLEvaluator)() + +train.output_dir = 
"./output/gpt3_output" +graph.enabled = False diff --git a/libai/engine/default.py b/libai/engine/default.py index fe76ee9db..ef8ba0bda 100644 --- a/libai/engine/default.py +++ b/libai/engine/default.py @@ -44,6 +44,51 @@ # https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py # -------------------------------------------------------- +def count_all_parameters(model, verbose=False): + """ + Count total, trainable, and non-trainable parameters in a PyTorch model. + Args: + model (nn.Module): The model to count parameters for. + verbose (bool, optional): Print detailed information if True. + Returns: + Tuple containing total, trainable, and non-trainable parameters, and percent trainable parameters. + """ + train_params, all_params = 0, 0 + for _, param in model.named_parameters(): + num_params = param.numel() + all_params += num_params + if param.requires_grad: + train_params += num_params + nontrain_params = all_params - train_params + pct_train_params = train_params / all_params * 100 + if verbose: + logger = logging.getLogger(__name__) + logger.info(f"Total params: {format_size(all_params)}") + logger.info(f"Trainable params: {format_size(train_params)}") + logger.info(f"Non-trainable params: {format_size(nontrain_params)}") + logger.info(f"Trainable params %: {pct_train_params:.4f}") + return all_params, train_params, nontrain_params, pct_train_params + + +def format_size(size): + """ + Convert bytes to a human-readable string with appropriate units. + Args: + size (int): The number of bytes. + Returns: + String representing the number of bytes with appropriate units. + """ + k, m, b, t = 1024, 1024**2, 10**9, 10**12 + if size > t: + return f"{round(size / t, 4)}T" + elif size > b: + return f"{round(size / b, 4)}B" + elif size > m: + return f"{round(size / m, 4)}M" + elif size > k: + return f"{round(size / k, 4)}K" + else: + return f"{size}" def _highlight(code, filename): try: @@ -200,6 +245,18 @@ def default_setup(cfg, args): _compile_dependencies() +class EmtypOptimizer(): + def __init__(self,cfg): + pass + + def clip_grad(self): + pass + + def step(self): + pass + + def zero_grad(self): + pass class DefaultTrainer(TrainerBase): """ @@ -314,8 +371,9 @@ def __init__(self, cfg): "Building time: {:.3f} seconds".format(time.time() - start_time) ) - self.optimizer = self.build_optimizer(cfg, self.model) - self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer) + # self.optimizer = self.build_optimizer(cfg, self.model) + self.optimizer = EmtypOptimizer(cfg) + # self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer) if cfg.graph.enabled: self.graph_train = self.build_graph( @@ -345,13 +403,14 @@ def __init__(self, cfg): lr_scheduler=self.lr_scheduler, ) else: - self.checkpointer = Checkpointer( - # Assume you want to save checkpoints together with logs/statistics - self.model, - cfg.train.output_dir, - optimizer=self.optimizer, - lr_scheduler=self.lr_scheduler, - ) + pass + # self.checkpointer = Checkpointer( + # # Assume you want to save checkpoints together with logs/statistics + # self.model, + # cfg.train.output_dir, + # optimizer=self.optimizer, + # lr_scheduler=self.lr_scheduler, + # ) # Loading checkpoint before dataloader construction, because # dataloader needs to know the consumed iterations from @@ -415,7 +474,7 @@ def build_hooks(self): ret = [ hooks.IterationTimer(), - hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode + # hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode # 
hooks.PeriodicCheckpointer( # self.checkpointer, # self.cfg.train.checkpointer.period, @@ -563,6 +622,7 @@ def build_model(cls, cfg): model = build_model(cfg.model) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) + count_all_parameters(model, verbose=True) model._apply(dist.convert_to_distributed_default_setting) return model diff --git a/tools/json_to_pic.py b/tools/json_to_pic.py index 80eeeaa3c..abf3adb89 100644 --- a/tools/json_to_pic.py +++ b/tools/json_to_pic.py @@ -68,6 +68,7 @@ def get_dataset_time_from_json_file(fn): def get_threshold_from_json_file(fn): return int(fn[-9:][:4]) + # return int(fn[-10:][:5]) for a800 if fn[:9] == 'overhead-': fn = fn[:-5] return int(fn.split('-')[-1]) @@ -301,6 +302,8 @@ def unet_predicate(fn): def gpt2_predicate(fn): return divisible_by(850)(fn) + # return divisible_by(7168)(fn) + # return divisible_by(6500)(fn) def resnet50_predicate(fn): return divisible_by(1000)(fn) diff --git a/tools/run_remat.sh b/tools/run_remat_gpt2.sh similarity index 81% rename from tools/run_remat.sh rename to tools/run_remat_gpt2.sh index c1560eaed..6b7b775bb 100755 --- a/tools/run_remat.sh +++ b/tools/run_remat_gpt2.sh @@ -5,15 +5,19 @@ set -ux SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) MODEL_NAME=gpt2 -RESULT_DIR=./exp_data1 +RESULT_DIR=./exp_data + + # array=(4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500) -array=(3400 4250 5100 5950 6800 7650 8500) # array=(3500 4000 4500 5000 5500 6000 6500 7500 8500 9500) -config_file=/home/dev/files/repos/libai-normal/libai/config/configs/gpt2_pretrain.py +array=(3400 4250 5100 5950 6800 7650 8500) +# array=(28672 35840 43008 50176 57344 64512 71680) for a800 + +config_file=/share_nfs/sd_dataset/lph/codes/libai_normal/configs/dtr_gpt2_pretrain.py METHOD=ours for threshold in "${array[@]}"; do - CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run @@ -21,7 +25,7 @@ done METHOD=dte-our-impl for threshold in "${array[@]}"; do - CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run @@ -29,7 +33,7 @@ done METHOD=dtr-no-free for threshold in "${array[@]}"; do - CUDA_VISIBLE_DEVICES=2 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold 
ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ python -m oneflow.distributed.launch \ --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run diff --git a/tools/run_remat_gpt3.sh b/tools/run_remat_gpt3.sh new file mode 100755 index 000000000..1d0a16b87 --- /dev/null +++ b/tools/run_remat_gpt3.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt3 +RESULT_DIR=./exp_data + +array=(32500 39000 45500 52000 58500 65000) +config_file=/share_nfs/sd_dataset/lph/codes/libai_normal/configs/dtr_gpt3_pretrain.py + +METHOD=ours +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dte-our-impl +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dtr-no-free +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done \ No newline at end of file From 79f05b4730e48e89e36d2a7bbd9f7023afe01d67 Mon Sep 17 00:00:00 2001 From: Mosout Date: Sat, 13 May 2023 17:16:49 +0000 Subject: [PATCH 5/5] selective checkpointing --- libai/layers/attention.py | 164 ++++++++++++++++++++++++-------------- tools/run_checkpoint.sh | 17 ++++ 2 files changed, 121 insertions(+), 60 deletions(-) create mode 100755 tools/run_checkpoint.sh diff --git a/libai/layers/attention.py b/libai/layers/attention.py index 0bec6ebc1..45fcb9bc4 100644 --- a/libai/layers/attention.py +++ b/libai/layers/attention.py @@ -19,6 +19,7 @@ import oneflow as flow from oneflow import nn +from oneflow.utils import checkpoint from .linear import Linear @@ -28,6 +29,88 @@ class AttnMaskType(enum.Enum): causal = 2 +class CoreAttention(nn.Module): + def __init__( + self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + scale_mask_softmax_fusion, + attn_mask_type, + apply_query_key_layer_scaling=False, + layer_idx=0, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.head_size = hidden_size // num_attention_heads + self.norm_factor = 1.0 / math.sqrt(float(self.head_size)) + self.use_cache = False + self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + self.attn_mask_type = attn_mask_type + self.coeff = None + if 
apply_query_key_layer_scaling: + self.coeff = layer_idx + 1 + self.norm_factor /= self.coeff + self.attention_dropout_prob = attention_dropout_prob + self.dropout = nn.Dropout(p=attention_dropout_prob) + + def forward(self, query, key, value, attention_mask): + # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] + attention_scores = flow.matmul( + query, key, transpose_b=True, alpha=self.norm_factor + ) + + # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] + if attention_mask is not None: + if self.scale_mask_softmax_fusion: + if self.attn_mask_type == AttnMaskType.padding: + attention_mask = ( + attention_mask.expand_as(attention_scores) + if self.use_cache + else attention_mask + ) + attention_weights = flow._C.fused_scale_mask_softmax_dropout( + attention_scores, + attention_mask, + fill_value=-10000.0, + scale=self.coeff, + p=self.attention_dropout_prob, + )[0] + else: + if self.coeff is not None: + attention_scores *= self.coeff + attention_scores = flow.mul(attention_scores, attention_mask) + attention_scores = attention_scores - 10000.0 * (1 - attention_mask) + # TODO(xingyu.liao): graph will occur `where_scalar` errors + # when using `masked_fill` + # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) + attention_weights = flow.softmax(attention_scores, dim=-1) + # [bsz, num_heads, tgt_len, src_len] + attention_weights = self.dropout(attention_weights) + else: + if ( + self.scale_mask_softmax_fusion + and self.attn_mask_type == AttnMaskType.causal + ): + attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( + attention_scores, + p=self.attention_dropout_prob, + diagonal=0, + tril_scale_value=self.coeff, + tril_fill_value=-10000.0, + )[0] + else: + attention_weights = flow.softmax(attention_scores, dim=-1) + # [bsz, num_heads, tgt_len, src_len] + attention_weights = self.dropout(attention_weights) + + # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] + context = flow.matmul(attention_weights, value) + # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] + context = context.transpose(1, 2) + return context.flatten(2) + + class MultiheadAttention(nn.Module): """Multi-head attention layer, support self attention and cross attention. @@ -80,18 +163,9 @@ def __init__( self.num_heads = num_attention_heads self.head_size = hidden_size // num_attention_heads - self.attn_mask_type = attn_mask_type - - self.attention_dropout_prob = attention_dropout_prob - self.dropout = nn.Dropout(p=attention_dropout_prob) - self.norm_factor = 1.0 / math.sqrt(float(self.head_size)) - self.coeff = None - if apply_query_key_layer_scaling: - self.coeff = layer_idx + 1 - self.norm_factor /= self.coeff self.is_cross_attention = is_cross_attention - self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + self.bias_dropout_fusion = bias_dropout_fusion if self.bias_dropout_fusion: @@ -131,6 +205,15 @@ def __init__( skip_bias_add=self.bias_dropout_fusion, layer_idx=layer_idx, ) + self.core_attention = CoreAttention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + attention_dropout_prob=attention_dropout_prob, + scale_mask_softmax_fusion=scale_mask_softmax_fusion, + attn_mask_type=attn_mask_type, + apply_query_key_layer_scaling=apply_query_key_layer_scaling, + layer_idx=layer_idx, + ) def forward( self, @@ -158,7 +241,7 @@ def forward( use_cache (bool, optional): it will be set to True, when the model is in the inference phase and used for incremental decoding. Defaults to False. 
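                Note: ``use_cache`` is also propagated to ``self.core_attention``
                (the first statement of this method) so that the fused
                scale-mask-softmax path can expand the padding mask at each
                decoding step.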
""" - + self.core_attention.use_cache = use_cache # hidden_states, encoder_states: [S(0), B] # attention_mask: [S(0), B] @@ -193,7 +276,9 @@ def forward( # hidden_states is the last-added state, # the full key and value could be obtained by concatenating with past_key_value. query_key_value = self.query_key_value(hidden_states) - query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size) + query_key_value = query_key_value.view( + bsz, -1, self.num_heads, 3 * self.head_size + ) query_key_value = query_key_value.permute( 0, 2, 1, 3 ) # [bsz, num_heads, src_len, 3 * head_size] @@ -207,58 +292,17 @@ def forward( if use_cache: past_key_value = (key, value) - # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] - attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor) - - # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] - if attention_mask is not None: - if self.scale_mask_softmax_fusion: - if self.attn_mask_type == AttnMaskType.padding: - attention_mask = ( - attention_mask.expand_as(attention_scores) if use_cache else attention_mask - ) - attention_weights = flow._C.fused_scale_mask_softmax_dropout( - attention_scores, - attention_mask, - fill_value=-10000.0, - scale=self.coeff, - p=self.attention_dropout_prob, - )[0] - else: - if self.coeff is not None: - attention_scores *= self.coeff - attention_scores = flow.mul(attention_scores, attention_mask) - attention_scores = attention_scores - 10000.0 * (1 - attention_mask) - # TODO(xingyu.liao): graph will occur `where_scalar` errors - # when using `masked_fill` - # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) - else: - if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal: - attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( - attention_scores, - p=self.attention_dropout_prob, - diagonal=0, - tril_scale_value=self.coeff, - tril_fill_value=-10000.0, - )[0] - else: - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) - - # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] - context = flow.matmul(attention_weights, value) - # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] - context = context.transpose(1, 2) - # Concat multi-head results from # [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size] # SBP sign: [S(0), S(2)] # [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B] - output = self.dense(context.flatten(2)) + + # context = self.core_attention(query, key, value, attention_mask) + + context = checkpoint.checkpoint( + self.core_attention, query, key, value, attention_mask + ) + output = self.dense(context) if self.bias_dropout_fusion: output, bias = output diff --git a/tools/run_checkpoint.sh b/tools/run_checkpoint.sh new file mode 100755 index 000000000..7347fecc1 --- /dev/null +++ b/tools/run_checkpoint.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt2 +RESULT_DIR=./exp_data_${MODEL_NAME} +THRESHOLD=9500 + +config_file=/home/dev/files/repos/libai-normal/libai/config/configs/${MODEL_NAME}_pretrain.py + + +CUDA_VISIBLE_DEVICES=0 
ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-checkpointing ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \
+python -m oneflow.distributed.launch \
+--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \
+tools/train_remat.py --config-file $config_file --threshold $THRESHOLD --fast-dev-run
\ No newline at end of file
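--
A possible end-to-end run of this series (a sketch, not part of the patches:
the invocation below is inferred from tools/run_remat_gpt2.sh and from the
sys.argv handling in tools/json_to_pic.py, which takes the model name, plot
kind, and data kind as argv[1..3] and globs "<model>-<method>-<threshold>.json"
summaries in the current directory):

    bash tools/run_remat_gpt2.sh                     # sweep the remat thresholds
    cd exp_data                                      # where ONEFLOW_REMAT_SUMMARY_FILE_PREFIX points
    python ../tools/json_to_pic.py gpt2 step time    # plot relative compute overhead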