From 26a64e5ea8cd8686abaf716b7afe092e1c2025d6 Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:07:27 +0000 Subject: [PATCH 01/26] automated training scripts for resnet50. --- Classification/resnet50/Readme.md | 225 ++++++++++++++++++ .../resnet50/ansible_workspace/inventory.ini | 5 + .../resnet50/ansible_workspace/set_docker.sh | 18 ++ .../resnet50/ansible_workspace/train.sh | 9 + .../ansible_workspace/update_tools.sh | 3 + .../tools/args_train_ddp_graph_resnet50.sh | 144 +++++++++++ Classification/resnet50/tools/extract.py | 27 +++ .../resnet50/tools/prepare_docker.sh | 14 ++ Classification/resnet50/tools/profile.sh | 7 + Classification/resnet50/tools/train.sh | 7 + 10 files changed, 459 insertions(+) create mode 100644 Classification/resnet50/Readme.md create mode 100644 Classification/resnet50/ansible_workspace/inventory.ini create mode 100755 Classification/resnet50/ansible_workspace/set_docker.sh create mode 100644 Classification/resnet50/ansible_workspace/train.sh create mode 100755 Classification/resnet50/ansible_workspace/update_tools.sh create mode 100755 Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh create mode 100644 Classification/resnet50/tools/extract.py create mode 100755 Classification/resnet50/tools/prepare_docker.sh create mode 100755 Classification/resnet50/tools/profile.sh create mode 100755 Classification/resnet50/tools/train.sh diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md new file mode 100644 index 0000000..c72cdd4 --- /dev/null +++ b/Classification/resnet50/Readme.md @@ -0,0 +1,225 @@ +# 千卡 0.85 + +[toc] + +## 文件目录结构 +``` +├── ansible_workspace # 主节点上的工作目录 +│   ├── inventory.ini # 用来配置节点信息 +│   ├── set_docker.sh # 在各节点上创建docker,并且配置好docker内环境 +│   ├── profile.sh # 根据节点数启动profile +│   ├── train.sh # 根据节点数启动训练 +│   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 +├── tools # 在各个节点使用的文件 +│ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 +│ ├── models.tar.gz # 模型,为防止git网络问题,所以先下载放在共享目录下 +│ ├── extract.py # 提取log中train阶段的throughput的平均值 +│ ├── prepare_docker.sh # 用于配置docker内环境 +│ ├── profile.sh # 根据节点数在本机启动profile +│ └── train.sh # 根据节点数在本机启动训练 +└── Readme.md +``` + +需求:有NVLink,以及 shared_nfs + +以下供参考 + +## 第一步: 配置环境 + +### 1.1 所有节点配置SSH Key,并设置authorized_keys + +(怎么自动化) + +需要一个共享的存储空间,如:`/shared_nfs/k85`,在一个文件夹下准备好 + +- authorized_keys : 在主节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! -d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 创建 authorized_keys 文件 + cat $SSH_DIR/id_rsa.pub > $SSH_DIR/authorized_keys + + # 将 authorized_keys 文件拷贝到共享目录 + cp $SSH_DIR/authorized_keys shared_nfs/k85 + ``` + +- 在子节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! 
-d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 将 authorized_keys 文件拷贝到 .ssh 目录 + cp shared_nfs/k85/authorized_keys $SSH_DIR + ``` + +### 1.2 主节点安装 Ansible,并配置节点ip +示例文件:./ansible_workspace/inventory.ini +```ini +[hosts] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +``` + +### 1.3 共享目录中拷贝镜像、数据集、models脚本 +主要为设置docker内环境的脚本 和 启动docker内训练的脚本 +设置docker内环境脚本(./tools/prepare_docker.sh)如下: +```Bash +#!/bin/bash +# 将tools视为共享目录 +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +# 将需要使用到的脚本拷到对应文件夹下 +cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp tools/profile.sh models/Vision/classification/image/resnet50/ +``` +启动dokcer内训练的脚本(./tools/train.sh)如下: +```Bash +# 根据使用的节点数,来判断本机是否开始训练 +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi +``` +其中[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)参考自OneAutoTest仓库 +### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 +根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: +```Bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + # 根据inventory.ini文件中节点顺序,将NODE_RANK写入docker的环境变量中 + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +# 在docker内运行环境设置的脚本 +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" +``` +使用方式: +```Bash +bash set_docker.sh inventory.ini +``` + +## 第二步:进行测试 + +### 2.1 自动测试与日志搜集 + +编写一个测试命令脚本文件(./ansible_workspace/train.sh) +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" +``` + +- 需要一个参数: 节点数, +- 运行该命令能够自动启动相应数量的节点运行。 +- 
运行结束后收集日志到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 + +### 2.2 自动日志解析 + +可以使用2.1节提供的命令运行多次,比如: + +```bash +train.sh 1 +train.sh 2 +train.sh 4 +train.sh 8 +train.sh 16 +``` + +完成后应该保存了多个日志目录,需要编写一个日志处理脚本,从这些日志目录中提取性能数据并制成 markdown 格式的表格 + +注:不需要完整训练,训练稳定后获取到数据就可以了。 + +### 2.3 自动 nsys 性能测试 + +需要编写一个能够运行 nsys 的性能测试脚本文件(./ansible_workspace/profile.sh),和2.1的脚本类似,只是启动时需要调用nsys,我们需要搜集这些信息分析,然后进行优化。这个脚本文件。 +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" +``` +```Bash +# 根据使用的节点数,来判断是否在本地开始profile +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + # 在启动训练时添加nsys启动路径,即可进行profile + bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi +``` +[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)中包含使用nsys启动的选项 +- 需要一个参数: 节点数, +- 运行该命令能够自动启动相应数量的节点运行。 +- 运行结束后收集日志和nsys相关文件到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/inventory.ini b/Classification/resnet50/ansible_workspace/inventory.ini new file mode 100644 index 0000000..82e0bac --- /dev/null +++ b/Classification/resnet50/ansible_workspace/inventory.ini @@ -0,0 +1,5 @@ +[hosts] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/set_docker.sh b/Classification/resnet50/ansible_workspace/set_docker.sh new file mode 100755 index 0000000..7a55f49 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/set_docker.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test_new" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/train.sh b/Classification/resnet50/ansible_workspace/train.sh new file mode 100644 index 0000000..350bf78 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec 
$docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/update_tools.sh b/Classification/resnet50/ansible_workspace/update_tools.sh new file mode 100755 index 0000000..cda6deb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/update_tools.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "/data/home/chende/tools" +ansible hosts -i inventory.ini -m copy -a "src=/data/home/chende/tools dest=/data/home/chende/ mode=0755" \ No newline at end of file diff --git a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh new file mode 100755 index 0000000..b4fbd36 --- /dev/null +++ b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh @@ -0,0 +1,144 @@ +rm -rf core.* + +set -ex + + +# bash examples/args_train_ddp_graph.sh ${NUM_NODES} ${DEVICE_NUM_PER_NODE} ${NODE_RANK} ${MASTER_ADDR} +# ${OFRECORD_PATH} ${TRAIN_BATCH_SIZE} ${EPOCH} ${USE_FP16} ${PYTHON_BIN} ${RUN_TYPE} ${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT} + +# bash examples/args_train_ddp_graph.sh 1 8 0 127.0.0.1 /dataset/79846248 192 50 false python3 ddp false '' 1 + +NUM_NODES=${1:-1} +DEVICE_NUM_PER_NODE=${2:-8} +NODE_RANK=${3:-0} +MASTER_ADDR=${4:-"127.0.0.1"} +OFRECORD_PATH=${5:-"/dataset/imagenet/ofrecord"} +TRAIN_BATCH_SIZE=${6:-192} +EPOCH=${7:-50} +USE_FP16=${8:-false} +PYTHON_BIN=${9:-"python3"} +RUN_TYPE=${10:-"ddp"} # graph+fp16 +DECODE_TYPE=${11:-"cpu"} +PRINT_INTERVAL=${12:-100} +DEBUG_AND_NCCL=${13:-false} +NSYS_BIN=${14:-""} +RUN_COMMIT=${15:-"master"} +ACC=${16:-1} +VAL_BATCH_SIZE=${17:-50} + + +SRC_DIR=$(realpath $(dirname $0)/..) + +AMP_OR="FP32" +if $USE_FP16; then + AMP_OR="FP16" +fi + +TRAN_MODEL="resnet50" +RUN_TIME=$(date "+%Y%m%d_%H%M%S%N") +LOG_FOLDER=${SRC_DIR}/test_logs/$HOSTNAME/${NUM_NODES}n${DEVICE_NUM_PER_NODE}g +mkdir -p $LOG_FOLDER +LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_${RUN_TYPE}_DC${DECODE_TYPE}_${AMP_OR}_mb${TRAIN_BATCH_SIZE}_gb$((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC}))_acc${ACC}_${NUM_NODES}n${DEVICE_NUM_PER_NODE}g_${RUN_COMMIT}_${RUN_TIME} + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +#export ONEFLOW_COMM_NET_IB_ENABLE=True +export NCCL_LAUNCH_MODE=GROUP +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL +if $DEBUG_AND_NCCL; then + export ONEFLOW_DEBUG_MODE=1 + echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE + export NCCL_DEBUG=INFO + echo NCCL_DEBUG=$NCCL_DEBUG +fi + +#export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 +#export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 +#export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 +#export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 +#export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1 + +#export ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC=true +#export ONEFLOW_VM_WORKLOAD_ON_SCHEDULER_THREAD=1 + +LEARNING_RATE=$(echo | awk "{print $NUM_NODES*$DEVICE_NUM_PER_NODE*$TRAIN_BATCH_SIZE*$ACC/1000}") +MOM=0.875 +OFRECORD_PART_NUM=256 + +EXIT_NUM=-1 + +if [ ${EPOCH} -lt 10 ];then + EXIT_NUM=300 +fi +CMD="" + +if [[ ! 
-z "${NSYS_BIN}" ]]; then + export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + export ONEFLOW_DEBUG_MODE=1 + # CMD+="${NSYS_BIN} profile --stats true -t nvtx --output ${LOG_FILENAME} " + export CUDNN_LOGINFO_DBG=1 + export CUDNN_LOGDEST_DBG=${SRC_DIR}/cudnn.log + CMD+="${NSYS_BIN} profile --stats true --output ${LOG_FILENAME} " + EXIT_NUM=30 +fi + + +CMD+="${PYTHON_BIN} -m oneflow.distributed.launch " + +CMD+="--nproc_per_node ${DEVICE_NUM_PER_NODE} " +CMD+="--nnodes ${NUM_NODES} " +CMD+="--node_rank ${NODE_RANK} " +CMD+="--master_addr ${MASTER_ADDR} " +CMD+="--master_port 12345 " +CMD+="${SRC_DIR}/train.py " +CMD+="--ofrecord-path ${OFRECORD_PATH} " +CMD+="--ofrecord-part-num ${OFRECORD_PART_NUM} " +CMD+="--num-devices-per-node ${DEVICE_NUM_PER_NODE} " +CMD+="--lr ${LEARNING_RATE} " +CMD+="--momentum ${MOM} " +CMD+="--num-epochs ${EPOCH} " +CMD+="--train-batch-size ${TRAIN_BATCH_SIZE} " +CMD+="--train-global-batch-size $((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--val-batch-size ${VAL_BATCH_SIZE} " +CMD+="--val-global-batch-size $((${VAL_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--print-interval ${PRINT_INTERVAL} " +#CMD+="--synthetic-data " + +if $USE_FP16; then + echo USE_FP16=$USE_FP16 + CMD+="--use-fp16 --channel-last " +fi + +if [ $EXIT_NUM != -1 ]; then + CMD+="--skip-eval " +fi +if [ $RUN_TYPE == 'ddp' ]; then + CMD+="--ddp " +else + CMD+="--scale-grad --graph " + CMD+="--fuse-bn-relu " + CMD+="--fuse-bn-add-relu " +fi + +if [ $DECODE_TYPE == 'gpu' ]; then + CMD+="--use-gpu-decode " +fi + +echo "Rum cmd ${CMD}" + +$CMD 2>&1 | tee ${LOG_FILENAME}.log + +echo "Writting log to ${LOG_FILENAME}.log" + +if [[ ! -z "${NSYS_BIN}" ]]; then + rm ${LOG_FOLDER}/*.sqlite + mkdir -p ${LOG_FILENAME} + #rm -rf ./log/$HOSTNAME/oneflow.* + cp ./log/$HOSTNAME/* ${LOG_FILENAME}/ + mv ${SRC_DIR}/cudnn.log ${LOG_FILENAME}/cudnn.log +fi + +rm -rf ./log/$HOSTNAME +echo "done" diff --git a/Classification/resnet50/tools/extract.py b/Classification/resnet50/tools/extract.py new file mode 100644 index 0000000..477903d --- /dev/null +++ b/Classification/resnet50/tools/extract.py @@ -0,0 +1,27 @@ +import sys +import re + +# 文件路径 +file_path = sys.argv[1] + +# 存储 train 模式下的 throughput +print(file_path) +train_throughputs = [] + +# 正则表达式模式匹配 train 模式下的 throughput +pattern = re.compile(r'\[train\][^|]*?throughput:\s(\d+\.\d+)') + +# 读取文件并提取需要的信息 +with open(file_path, 'r') as file: + for line in file: + matches = pattern.findall(line) + for match in matches: + throughput = float(match) + train_throughputs.append(throughput) + +# 计算平均 throughput +if train_throughputs: + average_throughput = sum(train_throughputs) / len(train_throughputs) + print(f'The average throughput for [train] mode is: {average_throughput:.6f}') +else: + print('No [train] mode throughputs found.') diff --git a/Classification/resnet50/tools/prepare_docker.sh b/Classification/resnet50/tools/prepare_docker.sh new file mode 100755 index 0000000..84147f5 --- /dev/null +++ b/Classification/resnet50/tools/prepare_docker.sh @@ -0,0 +1,14 @@ +#!/bin/bash +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +cp 
tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp tools/profile.sh models/Vision/classification/image/resnet50/ \ No newline at end of file diff --git a/Classification/resnet50/tools/profile.sh b/Classification/resnet50/tools/profile.sh new file mode 100755 index 0000000..79e0d0a --- /dev/null +++ b/Classification/resnet50/tools/profile.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi \ No newline at end of file diff --git a/Classification/resnet50/tools/train.sh b/Classification/resnet50/tools/train.sh new file mode 100755 index 0000000..5758c05 --- /dev/null +++ b/Classification/resnet50/tools/train.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi \ No newline at end of file From 4a82d2e74a7e1109962e9347240808a2fca49aea Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:12:52 +0000 Subject: [PATCH 02/26] few changes. --- Classification/resnet50/Readme.md | 2 +- Classification/resnet50/ansible_workspace/profile.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 Classification/resnet50/ansible_workspace/profile.sh diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md index c72cdd4..21be9e5 100644 --- a/Classification/resnet50/Readme.md +++ b/Classification/resnet50/Readme.md @@ -12,7 +12,7 @@ │   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 ├── tools # 在各个节点使用的文件 │ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 -│ ├── models.tar.gz # 模型,为防止git网络问题,所以先下载放在共享目录下 +│ ├── models.tar.gz # 模型,为防止git网络问题,建议先下载放在共享目录下 │ ├── extract.py # 提取log中train阶段的throughput的平均值 │ ├── prepare_docker.sh # 用于配置docker内环境 │ ├── profile.sh # 根据节点数在本机启动profile diff --git a/Classification/resnet50/ansible_workspace/profile.sh b/Classification/resnet50/ansible_workspace/profile.sh new file mode 100644 index 0000000..db6abfb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/profile.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" \ No newline at end of file From 48e0ece68ed80ea476af11604d0eb0ebeecd7588 Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:37:33 +0000 Subject: [PATCH 03/26] typo --- Classification/resnet50/Readme.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md index 21be9e5..b19a080 100644 --- a/Classification/resnet50/Readme.md +++ b/Classification/resnet50/Readme.md @@ -121,12 +121,24 @@ cp tools/profile.sh models/Vision/classification/image/resnet50/ NUM_NODES=${1:-1} if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true 
python3 graph gpu 100 false '' 1 + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 else echo do nothing fi ``` -其中[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)参考自OneAutoTest仓库 +启动dokcer内profile(./tools/profile.sh)如下: +```Bash +# 根据使用的节点数,来判断是否在本地开始profile +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + # 在启动训练时添加nsys启动路径,即可进行profile + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi +``` +args_train_ddp_graph_resnet50.sh文件参考自OneAutoTest仓库[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh),其中包含使用nsys启动的选项 ### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: ```Bash @@ -207,18 +219,6 @@ NUM_NODES="$1" docker_name="cd_test_new" ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" ``` -```Bash -# 根据使用的节点数,来判断是否在本地开始profile -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - # 在启动训练时添加nsys启动路径,即可进行profile - bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi -``` -[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)中包含使用nsys启动的选项 - 需要一个参数: 节点数, - 运行该命令能够自动启动相应数量的节点运行。 - 运行结束后收集日志和nsys相关文件到主节点。 From e9a15a2e276e40502732995881cc3d89b7f81c3d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 10:56:44 +0000 Subject: [PATCH 04/26] distribute_ssh_key --- .../resnet50/dist_ssh_key/README.md | 79 +++++++++++++++++++ .../dist_ssh_key/distribute_ssh_key.yml | 19 +++++ .../resnet50/dist_ssh_key/inventory.ini | 3 + Classification/resnet50/dist_ssh_key/vars.yml | 8 ++ 4 files changed, 109 insertions(+) create mode 100644 Classification/resnet50/dist_ssh_key/README.md create mode 100644 Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml create mode 100644 Classification/resnet50/dist_ssh_key/inventory.ini create mode 100644 Classification/resnet50/dist_ssh_key/vars.yml diff --git a/Classification/resnet50/dist_ssh_key/README.md b/Classification/resnet50/dist_ssh_key/README.md new file mode 100644 index 0000000..427e890 --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/README.md @@ -0,0 +1,79 @@ +# 使用 Ansible 将 SSH 公钥分发到多个目标主机 + +## 1. 创建变量文件并加密 + +创建一个包含密码的变量文件vars.yml: + +```yaml +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: mypassword +``` + +然后使用Ansible Vault加密这个文件: + +```bash +ansible-vault encrypt vars.yml +``` + +注意: + +1. 执行 `ansible-vault` 的过程中需要设定一个密码,请记住或保存好这个密码 +2. `vars.yml`将被替换为加密后的文件 + +## 2. 创建主机清单文件 + +创建一个主机清单文件`inventory.ini`: + +```ini +[all] +node1 ansible_host=192.168.1.27 ansible_user=myuser +node2 ansible_host=192.168.1.28 ansible_user=myuser +``` + +注:需要根据情况修改 `ansible_user` 的值 + +## 3. 
创建Playbook + +如果文件存在,这一步可以忽略。 + +创建一个Playbook distribute_ssh_key.yml: + +```yaml +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/path/to/id_rsa.pub') }}" +``` + +注:`vars_files` 配置为 `vars.yml` + +## 4. 运行Playbook + +使用以下命令运行Playbook,并解密变量文件: + +```bash +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass +``` + + diff --git a/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml new file mode 100644 index 0000000..95147c2 --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml @@ -0,0 +1,19 @@ +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/home/xiexuan/.ssh/id_rsa.pub') }}" diff --git a/Classification/resnet50/dist_ssh_key/inventory.ini b/Classification/resnet50/dist_ssh_key/inventory.ini new file mode 100644 index 0000000..894b65d --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/inventory.ini @@ -0,0 +1,3 @@ +[all] +of27 ansible_host=192.168.1.27 ansible_user=xiexuan +of28 ansible_host=192.168.1.28 ansible_user=xiexuan diff --git a/Classification/resnet50/dist_ssh_key/vars.yml b/Classification/resnet50/dist_ssh_key/vars.yml new file mode 100644 index 0000000..49c7dbf --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/vars.yml @@ -0,0 +1,8 @@ +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: mypassword From 0e15764505c6d0ffc796d3ad3651c1987168894d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:12:51 +0000 Subject: [PATCH 05/26] rename dir --- .../resnet50/{dist_ssh_key => 0_dist_ssh_key}/README.md | 0 .../{dist_ssh_key => 0_dist_ssh_key}/distribute_ssh_key.yml | 0 .../resnet50/{dist_ssh_key => 0_dist_ssh_key}/inventory.ini | 0 Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/vars.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/README.md (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/distribute_ssh_key.yml (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/inventory.ini (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/vars.yml (100%) diff --git a/Classification/resnet50/dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md similarity index 100% rename from Classification/resnet50/dist_ssh_key/README.md rename to Classification/resnet50/0_dist_ssh_key/README.md diff --git a/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml similarity index 100% rename from Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml rename to 
Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml diff --git a/Classification/resnet50/dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini similarity index 100% rename from Classification/resnet50/dist_ssh_key/inventory.ini rename to Classification/resnet50/0_dist_ssh_key/inventory.ini diff --git a/Classification/resnet50/dist_ssh_key/vars.yml b/Classification/resnet50/0_dist_ssh_key/vars.yml similarity index 100% rename from Classification/resnet50/dist_ssh_key/vars.yml rename to Classification/resnet50/0_dist_ssh_key/vars.yml From 8850c1aa906f61326617b73b80a31893e9a9f0a9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:13:40 +0000 Subject: [PATCH 06/26] mv inventory.ini to root --- Classification/resnet50/inventory.ini | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Classification/resnet50/inventory.ini diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini new file mode 100644 index 0000000..a4388b6 --- /dev/null +++ b/Classification/resnet50/inventory.ini @@ -0,0 +1,4 @@ +[hosts] +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' From e100a96ec1ab18dfd4e6b2a65cef5c65ec7ede16 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:14:41 +0000 Subject: [PATCH 07/26] update --- Classification/resnet50/ansible_workspace/inventory.ini | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 Classification/resnet50/ansible_workspace/inventory.ini diff --git a/Classification/resnet50/ansible_workspace/inventory.ini b/Classification/resnet50/ansible_workspace/inventory.ini deleted file mode 100644 index 82e0bac..0000000 --- a/Classification/resnet50/ansible_workspace/inventory.ini +++ /dev/null @@ -1,5 +0,0 @@ -[hosts] -of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' \ No newline at end of file From dcb9451cc05c684755b26d9a9b844a1aa3b8ae1b Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:18:06 +0000 Subject: [PATCH 08/26] add bash --- Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh diff --git a/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh new file mode 100755 index 0000000..7cb0f1d --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh @@ -0,0 +1 @@ +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass From c8497b6eabab91d96f87bd2427748c98d4b9729c Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:19:36 +0000 Subject: [PATCH 09/26] update --- Classification/resnet50/0_dist_ssh_key/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md index 427e890..856df74 100644 --- a/Classification/resnet50/0_dist_ssh_key/README.md +++ b/Classification/resnet50/0_dist_ssh_key/README.md @@ -75,5 +75,9 @@ node2 
ansible_host=192.168.1.28 ansible_user=myuser ```bash ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass ``` +或者运行 +```bash +./dist_ssh_key.sh +``` From b9528444a788c2a05aaeea14b411c5964171a5f0 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:18:02 +0000 Subject: [PATCH 10/26] pull or load docker image --- .../resnet50/1_get_docker_image/load.sh | 26 +++++++++++++++++ .../load_and_tag_docker_image.yml | 28 +++++++++++++++++++ .../resnet50/1_get_docker_image/pull.sh | 7 +++++ .../1_get_docker_image/pull_docker_image.yml | 17 +++++++++++ Classification/resnet50/inventory.ini | 3 +- 5 files changed, 79 insertions(+), 2 deletions(-) create mode 100755 Classification/resnet50/1_get_docker_image/load.sh create mode 100644 Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml create mode 100755 Classification/resnet50/1_get_docker_image/pull.sh create mode 100644 Classification/resnet50/1_get_docker_image/pull_docker_image.yml diff --git a/Classification/resnet50/1_get_docker_image/load.sh b/Classification/resnet50/1_get_docker_image/load.sh new file mode 100755 index 0000000..5df8bcd --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [ -n "$1" ]; then + docker_image_path=$1 +else + docker_image_path="/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" +fi + +if [ -n "$2" ]; then + docker_image_tag=$2 +else + docker_image_tag="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" +fi + +if [ -n "$3" ]; then + force_load=$3 +else + force_load=false +fi + +ansible-playbook \ + -i ../inventory.ini \ + load_and_tag_docker_image.yml \ + -e "docker_image_path=$docker_image_path" \ + -e "docker_image_tag=$docker_image_tag" \ + -e "force_load=$force_load" diff --git a/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml new file mode 100644 index 0000000..5a2f92d --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml @@ -0,0 +1,28 @@ +--- +- name: Load and tag Docker image + hosts: all + vars: + docker_image_path: "/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" + docker_image_tag: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + force_load: false + + tasks: + - name: Check if Docker image with the specified tag already exists + command: "docker images -q {{ docker_image_tag }}" + register: image_id + changed_when: false + when: not force_load + + - name: Load Docker image from tar file + command: "docker load -i {{ docker_image_path }}" + when: force_load or image_id.stdout == "" + register: load_output + + - name: Get image ID from load output + set_fact: + loaded_image_id: "{{ load_output.stdout_lines[-1] | regex_search('sha256:[0-9a-f]+') }}" + when: force_load or image_id.stdout == "" + + - name: Tag the loaded Docker image + command: "docker tag {{ loaded_image_id }} {{ docker_image_tag }}" + when: force_load or image_id.stdout == "" diff --git a/Classification/resnet50/1_get_docker_image/pull.sh b/Classification/resnet50/1_get_docker_image/pull.sh new file mode 100755 index 0000000..8787fea --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ -n "$1" ]; then + ansible-playbook -i ../inventory.ini pull_docker_image.yml -e "docker_image=$1" +else + ansible-playbook -i ../inventory.ini pull_docker_image.yml +fi diff --git a/Classification/resnet50/1_get_docker_image/pull_docker_image.yml 
b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml new file mode 100644 index 0000000..d086e6a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml @@ -0,0 +1,17 @@ +--- +- name: Pull specified Docker image + hosts: all + vars: + docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + + tasks: + - name: Check if the Docker image is already present + command: "docker images -q {{ docker_image }}" + register: docker_image_id + changed_when: false + + - name: Pull Docker image if not present + docker_image: + name: "{{ docker_image }}" + source: pull + when: docker_image_id.stdout == "" diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini index a4388b6..01027fd 100644 --- a/Classification/resnet50/inventory.ini +++ b/Classification/resnet50/inventory.ini @@ -1,4 +1,3 @@ -[hosts] -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +[all] of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' From efed68f417d1c2e7229164dfad8dcbe2935afcad Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:30:44 +0000 Subject: [PATCH 11/26] update readme --- .../resnet50/1_get_docker_image/README.md | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Classification/resnet50/1_get_docker_image/README.md diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md new file mode 100644 index 0000000..3f6282a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/README.md @@ -0,0 +1,61 @@ +# 拉取或导入镜像 + +## 拉取镜像 + +适用于直接从 dockerhub 拉取镜像。 + +用法: `./pull.sh [镜像标签]` + +参数说明: + +- 镜像标签 (可选) : 要拉取的Docker镜像标签,例如 alpine:latest。如果未提供,则使用playbook中的默认值。 + +示例: + +- 默认使用: + +```bash +./pull.sh +``` + +- 指定镜像标签: + + ```bash +./pull.sh alpine:latest + ``` + +## 导入镜像 + +适用于本地共享目录有已经保存镜像的tar文件,使用 `docker load` 导入。 + +用法: `./load.sh [镜像文件路径] [镜像标签] [强制导入]` + +参数说明: + +- 镜像文件路径 (可选) : 要导入的Docker镜像tar文件路径,默认为 `/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar` +- 镜像标签 (可选) : 导入后设置的Docker镜像标签,默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8` +- 强制导入 (可选) : 是否强制导入镜像(true 或 false),默认为 false + +示例: + +- 默认使用: + + ```bash + ./load.sh + ``` + +- 指定镜像文件路径和标签: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest +``` + +- 强制导入镜像: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest true +``` + + + + From 6ac0f4cefe6764feba617e135e332b0e0a924e6c Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:56:43 +0000 Subject: [PATCH 12/26] rm old files --- .../resnet50/ansible_workspace/profile.sh | 9 -- .../resnet50/ansible_workspace/set_docker.sh | 18 --- .../resnet50/ansible_workspace/train.sh | 9 -- .../ansible_workspace/update_tools.sh | 3 - .../tools/args_train_ddp_graph_resnet50.sh | 144 ------------------ Classification/resnet50/tools/extract.py | 27 ---- .../resnet50/tools/prepare_docker.sh | 14 -- Classification/resnet50/tools/profile.sh | 7 - Classification/resnet50/tools/train.sh | 7 - 9 files changed, 238 deletions(-) delete mode 100644 Classification/resnet50/ansible_workspace/profile.sh delete mode 100755 Classification/resnet50/ansible_workspace/set_docker.sh delete mode 100644 Classification/resnet50/ansible_workspace/train.sh delete mode 100755 Classification/resnet50/ansible_workspace/update_tools.sh delete mode 100755 
Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh delete mode 100644 Classification/resnet50/tools/extract.py delete mode 100755 Classification/resnet50/tools/prepare_docker.sh delete mode 100755 Classification/resnet50/tools/profile.sh delete mode 100755 Classification/resnet50/tools/train.sh diff --git a/Classification/resnet50/ansible_workspace/profile.sh b/Classification/resnet50/ansible_workspace/profile.sh deleted file mode 100644 index db6abfb..0000000 --- a/Classification/resnet50/ansible_workspace/profile.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 num_nodes" - exit 1 -fi -NUM_NODES="$1" -docker_name="cd_test_new" -ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/set_docker.sh b/Classification/resnet50/ansible_workspace/set_docker.sh deleted file mode 100755 index 7a55f49..0000000 --- a/Classification/resnet50/ansible_workspace/set_docker.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 filename" - exit 1 -fi -host_file="$1" -num_hosts=$(wc -l < "$host_file") -docker_name="cd_test_new" - -mapfile -t lines < "$host_file" - -for (( i=1; i<${#lines[@]}; i++ )); do - line="${lines[$i]}" - host_name=$(echo "$line" | awk '{print $1}') - ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" -done -ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/train.sh b/Classification/resnet50/ansible_workspace/train.sh deleted file mode 100644 index 350bf78..0000000 --- a/Classification/resnet50/ansible_workspace/train.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 num_nodes" - exit 1 -fi -NUM_NODES="$1" -docker_name="cd_test_new" -ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/update_tools.sh b/Classification/resnet50/ansible_workspace/update_tools.sh deleted file mode 100755 index cda6deb..0000000 --- a/Classification/resnet50/ansible_workspace/update_tools.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -echo "/data/home/chende/tools" -ansible hosts -i inventory.ini -m copy -a "src=/data/home/chende/tools dest=/data/home/chende/ mode=0755" \ No newline at end of file diff --git a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh deleted file mode 100755 index b4fbd36..0000000 --- a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh +++ /dev/null @@ -1,144 +0,0 @@ -rm -rf core.* - -set -ex - - -# bash examples/args_train_ddp_graph.sh ${NUM_NODES} ${DEVICE_NUM_PER_NODE} ${NODE_RANK} ${MASTER_ADDR} -# ${OFRECORD_PATH} ${TRAIN_BATCH_SIZE} ${EPOCH} ${USE_FP16} ${PYTHON_BIN} ${RUN_TYPE} 
${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT} - -# bash examples/args_train_ddp_graph.sh 1 8 0 127.0.0.1 /dataset/79846248 192 50 false python3 ddp false '' 1 - -NUM_NODES=${1:-1} -DEVICE_NUM_PER_NODE=${2:-8} -NODE_RANK=${3:-0} -MASTER_ADDR=${4:-"127.0.0.1"} -OFRECORD_PATH=${5:-"/dataset/imagenet/ofrecord"} -TRAIN_BATCH_SIZE=${6:-192} -EPOCH=${7:-50} -USE_FP16=${8:-false} -PYTHON_BIN=${9:-"python3"} -RUN_TYPE=${10:-"ddp"} # graph+fp16 -DECODE_TYPE=${11:-"cpu"} -PRINT_INTERVAL=${12:-100} -DEBUG_AND_NCCL=${13:-false} -NSYS_BIN=${14:-""} -RUN_COMMIT=${15:-"master"} -ACC=${16:-1} -VAL_BATCH_SIZE=${17:-50} - - -SRC_DIR=$(realpath $(dirname $0)/..) - -AMP_OR="FP32" -if $USE_FP16; then - AMP_OR="FP16" -fi - -TRAN_MODEL="resnet50" -RUN_TIME=$(date "+%Y%m%d_%H%M%S%N") -LOG_FOLDER=${SRC_DIR}/test_logs/$HOSTNAME/${NUM_NODES}n${DEVICE_NUM_PER_NODE}g -mkdir -p $LOG_FOLDER -LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_${RUN_TYPE}_DC${DECODE_TYPE}_${AMP_OR}_mb${TRAIN_BATCH_SIZE}_gb$((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC}))_acc${ACC}_${NUM_NODES}n${DEVICE_NUM_PER_NODE}g_${RUN_COMMIT}_${RUN_TIME} - - -export PYTHONUNBUFFERED=1 -echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED -#export ONEFLOW_COMM_NET_IB_ENABLE=True -export NCCL_LAUNCH_MODE=GROUP -echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE -echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL -if $DEBUG_AND_NCCL; then - export ONEFLOW_DEBUG_MODE=1 - echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE - export NCCL_DEBUG=INFO - echo NCCL_DEBUG=$NCCL_DEBUG -fi - -#export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 -#export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 -#export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 -#export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 -#export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1 - -#export ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC=true -#export ONEFLOW_VM_WORKLOAD_ON_SCHEDULER_THREAD=1 - -LEARNING_RATE=$(echo | awk "{print $NUM_NODES*$DEVICE_NUM_PER_NODE*$TRAIN_BATCH_SIZE*$ACC/1000}") -MOM=0.875 -OFRECORD_PART_NUM=256 - -EXIT_NUM=-1 - -if [ ${EPOCH} -lt 10 ];then - EXIT_NUM=300 -fi -CMD="" - -if [[ ! 
-z "${NSYS_BIN}" ]]; then - export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 - export ONEFLOW_DEBUG_MODE=1 - # CMD+="${NSYS_BIN} profile --stats true -t nvtx --output ${LOG_FILENAME} " - export CUDNN_LOGINFO_DBG=1 - export CUDNN_LOGDEST_DBG=${SRC_DIR}/cudnn.log - CMD+="${NSYS_BIN} profile --stats true --output ${LOG_FILENAME} " - EXIT_NUM=30 -fi - - -CMD+="${PYTHON_BIN} -m oneflow.distributed.launch " - -CMD+="--nproc_per_node ${DEVICE_NUM_PER_NODE} " -CMD+="--nnodes ${NUM_NODES} " -CMD+="--node_rank ${NODE_RANK} " -CMD+="--master_addr ${MASTER_ADDR} " -CMD+="--master_port 12345 " -CMD+="${SRC_DIR}/train.py " -CMD+="--ofrecord-path ${OFRECORD_PATH} " -CMD+="--ofrecord-part-num ${OFRECORD_PART_NUM} " -CMD+="--num-devices-per-node ${DEVICE_NUM_PER_NODE} " -CMD+="--lr ${LEARNING_RATE} " -CMD+="--momentum ${MOM} " -CMD+="--num-epochs ${EPOCH} " -CMD+="--train-batch-size ${TRAIN_BATCH_SIZE} " -CMD+="--train-global-batch-size $((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " -CMD+="--val-batch-size ${VAL_BATCH_SIZE} " -CMD+="--val-global-batch-size $((${VAL_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " -CMD+="--print-interval ${PRINT_INTERVAL} " -#CMD+="--synthetic-data " - -if $USE_FP16; then - echo USE_FP16=$USE_FP16 - CMD+="--use-fp16 --channel-last " -fi - -if [ $EXIT_NUM != -1 ]; then - CMD+="--skip-eval " -fi -if [ $RUN_TYPE == 'ddp' ]; then - CMD+="--ddp " -else - CMD+="--scale-grad --graph " - CMD+="--fuse-bn-relu " - CMD+="--fuse-bn-add-relu " -fi - -if [ $DECODE_TYPE == 'gpu' ]; then - CMD+="--use-gpu-decode " -fi - -echo "Rum cmd ${CMD}" - -$CMD 2>&1 | tee ${LOG_FILENAME}.log - -echo "Writting log to ${LOG_FILENAME}.log" - -if [[ ! -z "${NSYS_BIN}" ]]; then - rm ${LOG_FOLDER}/*.sqlite - mkdir -p ${LOG_FILENAME} - #rm -rf ./log/$HOSTNAME/oneflow.* - cp ./log/$HOSTNAME/* ${LOG_FILENAME}/ - mv ${SRC_DIR}/cudnn.log ${LOG_FILENAME}/cudnn.log -fi - -rm -rf ./log/$HOSTNAME -echo "done" diff --git a/Classification/resnet50/tools/extract.py b/Classification/resnet50/tools/extract.py deleted file mode 100644 index 477903d..0000000 --- a/Classification/resnet50/tools/extract.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -import re - -# 文件路径 -file_path = sys.argv[1] - -# 存储 train 模式下的 throughput -print(file_path) -train_throughputs = [] - -# 正则表达式模式匹配 train 模式下的 throughput -pattern = re.compile(r'\[train\][^|]*?throughput:\s(\d+\.\d+)') - -# 读取文件并提取需要的信息 -with open(file_path, 'r') as file: - for line in file: - matches = pattern.findall(line) - for match in matches: - throughput = float(match) - train_throughputs.append(throughput) - -# 计算平均 throughput -if train_throughputs: - average_throughput = sum(train_throughputs) / len(train_throughputs) - print(f'The average throughput for [train] mode is: {average_throughput:.6f}') -else: - print('No [train] mode throughputs found.') diff --git a/Classification/resnet50/tools/prepare_docker.sh b/Classification/resnet50/tools/prepare_docker.sh deleted file mode 100755 index 84147f5..0000000 --- a/Classification/resnet50/tools/prepare_docker.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -python3 -m pip install --upgrade pip -python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 - -cd /workspace -cp tools/models.tar.gz ./ -tar -xvf models.tar.gz -pip install -r models/dev-requirements.txt -pip install -r models/Vision/classification/image/resnet50/requirements.txt - -cp 
tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ -cp tools/train.sh models/Vision/classification/image/resnet50/ -cp tools/profile.sh models/Vision/classification/image/resnet50/ \ No newline at end of file diff --git a/Classification/resnet50/tools/profile.sh b/Classification/resnet50/tools/profile.sh deleted file mode 100755 index 79e0d0a..0000000 --- a/Classification/resnet50/tools/profile.sh +++ /dev/null @@ -1,7 +0,0 @@ -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi \ No newline at end of file diff --git a/Classification/resnet50/tools/train.sh b/Classification/resnet50/tools/train.sh deleted file mode 100755 index 5758c05..0000000 --- a/Classification/resnet50/tools/train.sh +++ /dev/null @@ -1,7 +0,0 @@ -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '' 1 -else - echo do nothing -fi \ No newline at end of file From fce3e8d78e3e581cfd292cddc73d4cc6daf79681 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 03:17:16 +0000 Subject: [PATCH 13/26] update --- Classification/resnet50/0_dist_ssh_key/inventory.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Classification/resnet50/0_dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini index 894b65d..3812599 100644 --- a/Classification/resnet50/0_dist_ssh_key/inventory.ini +++ b/Classification/resnet50/0_dist_ssh_key/inventory.ini @@ -1,3 +1,3 @@ [all] -of27 ansible_host=192.168.1.27 ansible_user=xiexuan -of28 ansible_host=192.168.1.28 ansible_user=xiexuan +of27 ansible_host=192.168.1.27 ansible_user=myuser +of28 ansible_host=192.168.1.28 ansible_user=myuser From 4284d9d463b1af91530ea60d151ec6dc7181862d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 06:06:33 +0000 Subject: [PATCH 14/26] update readme --- Classification/resnet50/README.md | 25 ++++ Classification/resnet50/Readme.md | 225 ------------------------------ 2 files changed, 25 insertions(+), 225 deletions(-) create mode 100644 Classification/resnet50/README.md delete mode 100644 Classification/resnet50/Readme.md diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md new file mode 100644 index 0000000..3be3866 --- /dev/null +++ b/Classification/resnet50/README.md @@ -0,0 +1,25 @@ +# 使用Ansible在多节点环境分布式训练 + +文件目录 + +``` +. 
+├── 0_dist_ssh_key # 分发 SSH 公钥到各个节点 +│   ├── distribute_ssh_key.yml # ansible playbook +│   ├── dist_ssh_key.sh # 执行脚本 +│   ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置 +│   ├── README.md # 说明文件 +│   └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用 +├── 1_get_docker_image # 各个节点获取 docker 镜像 +│   ├── load_and_tag_docker_image.yml # 导入镜像 ansible playbook +│   ├── load.sh # 导入镜像执行脚本 +│   ├── pull_docker_image.yml # 拉取镜像 ansible playbook +│   ├── pull.sh # 拉取镜像执行脚本 +│   └── README.md # 说明文件 +├── inventory.ini # 主机清单文件,需要根据实际情况配置 +└── README.md # 说明文件 +``` + + + + diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md deleted file mode 100644 index b19a080..0000000 --- a/Classification/resnet50/Readme.md +++ /dev/null @@ -1,225 +0,0 @@ -# 千卡 0.85 - -[toc] - -## 文件目录结构 -``` -├── ansible_workspace # 主节点上的工作目录 -│   ├── inventory.ini # 用来配置节点信息 -│   ├── set_docker.sh # 在各节点上创建docker,并且配置好docker内环境 -│   ├── profile.sh # 根据节点数启动profile -│   ├── train.sh # 根据节点数启动训练 -│   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 -├── tools # 在各个节点使用的文件 -│ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 -│ ├── models.tar.gz # 模型,为防止git网络问题,建议先下载放在共享目录下 -│ ├── extract.py # 提取log中train阶段的throughput的平均值 -│ ├── prepare_docker.sh # 用于配置docker内环境 -│ ├── profile.sh # 根据节点数在本机启动profile -│ └── train.sh # 根据节点数在本机启动训练 -└── Readme.md -``` - -需求:有NVLink,以及 shared_nfs - -以下供参考 - -## 第一步: 配置环境 - -### 1.1 所有节点配置SSH Key,并设置authorized_keys - -(怎么自动化) - -需要一个共享的存储空间,如:`/shared_nfs/k85`,在一个文件夹下准备好 - -- authorized_keys : 在主节点运行 - - ```bash - #!/bin/bash - - # 设置 SSH 目录路径 - SSH_DIR="$HOME/.ssh" - - # 检查 SSH 目录是否存在,如果不存在则创建 - if [ ! -d "$SSH_DIR" ]; then - mkdir -p "$SSH_DIR" - echo "Created directory: $SSH_DIR" - fi - - # 设置密钥文件路径 - KEY_PATH="$SSH_DIR/id_rsa" - - # 生成 SSH 密钥对 - ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q - - # 创建 authorized_keys 文件 - cat $SSH_DIR/id_rsa.pub > $SSH_DIR/authorized_keys - - # 将 authorized_keys 文件拷贝到共享目录 - cp $SSH_DIR/authorized_keys shared_nfs/k85 - ``` - -- 在子节点运行 - - ```bash - #!/bin/bash - - # 设置 SSH 目录路径 - SSH_DIR="$HOME/.ssh" - - # 检查 SSH 目录是否存在,如果不存在则创建 - if [ ! 
-d "$SSH_DIR" ]; then - mkdir -p "$SSH_DIR" - echo "Created directory: $SSH_DIR" - fi - - # 设置密钥文件路径 - KEY_PATH="$SSH_DIR/id_rsa" - - # 生成 SSH 密钥对 - ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q - - # 将 authorized_keys 文件拷贝到 .ssh 目录 - cp shared_nfs/k85/authorized_keys $SSH_DIR - ``` - -### 1.2 主节点安装 Ansible,并配置节点ip -示例文件:./ansible_workspace/inventory.ini -```ini -[hosts] -of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -``` - -### 1.3 共享目录中拷贝镜像、数据集、models脚本 -主要为设置docker内环境的脚本 和 启动docker内训练的脚本 -设置docker内环境脚本(./tools/prepare_docker.sh)如下: -```Bash -#!/bin/bash -# 将tools视为共享目录 -pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -python3 -m pip install --upgrade pip -python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 - - -cd /workspace -cp tools/models.tar.gz ./ -tar -xvf models.tar.gz -pip install -r models/dev-requirements.txt -pip install -r models/Vision/classification/image/resnet50/requirements.txt - -# 将需要使用到的脚本拷到对应文件夹下 -cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ -cp tools/train.sh models/Vision/classification/image/resnet50/ -cp tools/profile.sh models/Vision/classification/image/resnet50/ -``` -启动dokcer内训练的脚本(./tools/train.sh)如下: -```Bash -# 根据使用的节点数,来判断本机是否开始训练 -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 -else - echo do nothing -fi -``` -启动dokcer内profile(./tools/profile.sh)如下: -```Bash -# 根据使用的节点数,来判断是否在本地开始profile -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - # 在启动训练时添加nsys启动路径,即可进行profile - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi -``` -args_train_ddp_graph_resnet50.sh文件参考自OneAutoTest仓库[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh),其中包含使用nsys启动的选项 -### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 -根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: -```Bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 filename" - exit 1 -fi -host_file="$1" -num_hosts=$(wc -l < "$host_file") -docker_name="cd_test" - -mapfile -t lines < "$host_file" - -for (( i=1; i<${#lines[@]}; i++ )); do - line="${lines[$i]}" - host_name=$(echo "$line" | awk '{print $1}') - # 根据inventory.ini文件中节点顺序,将NODE_RANK写入docker的环境变量中 - ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" -done -# 在docker内运行环境设置的脚本 -ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" -``` -使用方式: -```Bash -bash set_docker.sh 
inventory.ini
-```
-
-## 第二步:进行测试
-
-### 2.1 自动测试与日志搜集
-
-编写一个测试命令脚本文件(./ansible_workspace/train.sh)
-```Bash
-#!/bin/bash
-set -ex
-if [ $# -ne 1 ]; then
-    echo "Usage: $0 num_nodes"
-    exit 1
-fi
-NUM_NODES="$1"
-docker_name="cd_test_new"
-ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'"
-```
-
-- 需要一个参数: 节点数,
-- 运行该命令能够自动启动相应数量的节点运行。
-- 运行结束后收集日志到主节点。
-- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义
-
-### 2.2 自动日志解析
-
-可以使用2.1节提供的命令运行多次,比如:
-
-```bash
-train.sh 1
-train.sh 2
-train.sh 4
-train.sh 8
-train.sh 16
-```
-
-完成后应该保存了多个日志目录,需要编写一个日志处理脚本,从这些日志目录中提取性能数据并制成 markdown 格式的表格
-
-注:不需要完整训练,训练稳定后获取到数据就可以了。
-
-### 2.3 自动 nsys 性能测试
-
-需要编写一个能够运行 nsys 的性能测试脚本文件(./ansible_workspace/profile.sh),和2.1的脚本类似,只是启动时需要调用nsys,我们需要搜集这些信息分析,然后进行优化。这个脚本文件。
-```Bash
-#!/bin/bash
-set -ex
-if [ $# -ne 1 ]; then
-    echo "Usage: $0 num_nodes"
-    exit 1
-fi
-NUM_NODES="$1"
-docker_name="cd_test_new"
-ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'"
-```
-- 需要一个参数: 节点数,
-- 运行该命令能够自动启动相应数量的节点运行。
-- 运行结束后收集日志和nsys相关文件到主节点。
-- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义
\ No newline at end of file

From 3f7a512cc009bf1703c7c1f71cb3d90dec2e95b7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 00:54:33 +0000
Subject: [PATCH 15/26] support distributed training

---
 .../resnet50/2_distributed_training/d.sh      |  1 +
 .../2_distributed_training/dist_training.yml  | 51 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100755 Classification/resnet50/2_distributed_training/d.sh
 create mode 100644 Classification/resnet50/2_distributed_training/dist_training.yml

diff --git a/Classification/resnet50/2_distributed_training/d.sh b/Classification/resnet50/2_distributed_training/d.sh
new file mode 100755
index 0000000..db2ac66
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/d.sh
@@ -0,0 +1 @@
+ansible-playbook -i ../inventory.ini dist_training.yml
diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
new file mode 100644
index 0000000..2e214bd
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -0,0 +1,51 @@
+---
+- name: Distributed Training Setup
+  hosts: all
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['all'] | length }}"
+    master_addr: "{{ hostvars[groups['all'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['all'].index(inventory_hostname) }}"
+
+    - name: Run dist_train.sh in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 1000 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+

From 8911664fcc424eddb6fcc0368f1748afdf2b4f14 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 01:05:49 +0000
Subject: [PATCH 16/26] update

---
 .../resnet50/2_distributed_training/README.md | 40 +++++++++++++++++++
 .../resnet50/2_distributed_training/d.sh      |  1 -
 .../run_dist_training.sh                      | 15 +++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 Classification/resnet50/2_distributed_training/README.md
 delete mode 100755 Classification/resnet50/2_distributed_training/d.sh
 create mode 100755 Classification/resnet50/2_distributed_training/run_dist_training.sh

diff --git a/Classification/resnet50/2_distributed_training/README.md b/Classification/resnet50/2_distributed_training/README.md
new file mode 100644
index 0000000..0f6ddc8
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/README.md
@@ -0,0 +1,40 @@
+# run_dist_training.sh 使用说明
+
+`run_dist_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./run_dist_training.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./run_dist_training.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./run_dist_training.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./run_dist_training.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+```
+
diff --git a/Classification/resnet50/2_distributed_training/d.sh b/Classification/resnet50/2_distributed_training/d.sh
deleted file mode 100755
index db2ac66..0000000
--- a/Classification/resnet50/2_distributed_training/d.sh
+++ /dev/null
@@ -1 +0,0 @@
-ansible-playbook -i ../inventory.ini dist_training.yml
diff --git a/Classification/resnet50/2_distributed_training/run_dist_training.sh b/Classification/resnet50/2_distributed_training/run_dist_training.sh
new file mode 100755
index 0000000..3747a5f
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/run_dist_training.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini dist_training.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"

From cc6baa8b6df3076f0a143829bff7cc8b676a0b80 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 02:36:34 +0000
Subject: [PATCH 17/26] update README

---
 Classification/resnet50/2_distributed_training/README.md | 1 -
 Classification/resnet50/README.md                        | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Classification/resnet50/2_distributed_training/README.md b/Classification/resnet50/2_distributed_training/README.md
index 0f6ddc8..2ee78d3 100644
--- a/Classification/resnet50/2_distributed_training/README.md
+++ b/Classification/resnet50/2_distributed_training/README.md
@@ -36,5 +36,4 @@
 ## 注意
 
 如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
-```
 
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 3be3866..6cdb99e 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -16,6 +16,10 @@
 │   ├── pull_docker_image.yml # 拉取镜像 ansible playbook
 │   ├── pull.sh # 拉取镜像执行脚本
 │   └── README.md # 说明文件
+├── 2_distributed_training # 分布式训练
+│   ├── dist_training.yml # 用于分布式训练的 ansible playbook
+│   └── README.md # 说明文件
+│   └── run_dist_training.sh # 分布式训练执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
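The two patches above add `dist_training.yml` and its `run_dist_training.sh` wrapper. Before launching a full multi-node run, it can be worth a quick pre-flight check that Ansible reaches every host in `inventory.ini` and that the playbook parses cleanly. A minimal sketch, not part of the patch series itself; it assumes Ansible is installed on the control node and is run from `2_distributed_training/`, like `run_dist_training.sh`:

```bash
# Hypothetical pre-flight checks before running the playbook for real.
ansible all -i ../inventory.ini -m ping                                 # every node should answer "pong"
ansible-playbook -i ../inventory.ini dist_training.yml --syntax-check   # parse the playbook without running it
ansible-playbook -i ../inventory.ini dist_training.yml --list-hosts     # show which hosts would be targeted
```

If the ping fails for a node, that usually points back at the SSH key distribution step in `0_dist_ssh_key` rather than at the training command itself.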
From 60cde364a5bdec57134a18523ac0ef677b53c4f7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 02:46:08 +0000
Subject: [PATCH 18/26] fix

---
 Classification/resnet50/1_get_docker_image/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md
index 3f6282a..99709fd 100644
--- a/Classification/resnet50/1_get_docker_image/README.md
+++ b/Classification/resnet50/1_get_docker_image/README.md
@@ -40,9 +40,9 @@
 
 - 默认使用:
 
-  ```bash
-  ./load.sh
-  ```
+```bash
+./load.sh
+```
 
 - 指定镜像文件路径和标签:
 

From 0e4498f4b2131dfef4404f5b635c62016610abb8 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:06:02 +0000
Subject: [PATCH 19/26] one node training

---
 .../2_distributed_training/dist_training.yml  |  2 +-
 .../resnet50/3_1node_training/README.md       | 39 ++++++++++++++
 .../3_1node_training/one_node_training.yml    | 51 +++++++++++++++++++
 .../3_1node_training/run_one_node_training.sh | 15 ++++++
 Classification/resnet50/inventory.ini         |  2 +
 5 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 Classification/resnet50/3_1node_training/README.md
 create mode 100644 Classification/resnet50/3_1node_training/one_node_training.yml
 create mode 100755 Classification/resnet50/3_1node_training/run_one_node_training.sh

diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
index 2e214bd..cf20099 100644
--- a/Classification/resnet50/2_distributed_training/dist_training.yml
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -13,7 +13,7 @@
       set_fact:
         node_rank: "{{ groups['all'].index(inventory_hostname) }}"
 
-    - name: Run dist_train.sh in Docker container
+    - name: distributed training in Docker container
       command: >
         docker run --rm --gpus all
         --runtime=nvidia --privileged
diff --git a/Classification/resnet50/3_1node_training/README.md b/Classification/resnet50/3_1node_training/README.md
new file mode 100644
index 0000000..02203a4
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/README.md
@@ -0,0 +1,39 @@
+# run_one_node_training.sh 使用说明
+
+`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./run_one_node_training.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./run_one_node_training.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./run_one_node_training.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./run_one_node_training.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+
diff --git a/Classification/resnet50/3_1node_training/one_node_training.yml b/Classification/resnet50/3_1node_training/one_node_training.yml
new file mode 100644
index 0000000..9563d03
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/one_node_training.yml
@@ -0,0 +1,51 @@
+---
+- name: Distributed Training Setup
+  hosts: one_node
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['one_node'] | length }}"
+    master_addr: "{{ hostvars[groups['one_node'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['one_node'].index(inventory_hostname) }}"
+
+    - name: Training on one node in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 1000 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+
diff --git a/Classification/resnet50/3_1node_training/run_one_node_training.sh b/Classification/resnet50/3_1node_training/run_one_node_training.sh
new file mode 100755
index 0000000..ab5ebee
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/run_one_node_training.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini one_node_training.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"
diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini
index 01027fd..c51a8a2 100644
--- a/Classification/resnet50/inventory.ini
+++ b/Classification/resnet50/inventory.ini
@@ -1,3 +1,5 @@
+[one_node]
+of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no'
 [all]
 of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no'
 of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no'

From ecdbbbb449e36cda53ad0a005478e5e03a68835a Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:31:33 +0000
Subject: [PATCH 20/26] update

---
 Classification/resnet50/3_1node_training/README.md     | 2 +-
 .../resnet50/3_1node_training/one_node_training.yml    | 2 +-
 Classification/resnet50/README.md                      | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Classification/resnet50/3_1node_training/README.md b/Classification/resnet50/3_1node_training/README.md
index 02203a4..fb6a81f 100644
--- a/Classification/resnet50/3_1node_training/README.md
+++ b/Classification/resnet50/3_1node_training/README.md
@@ -1,6 +1,6 @@
 # run_one_node_training.sh 使用说明
 
-`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动单节点上的训练。此脚本支持通过参数指定 Docker 镜像和源目录。
 
 ## 用法
 
diff --git a/Classification/resnet50/3_1node_training/one_node_training.yml b/Classification/resnet50/3_1node_training/one_node_training.yml
index 9563d03..6709ac7 100644
--- a/Classification/resnet50/3_1node_training/one_node_training.yml
+++ b/Classification/resnet50/3_1node_training/one_node_training.yml
@@ -1,5 +1,5 @@
 ---
-- name: Distributed Training Setup
+- name: One Node Training Setup
   hosts: one_node
   vars:
     device_num_per_node: 8
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 6cdb99e..86aa36a 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -20,6 +20,10 @@
 │   ├── dist_training.yml # 用于分布式训练的 ansible playbook
 │   └── README.md # 说明文件
 │   └── run_dist_training.sh # 分布式训练执行脚本
+├── 3_1node_training # 在一个节点上训练,用于获得基准
+│   ├── one_node_training.yml # 单节点训练playbook
+│   ├── README.md # 说明文件
+│   └── run_one_node_training.sh # 单节点训练执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```

From b77913ebae50fc778e5406aecfff757c44d5678e Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:34:10 +0000
Subject: [PATCH 21/26] profiling

---
 Classification/resnet50/4_profiling/README.md | 39 ++++++++++++++
 .../resnet50/4_profiling/profiling.yml        | 52 +++++++++++++++++++
 .../resnet50/4_profiling/run_profiling.sh     | 15 ++++++
 Classification/resnet50/README.md             |  4 ++
 4 files changed, 110 insertions(+)
 create mode 100644 Classification/resnet50/4_profiling/README.md
 create mode 100644 Classification/resnet50/4_profiling/profiling.yml
 create mode 100755 Classification/resnet50/4_profiling/run_profiling.sh

diff --git a/Classification/resnet50/4_profiling/README.md b/Classification/resnet50/4_profiling/README.md
new file mode 100644
index 0000000..90b314e
--- /dev/null
+++ b/Classification/resnet50/4_profiling/README.md
@@ -0,0 +1,39 @@
+# profiling.sh 使用说明
+
+`profiling.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练同时采集性能相关信息。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./profiling.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./profiling.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./profiling.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./profiling.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+
diff --git a/Classification/resnet50/4_profiling/profiling.yml b/Classification/resnet50/4_profiling/profiling.yml
new file mode 100644
index 0000000..5969fef
--- /dev/null
+++ b/Classification/resnet50/4_profiling/profiling.yml
@@ -0,0 +1,52 @@
+---
+- name: Distributed Profiling Setup
+  hosts: all
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['all'] | length }}"
+    master_addr: "{{ hostvars[groups['all'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['all'].index(inventory_hostname) }}"
+
+    - name: distributed training in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        nsys profile --stats=true \
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 100 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+
diff --git a/Classification/resnet50/4_profiling/run_profiling.sh b/Classification/resnet50/4_profiling/run_profiling.sh
new file mode 100755
index 0000000..d51843f
--- /dev/null
+++ b/Classification/resnet50/4_profiling/run_profiling.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini profiling.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 86aa36a..acc3769 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -24,6 +24,10 @@
 │   ├── one_node_training.yml # 单节点训练playbook
 │   ├── README.md # 说明文件
 │   └── run_one_node_training.sh # 单节点训练执行脚本
+├── 4_profiling # 使用nsys采集性能相关信息
+│   ├── profiling.yml # profiling ansible playbook
+│   ├── README.md # 说明文件
+│   └── run_profiling.sh # 采集信息执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
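`profiling.yml` above wraps the same launch command in `nsys profile --stats=true`, so each node prints a summary table and also writes a report file into the container's working directory, which is the NFS-mounted `src` path. A sketch of how the reports might be inspected afterwards from the master node (assumptions: `nsys` is installed where this runs, and the default `report*.nsys-rep` naming is in effect; adjust the path and file names to whatever the run actually produced):

```bash
# Hypothetical post-processing of the nsys reports left on the shared directory.
cd /share_nfs/k85/models/Vision/classification/image/resnet50
ls -lh report*.nsys-rep        # one report per profiled launch
nsys stats report1.nsys-rep    # re-generate the summary tables for a single report
```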
From 2257c7a4bd2fb3cd117170ab4331cc77b5849b85 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:42:31 +0000
Subject: [PATCH 22/26] update

---
 Classification/resnet50/README.md | 115 ++++++++++++++++++++++------
 1 file changed, 93 insertions(+), 22 deletions(-)

diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index acc3769..33b62e8 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -1,37 +1,108 @@
 # 使用Ansible在多节点环境分布式训练
 
-文件目录
+## 目录结构
 
 ```
 .
 ├── 0_dist_ssh_key # 分发 SSH 公钥到各个节点
-│   ├── distribute_ssh_key.yml # ansible playbook
-│   ├── dist_ssh_key.sh # 执行脚本
-│   ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置
-│   ├── README.md # 说明文件
-│   └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用
-├── 1_get_docker_image # 各个节点获取 docker 镜像
-│   ├── load_and_tag_docker_image.yml # 导入镜像 ansible playbook
-│   ├── load.sh # 导入镜像执行脚本
-│   ├── pull_docker_image.yml # 拉取镜像 ansible playbook
-│   ├── pull.sh # 拉取镜像执行脚本
-│   └── README.md # 说明文件
+│ ├── distribute_ssh_key.yml # Ansible playbook
+│ ├── dist_ssh_key.sh # 执行脚本
+│ ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置
+│ ├── README.md # 说明文件
+│ └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用
+├── 1_get_docker_image # 各个节点获取 Docker 镜像
+│ ├── load_and_tag_docker_image.yml # 导入镜像 Ansible playbook
+│ ├── load.sh # 导入镜像执行脚本
+│ ├── pull_docker_image.yml # 拉取镜像 Ansible playbook
+│ ├── pull.sh # 拉取镜像执行脚本
+│ └── README.md # 说明文件
 ├── 2_distributed_training # 分布式训练
-│   ├── dist_training.yml # 用于分布式训练的 ansible playbook
-│   └── README.md # 说明文件
-│   └── run_dist_training.sh # 分布式训练执行脚本
+│ ├── dist_training.yml # 用于分布式训练的 Ansible playbook
+│ ├── run_dist_training.sh # 分布式训练执行脚本
+│ └── README.md # 说明文件
 ├── 3_1node_training # 在一个节点上训练,用于获得基准
-│   ├── one_node_training.yml # 单节点训练playbook
-│   ├── README.md # 说明文件
-│   └── run_one_node_training.sh # 单节点训练执行脚本
-├── 4_profiling # 使用nsys采集性能相关信息
-│   ├── profiling.yml # profiling ansible playbook
-│   ├── README.md # 说明文件
-│   └── run_profiling.sh # 采集信息执行脚本
+│ ├── one_node_training.yml # 单节点训练 Ansible playbook
+│ ├── run_one_node_training.sh # 单节点训练执行脚本
+│ └── README.md # 说明文件
+├── 4_profiling # 使用 nsys 采集性能相关信息
+│ ├── profiling.yml # Profiling Ansible playbook
+│ ├── run_profiling.sh # 采集信息执行脚本
+│ └── README.md # 说明文件
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
+
+## 分步说明
+
+### 0_dist_ssh_key
+
+该目录用于分发 SSH 公钥到各个节点。
+
+- `distribute_ssh_key.yml`: Ansible playbook,用于分发公钥。
+- `dist_ssh_key.sh`: 执行分发公钥的脚本。
+- `inventory.ini`: 主机清单文件,需要根据实际情况配置。
+- `vars.yml`: 初始未加密的用户密码文件,需经过配置并加密后使用。
+
+### 1_get_docker_image
+
+该目录用于在各个节点上获取 Docker 镜像。
+
+- `load_and_tag_docker_image.yml`: Ansible playbook,用于导入 Docker 镜像并设置标签。
+- `load.sh`: 导入镜像执行脚本。
+- `pull_docker_image.yml`: Ansible playbook,用于拉取 Docker 镜像。
+- `pull.sh`: 拉取镜像执行脚本。
+
+### 2_distributed_training
+
+该目录用于执行分布式训练。
+
+- `dist_training.yml`: 用于分布式训练的 Ansible playbook。
+- `run_dist_training.sh`: 分布式训练执行脚本。
+
+### 3_1node_training
+
+该目录用于在一个节点上训练,以获得基准。
+
+- `one_node_training.yml`: 单节点训练 Ansible playbook。
+- `run_one_node_training.sh`: 单节点训练执行脚本。
+
+### 4_profiling
+
+该目录用于使用 nsys 采集性能相关信息。
+
+- `profiling.yml`: Profiling Ansible playbook。
+- `run_profiling.sh`: 采集信息执行脚本。
+
+## 使用方法
+
+1. **分发 SSH 公钥**:
+```sh
+cd 0_dist_ssh_key
+./dist_ssh_key.sh
+```
+
+2. **获取 Docker 镜像**:
+```sh
+cd 1_get_docker_image
+./pull.sh  # 或者 ./load.sh
+```
+
+3. **执行分布式训练**:
+```sh
+cd 2_distributed_training
+./run_dist_training.sh [docker_image] [src]
+```
+
+4. **在一个节点上训练**:
+```sh
+cd 3_1node_training
+./run_one_node_training.sh
+```
+
+5. **采集性能相关信息**:
+```sh
+cd 4_profiling
+./run_profiling.sh
+```
+
+注意:在运行这些脚本之前,请确保已经正确配置了 `inventory.ini` 文件中的主机信息。

From b82da5b6db15c04070ee5ab7d99fed64150389d7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Mon, 5 Aug 2024 04:07:28 +0000
Subject: [PATCH 23/26] fix

---
 Classification/resnet50/0_dist_ssh_key/README.md       | 5 +++++
 .../resnet50/0_dist_ssh_key/distribute_ssh_key.yml     | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md
index 856df74..54797c8 100644
--- a/Classification/resnet50/0_dist_ssh_key/README.md
+++ b/Classification/resnet50/0_dist_ssh_key/README.md
@@ -1,4 +1,9 @@
 # 使用 Ansible 将 SSH 公钥分发到多个目标主机
+## 0. 安装Ansible
+
+```bash
+pip install ansible-vault
+```
 
 ## 1. 创建变量文件并加密
 
diff --git a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
index 95147c2..685bcee 100644
--- a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
+++ b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
@@ -16,4 +16,4 @@
     authorized_key:
       user: "{{ ansible_user }}"
       state: present
-      key: "{{ lookup('file', '/home/xiexuan/.ssh/id_rsa.pub') }}"
+      key: "{{ lookup('file', '/home/用户名/.ssh/id_rsa.pub') }}"

From d25bb4a57351dffe3ea30a8a99e28891643921dc Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Mon, 5 Aug 2024 04:09:39 +0000
Subject: [PATCH 24/26] update

---
 Classification/resnet50/1_get_docker_image/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md
index 99709fd..21e1813 100644
--- a/Classification/resnet50/1_get_docker_image/README.md
+++ b/Classification/resnet50/1_get_docker_image/README.md
@@ -1,5 +1,7 @@
 # 拉取或导入镜像
 
+注: 用户需要有各台机器的docker权限
+
 ## 拉取镜像
 
 适用于直接从 dockerhub 拉取镜像。

From 5aa4ac12c4accd1f323cabbef859cd704e3f3f4a Mon Sep 17 00:00:00 2001
From: XIE Xuan
Date: Mon, 5 Aug 2024 15:21:47 +0800
Subject: [PATCH 25/26] Update README.md

---
 Classification/resnet50/0_dist_ssh_key/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md
index 54797c8..a7e7be7 100644
--- a/Classification/resnet50/0_dist_ssh_key/README.md
+++ b/Classification/resnet50/0_dist_ssh_key/README.md
@@ -1,4 +1,6 @@
 # 使用 Ansible 将 SSH 公钥分发到多个目标主机
+image
+
 ## 0. 安装Ansible
 
 ```bash

From 9b292186c43830d4d1a2496a149699d70149f74b Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Wed, 7 Aug 2024 07:06:13 +0000
Subject: [PATCH 26/26] update

---
 .../resnet50/2_distributed_training/dist_training.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
index cf20099..c3f9dd5 100644
--- a/Classification/resnet50/2_distributed_training/dist_training.yml
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -37,8 +37,6 @@
             --use-fp16 \
             --metric-local False \
             --metric-train-acc True \
-            --fuse-bn-relu \
-            --fuse-bn-add-relu \
             --use-gpu-decode \
             --channel-last \
             --skip-eval
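The series ends with patch 26 trimming `--fuse-bn-relu` and `--fuse-bn-add-relu` from the launch command. After changes like this, a cheap cluster-wide sanity check that every node can still start the container and see its GPUs helps separate environment problems from training-script problems. A sketch only, assuming it is run from the directory containing `inventory.ini` and reusing the image name from the playbooks:

```bash
# Hypothetical check: list the GPUs visible inside the training container on every node.
ansible all -i inventory.ini -m shell \
  -a "docker run --rm --gpus all oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8 nvidia-smi -L"
```

Each host should report the expected eight GPUs before a full `run_dist_training.sh` or `run_profiling.sh` pass is attempted.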