From 26a64e5ea8cd8686abaf716b7afe092e1c2025d6 Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:07:27 +0000 Subject: [PATCH 01/26] automated training scripts for resnet50. --- Classification/resnet50/Readme.md | 225 ++++++++++++++++++ .../resnet50/ansible_workspace/inventory.ini | 5 + .../resnet50/ansible_workspace/set_docker.sh | 18 ++ .../resnet50/ansible_workspace/train.sh | 9 + .../ansible_workspace/update_tools.sh | 3 + .../tools/args_train_ddp_graph_resnet50.sh | 144 +++++++++++ Classification/resnet50/tools/extract.py | 27 +++ .../resnet50/tools/prepare_docker.sh | 14 ++ Classification/resnet50/tools/profile.sh | 7 + Classification/resnet50/tools/train.sh | 7 + 10 files changed, 459 insertions(+) create mode 100644 Classification/resnet50/Readme.md create mode 100644 Classification/resnet50/ansible_workspace/inventory.ini create mode 100755 Classification/resnet50/ansible_workspace/set_docker.sh create mode 100644 Classification/resnet50/ansible_workspace/train.sh create mode 100755 Classification/resnet50/ansible_workspace/update_tools.sh create mode 100755 Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh create mode 100644 Classification/resnet50/tools/extract.py create mode 100755 Classification/resnet50/tools/prepare_docker.sh create mode 100755 Classification/resnet50/tools/profile.sh create mode 100755 Classification/resnet50/tools/train.sh diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md new file mode 100644 index 0000000..c72cdd4 --- /dev/null +++ b/Classification/resnet50/Readme.md @@ -0,0 +1,225 @@ +# 千卡 0.85 + +[toc] + +## 文件目录结构 +``` +├── ansible_workspace # 主节点上的工作目录 +│   ├── inventory.ini # 用来配置节点信息 +│   ├── set_docker.sh # 在各节点上创建docker,并且配置好docker内环境 +│   ├── profile.sh # 根据节点数启动profile +│   ├── train.sh # 根据节点数启动训练 +│   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 +├── tools # 在各个节点使用的文件 +│ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 +│ ├── models.tar.gz # 模型,为防止git网络问题,所以先下载放在共享目录下 +│ ├── extract.py # 提取log中train阶段的throughput的平均值 +│ ├── prepare_docker.sh # 用于配置docker内环境 +│ ├── profile.sh # 根据节点数在本机启动profile +│ └── train.sh # 根据节点数在本机启动训练 +└── Readme.md +``` + +需求:有NVLink,以及 shared_nfs + +以下供参考 + +## 第一步: 配置环境 + +### 1.1 所有节点配置SSH Key,并设置authorized_keys + +(怎么自动化) + +需要一个共享的存储空间,如:`/shared_nfs/k85`,在一个文件夹下准备好 + +- authorized_keys : 在主节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! -d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 创建 authorized_keys 文件 + cat $SSH_DIR/id_rsa.pub > $SSH_DIR/authorized_keys + + # 将 authorized_keys 文件拷贝到共享目录 + cp $SSH_DIR/authorized_keys shared_nfs/k85 + ``` + +- 在子节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! 
-d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 将 authorized_keys 文件拷贝到 .ssh 目录 + cp shared_nfs/k85/authorized_keys $SSH_DIR + ``` + +### 1.2 主节点安装 Ansible,并配置节点ip +示例文件:./ansible_workspace/inventory.ini +```ini +[hosts] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +``` + +### 1.3 共享目录中拷贝镜像、数据集、models脚本 +主要为设置docker内环境的脚本 和 启动docker内训练的脚本 +设置docker内环境脚本(./tools/prepare_docker.sh)如下: +```Bash +#!/bin/bash +# 将tools视为共享目录 +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +# 将需要使用到的脚本拷到对应文件夹下 +cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp tools/profile.sh models/Vision/classification/image/resnet50/ +``` +启动dokcer内训练的脚本(./tools/train.sh)如下: +```Bash +# 根据使用的节点数,来判断本机是否开始训练 +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi +``` +其中[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)参考自OneAutoTest仓库 +### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 +根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: +```Bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + # 根据inventory.ini文件中节点顺序,将NODE_RANK写入docker的环境变量中 + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +# 在docker内运行环境设置的脚本 +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" +``` +使用方式: +```Bash +bash set_docker.sh inventory.ini +``` + +## 第二步:进行测试 + +### 2.1 自动测试与日志搜集 + +编写一个测试命令脚本文件(./ansible_workspace/train.sh) +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" +``` + +- 需要一个参数: 节点数, +- 运行该命令能够自动启动相应数量的节点运行。 +- 
运行结束后收集日志到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 + +### 2.2 自动日志解析 + +可以使用2.1节提供的命令运行多次,比如: + +```bash +train.sh 1 +train.sh 2 +train.sh 4 +train.sh 8 +train.sh 16 +``` + +完成后应该保存了多个日志目录,需要编写一个日志处理脚本,从这些日志目录中提取性能数据并制成 markdown 格式的表格 + +注:不需要完整训练,训练稳定后获取到数据就可以了。 + +### 2.3 自动 nsys 性能测试 + +需要编写一个能够运行 nsys 的性能测试脚本文件(./ansible_workspace/profile.sh),和2.1的脚本类似,只是启动时需要调用nsys,我们需要搜集这些信息分析,然后进行优化。这个脚本文件。 +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" +``` +```Bash +# 根据使用的节点数,来判断是否在本地开始profile +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + # 在启动训练时添加nsys启动路径,即可进行profile + bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi +``` +[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)中包含使用nsys启动的选项 +- 需要一个参数: 节点数, +- 运行该命令能够自动启动相应数量的节点运行。 +- 运行结束后收集日志和nsys相关文件到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/inventory.ini b/Classification/resnet50/ansible_workspace/inventory.ini new file mode 100644 index 0000000..82e0bac --- /dev/null +++ b/Classification/resnet50/ansible_workspace/inventory.ini @@ -0,0 +1,5 @@ +[hosts] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/set_docker.sh b/Classification/resnet50/ansible_workspace/set_docker.sh new file mode 100755 index 0000000..7a55f49 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/set_docker.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test_new" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/train.sh b/Classification/resnet50/ansible_workspace/train.sh new file mode 100644 index 0000000..350bf78 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec 
$docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/update_tools.sh b/Classification/resnet50/ansible_workspace/update_tools.sh new file mode 100755 index 0000000..cda6deb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/update_tools.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "/data/home/chende/tools" +ansible hosts -i inventory.ini -m copy -a "src=/data/home/chende/tools dest=/data/home/chende/ mode=0755" \ No newline at end of file diff --git a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh new file mode 100755 index 0000000..b4fbd36 --- /dev/null +++ b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh @@ -0,0 +1,144 @@ +rm -rf core.* + +set -ex + + +# bash examples/args_train_ddp_graph.sh ${NUM_NODES} ${DEVICE_NUM_PER_NODE} ${NODE_RANK} ${MASTER_ADDR} +# ${OFRECORD_PATH} ${TRAIN_BATCH_SIZE} ${EPOCH} ${USE_FP16} ${PYTHON_BIN} ${RUN_TYPE} ${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT} + +# bash examples/args_train_ddp_graph.sh 1 8 0 127.0.0.1 /dataset/79846248 192 50 false python3 ddp false '' 1 + +NUM_NODES=${1:-1} +DEVICE_NUM_PER_NODE=${2:-8} +NODE_RANK=${3:-0} +MASTER_ADDR=${4:-"127.0.0.1"} +OFRECORD_PATH=${5:-"/dataset/imagenet/ofrecord"} +TRAIN_BATCH_SIZE=${6:-192} +EPOCH=${7:-50} +USE_FP16=${8:-false} +PYTHON_BIN=${9:-"python3"} +RUN_TYPE=${10:-"ddp"} # graph+fp16 +DECODE_TYPE=${11:-"cpu"} +PRINT_INTERVAL=${12:-100} +DEBUG_AND_NCCL=${13:-false} +NSYS_BIN=${14:-""} +RUN_COMMIT=${15:-"master"} +ACC=${16:-1} +VAL_BATCH_SIZE=${17:-50} + + +SRC_DIR=$(realpath $(dirname $0)/..) + +AMP_OR="FP32" +if $USE_FP16; then + AMP_OR="FP16" +fi + +TRAN_MODEL="resnet50" +RUN_TIME=$(date "+%Y%m%d_%H%M%S%N") +LOG_FOLDER=${SRC_DIR}/test_logs/$HOSTNAME/${NUM_NODES}n${DEVICE_NUM_PER_NODE}g +mkdir -p $LOG_FOLDER +LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_${RUN_TYPE}_DC${DECODE_TYPE}_${AMP_OR}_mb${TRAIN_BATCH_SIZE}_gb$((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC}))_acc${ACC}_${NUM_NODES}n${DEVICE_NUM_PER_NODE}g_${RUN_COMMIT}_${RUN_TIME} + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +#export ONEFLOW_COMM_NET_IB_ENABLE=True +export NCCL_LAUNCH_MODE=GROUP +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL +if $DEBUG_AND_NCCL; then + export ONEFLOW_DEBUG_MODE=1 + echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE + export NCCL_DEBUG=INFO + echo NCCL_DEBUG=$NCCL_DEBUG +fi + +#export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 +#export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 +#export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 +#export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 +#export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1 + +#export ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC=true +#export ONEFLOW_VM_WORKLOAD_ON_SCHEDULER_THREAD=1 + +LEARNING_RATE=$(echo | awk "{print $NUM_NODES*$DEVICE_NUM_PER_NODE*$TRAIN_BATCH_SIZE*$ACC/1000}") +MOM=0.875 +OFRECORD_PART_NUM=256 + +EXIT_NUM=-1 + +if [ ${EPOCH} -lt 10 ];then + EXIT_NUM=300 +fi +CMD="" + +if [[ ! 
-z "${NSYS_BIN}" ]]; then + export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + export ONEFLOW_DEBUG_MODE=1 + # CMD+="${NSYS_BIN} profile --stats true -t nvtx --output ${LOG_FILENAME} " + export CUDNN_LOGINFO_DBG=1 + export CUDNN_LOGDEST_DBG=${SRC_DIR}/cudnn.log + CMD+="${NSYS_BIN} profile --stats true --output ${LOG_FILENAME} " + EXIT_NUM=30 +fi + + +CMD+="${PYTHON_BIN} -m oneflow.distributed.launch " + +CMD+="--nproc_per_node ${DEVICE_NUM_PER_NODE} " +CMD+="--nnodes ${NUM_NODES} " +CMD+="--node_rank ${NODE_RANK} " +CMD+="--master_addr ${MASTER_ADDR} " +CMD+="--master_port 12345 " +CMD+="${SRC_DIR}/train.py " +CMD+="--ofrecord-path ${OFRECORD_PATH} " +CMD+="--ofrecord-part-num ${OFRECORD_PART_NUM} " +CMD+="--num-devices-per-node ${DEVICE_NUM_PER_NODE} " +CMD+="--lr ${LEARNING_RATE} " +CMD+="--momentum ${MOM} " +CMD+="--num-epochs ${EPOCH} " +CMD+="--train-batch-size ${TRAIN_BATCH_SIZE} " +CMD+="--train-global-batch-size $((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--val-batch-size ${VAL_BATCH_SIZE} " +CMD+="--val-global-batch-size $((${VAL_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--print-interval ${PRINT_INTERVAL} " +#CMD+="--synthetic-data " + +if $USE_FP16; then + echo USE_FP16=$USE_FP16 + CMD+="--use-fp16 --channel-last " +fi + +if [ $EXIT_NUM != -1 ]; then + CMD+="--skip-eval " +fi +if [ $RUN_TYPE == 'ddp' ]; then + CMD+="--ddp " +else + CMD+="--scale-grad --graph " + CMD+="--fuse-bn-relu " + CMD+="--fuse-bn-add-relu " +fi + +if [ $DECODE_TYPE == 'gpu' ]; then + CMD+="--use-gpu-decode " +fi + +echo "Rum cmd ${CMD}" + +$CMD 2>&1 | tee ${LOG_FILENAME}.log + +echo "Writting log to ${LOG_FILENAME}.log" + +if [[ ! -z "${NSYS_BIN}" ]]; then + rm ${LOG_FOLDER}/*.sqlite + mkdir -p ${LOG_FILENAME} + #rm -rf ./log/$HOSTNAME/oneflow.* + cp ./log/$HOSTNAME/* ${LOG_FILENAME}/ + mv ${SRC_DIR}/cudnn.log ${LOG_FILENAME}/cudnn.log +fi + +rm -rf ./log/$HOSTNAME +echo "done" diff --git a/Classification/resnet50/tools/extract.py b/Classification/resnet50/tools/extract.py new file mode 100644 index 0000000..477903d --- /dev/null +++ b/Classification/resnet50/tools/extract.py @@ -0,0 +1,27 @@ +import sys +import re + +# 文件路径 +file_path = sys.argv[1] + +# 存储 train 模式下的 throughput +print(file_path) +train_throughputs = [] + +# 正则表达式模式匹配 train 模式下的 throughput +pattern = re.compile(r'\[train\][^|]*?throughput:\s(\d+\.\d+)') + +# 读取文件并提取需要的信息 +with open(file_path, 'r') as file: + for line in file: + matches = pattern.findall(line) + for match in matches: + throughput = float(match) + train_throughputs.append(throughput) + +# 计算平均 throughput +if train_throughputs: + average_throughput = sum(train_throughputs) / len(train_throughputs) + print(f'The average throughput for [train] mode is: {average_throughput:.6f}') +else: + print('No [train] mode throughputs found.') diff --git a/Classification/resnet50/tools/prepare_docker.sh b/Classification/resnet50/tools/prepare_docker.sh new file mode 100755 index 0000000..84147f5 --- /dev/null +++ b/Classification/resnet50/tools/prepare_docker.sh @@ -0,0 +1,14 @@ +#!/bin/bash +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +cp 
tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp tools/profile.sh models/Vision/classification/image/resnet50/ \ No newline at end of file diff --git a/Classification/resnet50/tools/profile.sh b/Classification/resnet50/tools/profile.sh new file mode 100755 index 0000000..79e0d0a --- /dev/null +++ b/Classification/resnet50/tools/profile.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi \ No newline at end of file diff --git a/Classification/resnet50/tools/train.sh b/Classification/resnet50/tools/train.sh new file mode 100755 index 0000000..5758c05 --- /dev/null +++ b/Classification/resnet50/tools/train.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi \ No newline at end of file From 4a82d2e74a7e1109962e9347240808a2fca49aea Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:12:52 +0000 Subject: [PATCH 02/26] few changes. --- Classification/resnet50/Readme.md | 2 +- Classification/resnet50/ansible_workspace/profile.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 Classification/resnet50/ansible_workspace/profile.sh diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md index c72cdd4..21be9e5 100644 --- a/Classification/resnet50/Readme.md +++ b/Classification/resnet50/Readme.md @@ -12,7 +12,7 @@ │   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 ├── tools # 在各个节点使用的文件 │ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 -│ ├── models.tar.gz # 模型,为防止git网络问题,所以先下载放在共享目录下 +│ ├── models.tar.gz # 模型,为防止git网络问题,建议先下载放在共享目录下 │ ├── extract.py # 提取log中train阶段的throughput的平均值 │ ├── prepare_docker.sh # 用于配置docker内环境 │ ├── profile.sh # 根据节点数在本机启动profile diff --git a/Classification/resnet50/ansible_workspace/profile.sh b/Classification/resnet50/ansible_workspace/profile.sh new file mode 100644 index 0000000..db6abfb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/profile.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" \ No newline at end of file From 48e0ece68ed80ea476af11604d0eb0ebeecd7588 Mon Sep 17 00:00:00 2001 From: chende Date: Thu, 6 Jun 2024 14:37:33 +0000 Subject: [PATCH 03/26] typo --- Classification/resnet50/Readme.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md index 21be9e5..b19a080 100644 --- a/Classification/resnet50/Readme.md +++ b/Classification/resnet50/Readme.md @@ -121,12 +121,24 @@ cp tools/profile.sh models/Vision/classification/image/resnet50/ NUM_NODES=${1:-1} if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true 
python3 graph gpu 100 false '' 1 + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 else echo do nothing fi ``` -其中[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)参考自OneAutoTest仓库 +启动dokcer内profile(./tools/profile.sh)如下: +```Bash +# 根据使用的节点数,来判断是否在本地开始profile +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + # 在启动训练时添加nsys启动路径,即可进行profile + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi +``` +args_train_ddp_graph_resnet50.sh文件参考自OneAutoTest仓库[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh),其中包含使用nsys启动的选项 ### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: ```Bash @@ -207,18 +219,6 @@ NUM_NODES="$1" docker_name="cd_test_new" ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" ``` -```Bash -# 根据使用的节点数,来判断是否在本地开始profile -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - # 在启动训练时添加nsys启动路径,即可进行profile - bash examples/args_train_ddp_graph.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi -``` -[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh)中包含使用nsys启动的选项 - 需要一个参数: 节点数, - 运行该命令能够自动启动相应数量的节点运行。 - 运行结束后收集日志和nsys相关文件到主节点。 From e9a15a2e276e40502732995881cc3d89b7f81c3d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 10:56:44 +0000 Subject: [PATCH 04/26] distribute_ssh_key --- .../resnet50/dist_ssh_key/README.md | 79 +++++++++++++++++++ .../dist_ssh_key/distribute_ssh_key.yml | 19 +++++ .../resnet50/dist_ssh_key/inventory.ini | 3 + Classification/resnet50/dist_ssh_key/vars.yml | 8 ++ 4 files changed, 109 insertions(+) create mode 100644 Classification/resnet50/dist_ssh_key/README.md create mode 100644 Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml create mode 100644 Classification/resnet50/dist_ssh_key/inventory.ini create mode 100644 Classification/resnet50/dist_ssh_key/vars.yml diff --git a/Classification/resnet50/dist_ssh_key/README.md b/Classification/resnet50/dist_ssh_key/README.md new file mode 100644 index 0000000..427e890 --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/README.md @@ -0,0 +1,79 @@ +# 使用 Ansible 将 SSH 公钥分发到多个目标主机 + +## 1. 创建变量文件并加密 + +创建一个包含密码的变量文件vars.yml: + +```yaml +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: mypassword +``` + +然后使用Ansible Vault加密这个文件: + +```bash +ansible-vault encrypt vars.yml +``` + +注意: + +1. 执行 `ansible-vault` 的过程中需要设定一个密码,请记住或保存好这个密码 +2. `vars.yml`将被替换为加密后的文件 + +## 2. 创建主机清单文件 + +创建一个主机清单文件`inventory.ini`: + +```ini +[all] +node1 ansible_host=192.168.1.27 ansible_user=myuser +node2 ansible_host=192.168.1.28 ansible_user=myuser +``` + +注:需要根据情况修改 `ansible_user` 的值 + +## 3. 
创建Playbook + +如果文件存在,这一步可以忽略。 + +创建一个Playbook distribute_ssh_key.yml: + +```yaml +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/path/to/id_rsa.pub') }}" +``` + +注:`vars_files` 配置为 `vars.yml` + +## 4. 运行Playbook + +使用以下命令运行Playbook,并解密变量文件: + +```bash +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass +``` + + diff --git a/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml new file mode 100644 index 0000000..95147c2 --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml @@ -0,0 +1,19 @@ +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/home/xiexuan/.ssh/id_rsa.pub') }}" diff --git a/Classification/resnet50/dist_ssh_key/inventory.ini b/Classification/resnet50/dist_ssh_key/inventory.ini new file mode 100644 index 0000000..894b65d --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/inventory.ini @@ -0,0 +1,3 @@ +[all] +of27 ansible_host=192.168.1.27 ansible_user=xiexuan +of28 ansible_host=192.168.1.28 ansible_user=xiexuan diff --git a/Classification/resnet50/dist_ssh_key/vars.yml b/Classification/resnet50/dist_ssh_key/vars.yml new file mode 100644 index 0000000..49c7dbf --- /dev/null +++ b/Classification/resnet50/dist_ssh_key/vars.yml @@ -0,0 +1,8 @@ +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: mypassword From 0e15764505c6d0ffc796d3ad3651c1987168894d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:12:51 +0000 Subject: [PATCH 05/26] rename dir --- .../resnet50/{dist_ssh_key => 0_dist_ssh_key}/README.md | 0 .../{dist_ssh_key => 0_dist_ssh_key}/distribute_ssh_key.yml | 0 .../resnet50/{dist_ssh_key => 0_dist_ssh_key}/inventory.ini | 0 Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/vars.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/README.md (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/distribute_ssh_key.yml (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/inventory.ini (100%) rename Classification/resnet50/{dist_ssh_key => 0_dist_ssh_key}/vars.yml (100%) diff --git a/Classification/resnet50/dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md similarity index 100% rename from Classification/resnet50/dist_ssh_key/README.md rename to Classification/resnet50/0_dist_ssh_key/README.md diff --git a/Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml similarity index 100% rename from Classification/resnet50/dist_ssh_key/distribute_ssh_key.yml rename to 
Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml diff --git a/Classification/resnet50/dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini similarity index 100% rename from Classification/resnet50/dist_ssh_key/inventory.ini rename to Classification/resnet50/0_dist_ssh_key/inventory.ini diff --git a/Classification/resnet50/dist_ssh_key/vars.yml b/Classification/resnet50/0_dist_ssh_key/vars.yml similarity index 100% rename from Classification/resnet50/dist_ssh_key/vars.yml rename to Classification/resnet50/0_dist_ssh_key/vars.yml From 8850c1aa906f61326617b73b80a31893e9a9f0a9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:13:40 +0000 Subject: [PATCH 06/26] mv inventory.ini to root --- Classification/resnet50/inventory.ini | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Classification/resnet50/inventory.ini diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini new file mode 100644 index 0000000..a4388b6 --- /dev/null +++ b/Classification/resnet50/inventory.ini @@ -0,0 +1,4 @@ +[hosts] +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' From e100a96ec1ab18dfd4e6b2a65cef5c65ec7ede16 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:14:41 +0000 Subject: [PATCH 07/26] update --- Classification/resnet50/ansible_workspace/inventory.ini | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 Classification/resnet50/ansible_workspace/inventory.ini diff --git a/Classification/resnet50/ansible_workspace/inventory.ini b/Classification/resnet50/ansible_workspace/inventory.ini deleted file mode 100644 index 82e0bac..0000000 --- a/Classification/resnet50/ansible_workspace/inventory.ini +++ /dev/null @@ -1,5 +0,0 @@ -[hosts] -of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' \ No newline at end of file From dcb9451cc05c684755b26d9a9b844a1aa3b8ae1b Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:18:06 +0000 Subject: [PATCH 08/26] add bash --- Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh diff --git a/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh new file mode 100755 index 0000000..7cb0f1d --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh @@ -0,0 +1 @@ +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass From c8497b6eabab91d96f87bd2427748c98d4b9729c Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 31 Jul 2024 13:19:36 +0000 Subject: [PATCH 09/26] update --- Classification/resnet50/0_dist_ssh_key/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md index 427e890..856df74 100644 --- a/Classification/resnet50/0_dist_ssh_key/README.md +++ b/Classification/resnet50/0_dist_ssh_key/README.md @@ -75,5 +75,9 @@ node2 
ansible_host=192.168.1.28 ansible_user=myuser ```bash ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass ``` +或者运行 +```bash +./dist_ssh_key.sh +``` From b9528444a788c2a05aaeea14b411c5964171a5f0 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:18:02 +0000 Subject: [PATCH 10/26] pull or load docker image --- .../resnet50/1_get_docker_image/load.sh | 26 +++++++++++++++++ .../load_and_tag_docker_image.yml | 28 +++++++++++++++++++ .../resnet50/1_get_docker_image/pull.sh | 7 +++++ .../1_get_docker_image/pull_docker_image.yml | 17 +++++++++++ Classification/resnet50/inventory.ini | 3 +- 5 files changed, 79 insertions(+), 2 deletions(-) create mode 100755 Classification/resnet50/1_get_docker_image/load.sh create mode 100644 Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml create mode 100755 Classification/resnet50/1_get_docker_image/pull.sh create mode 100644 Classification/resnet50/1_get_docker_image/pull_docker_image.yml diff --git a/Classification/resnet50/1_get_docker_image/load.sh b/Classification/resnet50/1_get_docker_image/load.sh new file mode 100755 index 0000000..5df8bcd --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [ -n "$1" ]; then + docker_image_path=$1 +else + docker_image_path="/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" +fi + +if [ -n "$2" ]; then + docker_image_tag=$2 +else + docker_image_tag="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" +fi + +if [ -n "$3" ]; then + force_load=$3 +else + force_load=false +fi + +ansible-playbook \ + -i ../inventory.ini \ + load_and_tag_docker_image.yml \ + -e "docker_image_path=$docker_image_path" \ + -e "docker_image_tag=$docker_image_tag" \ + -e "force_load=$force_load" diff --git a/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml new file mode 100644 index 0000000..5a2f92d --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml @@ -0,0 +1,28 @@ +--- +- name: Load and tag Docker image + hosts: all + vars: + docker_image_path: "/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" + docker_image_tag: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + force_load: false + + tasks: + - name: Check if Docker image with the specified tag already exists + command: "docker images -q {{ docker_image_tag }}" + register: image_id + changed_when: false + when: not force_load + + - name: Load Docker image from tar file + command: "docker load -i {{ docker_image_path }}" + when: force_load or image_id.stdout == "" + register: load_output + + - name: Get image ID from load output + set_fact: + loaded_image_id: "{{ load_output.stdout_lines[-1] | regex_search('sha256:[0-9a-f]+') }}" + when: force_load or image_id.stdout == "" + + - name: Tag the loaded Docker image + command: "docker tag {{ loaded_image_id }} {{ docker_image_tag }}" + when: force_load or image_id.stdout == "" diff --git a/Classification/resnet50/1_get_docker_image/pull.sh b/Classification/resnet50/1_get_docker_image/pull.sh new file mode 100755 index 0000000..8787fea --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ -n "$1" ]; then + ansible-playbook -i ../inventory.ini pull_docker_image.yml -e "docker_image=$1" +else + ansible-playbook -i ../inventory.ini pull_docker_image.yml +fi diff --git a/Classification/resnet50/1_get_docker_image/pull_docker_image.yml 
b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml new file mode 100644 index 0000000..d086e6a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml @@ -0,0 +1,17 @@ +--- +- name: Pull specified Docker image + hosts: all + vars: + docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + + tasks: + - name: Check if the Docker image is already present + command: "docker images -q {{ docker_image }}" + register: docker_image_id + changed_when: false + + - name: Pull Docker image if not present + docker_image: + name: "{{ docker_image }}" + source: pull + when: docker_image_id.stdout == "" diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini index a4388b6..01027fd 100644 --- a/Classification/resnet50/inventory.ini +++ b/Classification/resnet50/inventory.ini @@ -1,4 +1,3 @@ -[hosts] -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +[all] of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' From efed68f417d1c2e7229164dfad8dcbe2935afcad Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:30:44 +0000 Subject: [PATCH 11/26] update readme --- .../resnet50/1_get_docker_image/README.md | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Classification/resnet50/1_get_docker_image/README.md diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md new file mode 100644 index 0000000..3f6282a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/README.md @@ -0,0 +1,61 @@ +# 拉取或导入镜像 + +## 拉取镜像 + +适用于直接从 dockerhub 拉取镜像。 + +用法: `./pull.sh [镜像标签]` + +参数说明: + +- 镜像标签 (可选) : 要拉取的Docker镜像标签,例如 alpine:latest。如果未提供,则使用playbook中的默认值。 + +示例: + +- 默认使用: + +```bash +./pull.sh +``` + +- 指定镜像标签: + + ```bash +./pull.sh alpine:latest + ``` + +## 导入镜像 + +适用于本地共享目录有已经保存镜像的tar文件,使用 `docker load` 导入。 + +用法: `./load.sh [镜像文件路径] [镜像标签] [强制导入]` + +参数说明: + +- 镜像文件路径 (可选) : 要导入的Docker镜像tar文件路径,默认为 `/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar` +- 镜像标签 (可选) : 导入后设置的Docker镜像标签,默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8` +- 强制导入 (可选) : 是否强制导入镜像(true 或 false),默认为 false + +示例: + +- 默认使用: + + ```bash + ./load.sh + ``` + +- 指定镜像文件路径和标签: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest +``` + +- 强制导入镜像: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest true +``` + + + + From 6ac0f4cefe6764feba617e135e332b0e0a924e6c Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 02:56:43 +0000 Subject: [PATCH 12/26] rm old files --- .../resnet50/ansible_workspace/profile.sh | 9 -- .../resnet50/ansible_workspace/set_docker.sh | 18 --- .../resnet50/ansible_workspace/train.sh | 9 -- .../ansible_workspace/update_tools.sh | 3 - .../tools/args_train_ddp_graph_resnet50.sh | 144 ------------------ Classification/resnet50/tools/extract.py | 27 ---- .../resnet50/tools/prepare_docker.sh | 14 -- Classification/resnet50/tools/profile.sh | 7 - Classification/resnet50/tools/train.sh | 7 - 9 files changed, 238 deletions(-) delete mode 100644 Classification/resnet50/ansible_workspace/profile.sh delete mode 100755 Classification/resnet50/ansible_workspace/set_docker.sh delete mode 100644 Classification/resnet50/ansible_workspace/train.sh delete mode 100755 Classification/resnet50/ansible_workspace/update_tools.sh delete mode 100755 
Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh delete mode 100644 Classification/resnet50/tools/extract.py delete mode 100755 Classification/resnet50/tools/prepare_docker.sh delete mode 100755 Classification/resnet50/tools/profile.sh delete mode 100755 Classification/resnet50/tools/train.sh diff --git a/Classification/resnet50/ansible_workspace/profile.sh b/Classification/resnet50/ansible_workspace/profile.sh deleted file mode 100644 index db6abfb..0000000 --- a/Classification/resnet50/ansible_workspace/profile.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 num_nodes" - exit 1 -fi -NUM_NODES="$1" -docker_name="cd_test_new" -ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/set_docker.sh b/Classification/resnet50/ansible_workspace/set_docker.sh deleted file mode 100755 index 7a55f49..0000000 --- a/Classification/resnet50/ansible_workspace/set_docker.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 filename" - exit 1 -fi -host_file="$1" -num_hosts=$(wc -l < "$host_file") -docker_name="cd_test_new" - -mapfile -t lines < "$host_file" - -for (( i=1; i<${#lines[@]}; i++ )); do - line="${lines[$i]}" - host_name=$(echo "$line" | awk '{print $1}') - ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" -done -ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/train.sh b/Classification/resnet50/ansible_workspace/train.sh deleted file mode 100644 index 350bf78..0000000 --- a/Classification/resnet50/ansible_workspace/train.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 num_nodes" - exit 1 -fi -NUM_NODES="$1" -docker_name="cd_test_new" -ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/update_tools.sh b/Classification/resnet50/ansible_workspace/update_tools.sh deleted file mode 100755 index cda6deb..0000000 --- a/Classification/resnet50/ansible_workspace/update_tools.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -echo "/data/home/chende/tools" -ansible hosts -i inventory.ini -m copy -a "src=/data/home/chende/tools dest=/data/home/chende/ mode=0755" \ No newline at end of file diff --git a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh deleted file mode 100755 index b4fbd36..0000000 --- a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh +++ /dev/null @@ -1,144 +0,0 @@ -rm -rf core.* - -set -ex - - -# bash examples/args_train_ddp_graph.sh ${NUM_NODES} ${DEVICE_NUM_PER_NODE} ${NODE_RANK} ${MASTER_ADDR} -# ${OFRECORD_PATH} ${TRAIN_BATCH_SIZE} ${EPOCH} ${USE_FP16} ${PYTHON_BIN} ${RUN_TYPE} 
${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT} - -# bash examples/args_train_ddp_graph.sh 1 8 0 127.0.0.1 /dataset/79846248 192 50 false python3 ddp false '' 1 - -NUM_NODES=${1:-1} -DEVICE_NUM_PER_NODE=${2:-8} -NODE_RANK=${3:-0} -MASTER_ADDR=${4:-"127.0.0.1"} -OFRECORD_PATH=${5:-"/dataset/imagenet/ofrecord"} -TRAIN_BATCH_SIZE=${6:-192} -EPOCH=${7:-50} -USE_FP16=${8:-false} -PYTHON_BIN=${9:-"python3"} -RUN_TYPE=${10:-"ddp"} # graph+fp16 -DECODE_TYPE=${11:-"cpu"} -PRINT_INTERVAL=${12:-100} -DEBUG_AND_NCCL=${13:-false} -NSYS_BIN=${14:-""} -RUN_COMMIT=${15:-"master"} -ACC=${16:-1} -VAL_BATCH_SIZE=${17:-50} - - -SRC_DIR=$(realpath $(dirname $0)/..) - -AMP_OR="FP32" -if $USE_FP16; then - AMP_OR="FP16" -fi - -TRAN_MODEL="resnet50" -RUN_TIME=$(date "+%Y%m%d_%H%M%S%N") -LOG_FOLDER=${SRC_DIR}/test_logs/$HOSTNAME/${NUM_NODES}n${DEVICE_NUM_PER_NODE}g -mkdir -p $LOG_FOLDER -LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_${RUN_TYPE}_DC${DECODE_TYPE}_${AMP_OR}_mb${TRAIN_BATCH_SIZE}_gb$((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC}))_acc${ACC}_${NUM_NODES}n${DEVICE_NUM_PER_NODE}g_${RUN_COMMIT}_${RUN_TIME} - - -export PYTHONUNBUFFERED=1 -echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED -#export ONEFLOW_COMM_NET_IB_ENABLE=True -export NCCL_LAUNCH_MODE=GROUP -echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE -echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL -if $DEBUG_AND_NCCL; then - export ONEFLOW_DEBUG_MODE=1 - echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE - export NCCL_DEBUG=INFO - echo NCCL_DEBUG=$NCCL_DEBUG -fi - -#export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 -#export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 -#export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 -#export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 -#export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1 - -#export ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC=true -#export ONEFLOW_VM_WORKLOAD_ON_SCHEDULER_THREAD=1 - -LEARNING_RATE=$(echo | awk "{print $NUM_NODES*$DEVICE_NUM_PER_NODE*$TRAIN_BATCH_SIZE*$ACC/1000}") -MOM=0.875 -OFRECORD_PART_NUM=256 - -EXIT_NUM=-1 - -if [ ${EPOCH} -lt 10 ];then - EXIT_NUM=300 -fi -CMD="" - -if [[ ! 
-z "${NSYS_BIN}" ]]; then - export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 - export ONEFLOW_DEBUG_MODE=1 - # CMD+="${NSYS_BIN} profile --stats true -t nvtx --output ${LOG_FILENAME} " - export CUDNN_LOGINFO_DBG=1 - export CUDNN_LOGDEST_DBG=${SRC_DIR}/cudnn.log - CMD+="${NSYS_BIN} profile --stats true --output ${LOG_FILENAME} " - EXIT_NUM=30 -fi - - -CMD+="${PYTHON_BIN} -m oneflow.distributed.launch " - -CMD+="--nproc_per_node ${DEVICE_NUM_PER_NODE} " -CMD+="--nnodes ${NUM_NODES} " -CMD+="--node_rank ${NODE_RANK} " -CMD+="--master_addr ${MASTER_ADDR} " -CMD+="--master_port 12345 " -CMD+="${SRC_DIR}/train.py " -CMD+="--ofrecord-path ${OFRECORD_PATH} " -CMD+="--ofrecord-part-num ${OFRECORD_PART_NUM} " -CMD+="--num-devices-per-node ${DEVICE_NUM_PER_NODE} " -CMD+="--lr ${LEARNING_RATE} " -CMD+="--momentum ${MOM} " -CMD+="--num-epochs ${EPOCH} " -CMD+="--train-batch-size ${TRAIN_BATCH_SIZE} " -CMD+="--train-global-batch-size $((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " -CMD+="--val-batch-size ${VAL_BATCH_SIZE} " -CMD+="--val-global-batch-size $((${VAL_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " -CMD+="--print-interval ${PRINT_INTERVAL} " -#CMD+="--synthetic-data " - -if $USE_FP16; then - echo USE_FP16=$USE_FP16 - CMD+="--use-fp16 --channel-last " -fi - -if [ $EXIT_NUM != -1 ]; then - CMD+="--skip-eval " -fi -if [ $RUN_TYPE == 'ddp' ]; then - CMD+="--ddp " -else - CMD+="--scale-grad --graph " - CMD+="--fuse-bn-relu " - CMD+="--fuse-bn-add-relu " -fi - -if [ $DECODE_TYPE == 'gpu' ]; then - CMD+="--use-gpu-decode " -fi - -echo "Rum cmd ${CMD}" - -$CMD 2>&1 | tee ${LOG_FILENAME}.log - -echo "Writting log to ${LOG_FILENAME}.log" - -if [[ ! -z "${NSYS_BIN}" ]]; then - rm ${LOG_FOLDER}/*.sqlite - mkdir -p ${LOG_FILENAME} - #rm -rf ./log/$HOSTNAME/oneflow.* - cp ./log/$HOSTNAME/* ${LOG_FILENAME}/ - mv ${SRC_DIR}/cudnn.log ${LOG_FILENAME}/cudnn.log -fi - -rm -rf ./log/$HOSTNAME -echo "done" diff --git a/Classification/resnet50/tools/extract.py b/Classification/resnet50/tools/extract.py deleted file mode 100644 index 477903d..0000000 --- a/Classification/resnet50/tools/extract.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -import re - -# 文件路径 -file_path = sys.argv[1] - -# 存储 train 模式下的 throughput -print(file_path) -train_throughputs = [] - -# 正则表达式模式匹配 train 模式下的 throughput -pattern = re.compile(r'\[train\][^|]*?throughput:\s(\d+\.\d+)') - -# 读取文件并提取需要的信息 -with open(file_path, 'r') as file: - for line in file: - matches = pattern.findall(line) - for match in matches: - throughput = float(match) - train_throughputs.append(throughput) - -# 计算平均 throughput -if train_throughputs: - average_throughput = sum(train_throughputs) / len(train_throughputs) - print(f'The average throughput for [train] mode is: {average_throughput:.6f}') -else: - print('No [train] mode throughputs found.') diff --git a/Classification/resnet50/tools/prepare_docker.sh b/Classification/resnet50/tools/prepare_docker.sh deleted file mode 100755 index 84147f5..0000000 --- a/Classification/resnet50/tools/prepare_docker.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -python3 -m pip install --upgrade pip -python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 - -cd /workspace -cp tools/models.tar.gz ./ -tar -xvf models.tar.gz -pip install -r models/dev-requirements.txt -pip install -r models/Vision/classification/image/resnet50/requirements.txt - -cp 
tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ -cp tools/train.sh models/Vision/classification/image/resnet50/ -cp tools/profile.sh models/Vision/classification/image/resnet50/ \ No newline at end of file diff --git a/Classification/resnet50/tools/profile.sh b/Classification/resnet50/tools/profile.sh deleted file mode 100755 index 79e0d0a..0000000 --- a/Classification/resnet50/tools/profile.sh +++ /dev/null @@ -1,7 +0,0 @@ -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi \ No newline at end of file diff --git a/Classification/resnet50/tools/train.sh b/Classification/resnet50/tools/train.sh deleted file mode 100755 index 5758c05..0000000 --- a/Classification/resnet50/tools/train.sh +++ /dev/null @@ -1,7 +0,0 @@ -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '' 1 -else - echo do nothing -fi \ No newline at end of file From fce3e8d78e3e581cfd292cddc73d4cc6daf79681 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 03:17:16 +0000 Subject: [PATCH 13/26] update --- Classification/resnet50/0_dist_ssh_key/inventory.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Classification/resnet50/0_dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini index 894b65d..3812599 100644 --- a/Classification/resnet50/0_dist_ssh_key/inventory.ini +++ b/Classification/resnet50/0_dist_ssh_key/inventory.ini @@ -1,3 +1,3 @@ [all] -of27 ansible_host=192.168.1.27 ansible_user=xiexuan -of28 ansible_host=192.168.1.28 ansible_user=xiexuan +of27 ansible_host=192.168.1.27 ansible_user=myuser +of28 ansible_host=192.168.1.28 ansible_user=myuser From 4284d9d463b1af91530ea60d151ec6dc7181862d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 1 Aug 2024 06:06:33 +0000 Subject: [PATCH 14/26] update readme --- Classification/resnet50/README.md | 25 ++++ Classification/resnet50/Readme.md | 225 ------------------------------ 2 files changed, 25 insertions(+), 225 deletions(-) create mode 100644 Classification/resnet50/README.md delete mode 100644 Classification/resnet50/Readme.md diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md new file mode 100644 index 0000000..3be3866 --- /dev/null +++ b/Classification/resnet50/README.md @@ -0,0 +1,25 @@ +# 使用Ansible在多节点环境分布式训练 + +文件目录 + +``` +. 
+├── 0_dist_ssh_key # 分发 SSH 公钥到各个节点 +│   ├── distribute_ssh_key.yml # ansible playbook +│   ├── dist_ssh_key.sh # 执行脚本 +│   ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置 +│   ├── README.md # 说明文件 +│   └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用 +├── 1_get_docker_image # 各个节点获取 docker 镜像 +│   ├── load_and_tag_docker_image.yml # 导入镜像 ansible playbook +│   ├── load.sh # 导入镜像执行脚本 +│   ├── pull_docker_image.yml # 拉取镜像 ansible playbook +│   ├── pull.sh # 拉取镜像执行脚本 +│   └── README.md # 说明文件 +├── inventory.ini # 主机清单文件,需要根据实际情况配置 +└── README.md # 说明文件 +``` + + + + diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md deleted file mode 100644 index b19a080..0000000 --- a/Classification/resnet50/Readme.md +++ /dev/null @@ -1,225 +0,0 @@ -# 千卡 0.85 - -[toc] - -## 文件目录结构 -``` -├── ansible_workspace # 主节点上的工作目录 -│   ├── inventory.ini # 用来配置节点信息 -│   ├── set_docker.sh # 在各节点上创建docker,并且配置好docker内环境 -│   ├── profile.sh # 根据节点数启动profile -│   ├── train.sh # 根据节点数启动训练 -│   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 -├── tools # 在各个节点使用的文件 -│ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 -│ ├── models.tar.gz # 模型,为防止git网络问题,建议先下载放在共享目录下 -│ ├── extract.py # 提取log中train阶段的throughput的平均值 -│ ├── prepare_docker.sh # 用于配置docker内环境 -│ ├── profile.sh # 根据节点数在本机启动profile -│ └── train.sh # 根据节点数在本机启动训练 -└── Readme.md -``` - -需求:有NVLink,以及 shared_nfs - -以下供参考 - -## 第一步: 配置环境 - -### 1.1 所有节点配置SSH Key,并设置authorized_keys - -(怎么自动化) - -需要一个共享的存储空间,如:`/shared_nfs/k85`,在一个文件夹下准备好 - -- authorized_keys : 在主节点运行 - - ```bash - #!/bin/bash - - # 设置 SSH 目录路径 - SSH_DIR="$HOME/.ssh" - - # 检查 SSH 目录是否存在,如果不存在则创建 - if [ ! -d "$SSH_DIR" ]; then - mkdir -p "$SSH_DIR" - echo "Created directory: $SSH_DIR" - fi - - # 设置密钥文件路径 - KEY_PATH="$SSH_DIR/id_rsa" - - # 生成 SSH 密钥对 - ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q - - # 创建 authorized_keys 文件 - cat $SSH_DIR/id_rsa.pub > $SSH_DIR/authorized_keys - - # 将 authorized_keys 文件拷贝到共享目录 - cp $SSH_DIR/authorized_keys shared_nfs/k85 - ``` - -- 在子节点运行 - - ```bash - #!/bin/bash - - # 设置 SSH 目录路径 - SSH_DIR="$HOME/.ssh" - - # 检查 SSH 目录是否存在,如果不存在则创建 - if [ ! 
-d "$SSH_DIR" ]; then - mkdir -p "$SSH_DIR" - echo "Created directory: $SSH_DIR" - fi - - # 设置密钥文件路径 - KEY_PATH="$SSH_DIR/id_rsa" - - # 生成 SSH 密钥对 - ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q - - # 将 authorized_keys 文件拷贝到 .ssh 目录 - cp shared_nfs/k85/authorized_keys $SSH_DIR - ``` - -### 1.2 主节点安装 Ansible,并配置节点ip -示例文件:./ansible_workspace/inventory.ini -```ini -[hosts] -of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' -``` - -### 1.3 共享目录中拷贝镜像、数据集、models脚本 -主要为设置docker内环境的脚本 和 启动docker内训练的脚本 -设置docker内环境脚本(./tools/prepare_docker.sh)如下: -```Bash -#!/bin/bash -# 将tools视为共享目录 -pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -python3 -m pip install --upgrade pip -python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 - - -cd /workspace -cp tools/models.tar.gz ./ -tar -xvf models.tar.gz -pip install -r models/dev-requirements.txt -pip install -r models/Vision/classification/image/resnet50/requirements.txt - -# 将需要使用到的脚本拷到对应文件夹下 -cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ -cp tools/train.sh models/Vision/classification/image/resnet50/ -cp tools/profile.sh models/Vision/classification/image/resnet50/ -``` -启动dokcer内训练的脚本(./tools/train.sh)如下: -```Bash -# 根据使用的节点数,来判断本机是否开始训练 -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 -else - echo do nothing -fi -``` -启动dokcer内profile(./tools/profile.sh)如下: -```Bash -# 根据使用的节点数,来判断是否在本地开始profile -NUM_NODES=${1:-1} - -if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then - # 在启动训练时添加nsys启动路径,即可进行profile - bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 -else - echo do nothing -fi -``` -args_train_ddp_graph_resnet50.sh文件参考自OneAutoTest仓库[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh),其中包含使用nsys启动的选项 -### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 -根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: -```Bash -set -ex -if [ $# -ne 1 ]; then - echo "Usage: $0 filename" - exit 1 -fi -host_file="$1" -num_hosts=$(wc -l < "$host_file") -docker_name="cd_test" - -mapfile -t lines < "$host_file" - -for (( i=1; i<${#lines[@]}; i++ )); do - line="${lines[$i]}" - host_name=$(echo "$line" | awk '{print $1}') - # 根据inventory.ini文件中节点顺序,将NODE_RANK写入docker的环境变量中 - ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" -done -# 在docker内运行环境设置的脚本 -ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" -``` -使用方式: -```Bash -bash set_docker.sh 
inventory.ini
-```
-
-## 第二步:进行测试
-
-### 2.1 自动测试与日志搜集
-
-编写一个测试命令脚本文件(./ansible_workspace/train.sh)
-```Bash
-#!/bin/bash
-set -ex
-if [ $# -ne 1 ]; then
-    echo "Usage: $0 num_nodes"
-    exit 1
-fi
-NUM_NODES="$1"
-docker_name="cd_test_new"
-ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'"
-```
-
-- 需要一个参数: 节点数,
-- 运行该命令能够自动启动相应数量的节点运行。
-- 运行结束后收集日志到主节点。
-- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义
-
-### 2.2 自动日志解析
-
-可以使用2.1节提供的命令运行多次,比如:
-
-```bash
-train.sh 1
-train.sh 2
-train.sh 4
-train.sh 8
-train.sh 16
-```
-
-完成后应该保存了多个日志目录,需要编写一个日志处理脚本,从这些日志目录中提取性能数据并制成 markdown 格式的表格
-
-注:不需要完整训练,训练稳定后获取到数据就可以了。
-
-### 2.3 自动 nsys 性能测试
-
-需要编写一个能够运行 nsys 的性能测试脚本文件(./ansible_workspace/profile.sh),和2.1的脚本类似,只是启动时需要调用nsys,我们需要搜集这些信息分析,然后进行优化。这个脚本文件。
-```Bash
-#!/bin/bash
-set -ex
-if [ $# -ne 1 ]; then
-    echo "Usage: $0 num_nodes"
-    exit 1
-fi
-NUM_NODES="$1"
-docker_name="cd_test_new"
-ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'"
-```
-- 需要一个参数: 节点数,
-- 运行该命令能够自动启动相应数量的节点运行。
-- 运行结束后收集日志和nsys相关文件到主节点。
-- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义
\ No newline at end of file

From 3f7a512cc009bf1703c7c1f71cb3d90dec2e95b7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 00:54:33 +0000
Subject: [PATCH 15/26] support distributed training

---
 .../resnet50/2_distributed_training/d.sh      |  1 +
 .../2_distributed_training/dist_training.yml  | 51 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100755 Classification/resnet50/2_distributed_training/d.sh
 create mode 100644 Classification/resnet50/2_distributed_training/dist_training.yml

diff --git a/Classification/resnet50/2_distributed_training/d.sh b/Classification/resnet50/2_distributed_training/d.sh
new file mode 100755
index 0000000..db2ac66
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/d.sh
@@ -0,0 +1 @@
+ansible-playbook -i ../inventory.ini dist_training.yml
diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
new file mode 100644
index 0000000..2e214bd
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -0,0 +1,51 @@
+---
+- name: Distributed Training Setup
+  hosts: all
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['all'] | length }}"
+    master_addr: "{{ hostvars[groups['all'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['all'].index(inventory_hostname) }}"
+
+    - name: Run dist_train.sh in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 1000 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+

From 8911664fcc424eddb6fcc0368f1748afdf2b4f14 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 01:05:49 +0000
Subject: [PATCH 16/26] update

---
 .../resnet50/2_distributed_training/README.md | 40 +++++++++++++++++++
 .../resnet50/2_distributed_training/d.sh      |  1 -
 .../run_dist_training.sh                      | 15 +++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 Classification/resnet50/2_distributed_training/README.md
 delete mode 100755 Classification/resnet50/2_distributed_training/d.sh
 create mode 100755 Classification/resnet50/2_distributed_training/run_dist_training.sh

diff --git a/Classification/resnet50/2_distributed_training/README.md b/Classification/resnet50/2_distributed_training/README.md
new file mode 100644
index 0000000..0f6ddc8
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/README.md
@@ -0,0 +1,40 @@
+# run_dist_training.sh 使用说明
+
+`run_dist_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./run_dist_training.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./run_dist_training.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./run_dist_training.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./run_dist_training.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+```
+
diff --git a/Classification/resnet50/2_distributed_training/d.sh b/Classification/resnet50/2_distributed_training/d.sh
deleted file mode 100755
index db2ac66..0000000
--- a/Classification/resnet50/2_distributed_training/d.sh
+++ /dev/null
@@ -1 +0,0 @@
-ansible-playbook -i ../inventory.ini dist_training.yml
diff --git a/Classification/resnet50/2_distributed_training/run_dist_training.sh b/Classification/resnet50/2_distributed_training/run_dist_training.sh
new file mode 100755
index 0000000..3747a5f
--- /dev/null
+++ b/Classification/resnet50/2_distributed_training/run_dist_training.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini dist_training.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"

From cc6baa8b6df3076f0a143829bff7cc8b676a0b80 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 02:36:34 +0000
Subject: [PATCH 17/26] update README

---
 Classification/resnet50/2_distributed_training/README.md | 1 -
 Classification/resnet50/README.md                        | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Classification/resnet50/2_distributed_training/README.md b/Classification/resnet50/2_distributed_training/README.md
index 0f6ddc8..2ee78d3 100644
--- a/Classification/resnet50/2_distributed_training/README.md
+++ b/Classification/resnet50/2_distributed_training/README.md
@@ -36,5 +36,4 @@
 ## 注意
 
 如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
-```
 
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 3be3866..6cdb99e 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -16,6 +16,10 @@
 │   ├── pull_docker_image.yml # 拉取镜像 ansible playbook
 │   ├── pull.sh # 拉取镜像执行脚本
 │   └── README.md # 说明文件
+├── 2_distributed_training # 分布式训练
+│   ├── dist_training.yml # 用于分布式训练的 ansible playbook
+│   └── README.md # 说明文件
+│   └── run_dist_training.sh # 分布式训练执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
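The two patches above add `dist_training.yml` and its `run_dist_training.sh` wrapper. Before launching a full multi-node run, it can be worth a quick pre-flight check that Ansible reaches every host in `inventory.ini` and that the playbook parses cleanly. A minimal sketch, not part of the patch series itself; it assumes Ansible is installed on the control node and is run from `2_distributed_training/`, like `run_dist_training.sh`:

```bash
# Hypothetical pre-flight checks before running the playbook for real.
ansible all -i ../inventory.ini -m ping                                 # every node should answer "pong"
ansible-playbook -i ../inventory.ini dist_training.yml --syntax-check   # parse the playbook without running it
ansible-playbook -i ../inventory.ini dist_training.yml --list-hosts     # show which hosts would be targeted
```

If the ping fails for a node, that usually points back at the SSH key distribution step in `0_dist_ssh_key` rather than at the training command itself.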
From 60cde364a5bdec57134a18523ac0ef677b53c4f7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 02:46:08 +0000
Subject: [PATCH 18/26] fix

---
 Classification/resnet50/1_get_docker_image/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md
index 3f6282a..99709fd 100644
--- a/Classification/resnet50/1_get_docker_image/README.md
+++ b/Classification/resnet50/1_get_docker_image/README.md
@@ -40,9 +40,9 @@
 
 - 默认使用:
 
-  ```bash
-  ./load.sh
-  ```
+```bash
+./load.sh
+```
 
 - 指定镜像文件路径和标签:
 

From 0e4498f4b2131dfef4404f5b635c62016610abb8 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:06:02 +0000
Subject: [PATCH 19/26] one node training

---
 .../2_distributed_training/dist_training.yml  |  2 +-
 .../resnet50/3_1node_training/README.md       | 39 ++++++++++++++
 .../3_1node_training/one_node_training.yml    | 51 +++++++++++++++++++
 .../3_1node_training/run_one_node_training.sh | 15 ++++++
 Classification/resnet50/inventory.ini         |  2 +
 5 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 Classification/resnet50/3_1node_training/README.md
 create mode 100644 Classification/resnet50/3_1node_training/one_node_training.yml
 create mode 100755 Classification/resnet50/3_1node_training/run_one_node_training.sh

diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
index 2e214bd..cf20099 100644
--- a/Classification/resnet50/2_distributed_training/dist_training.yml
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -13,7 +13,7 @@
       set_fact:
         node_rank: "{{ groups['all'].index(inventory_hostname) }}"
 
-    - name: Run dist_train.sh in Docker container
+    - name: distributed training in Docker container
       command: >
         docker run --rm --gpus all
         --runtime=nvidia --privileged
diff --git a/Classification/resnet50/3_1node_training/README.md b/Classification/resnet50/3_1node_training/README.md
new file mode 100644
index 0000000..02203a4
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/README.md
@@ -0,0 +1,39 @@
+# run_one_node_training.sh 使用说明
+
+`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./run_one_node_training.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./run_one_node_training.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./run_one_node_training.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./run_one_node_training.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+
diff --git a/Classification/resnet50/3_1node_training/one_node_training.yml b/Classification/resnet50/3_1node_training/one_node_training.yml
new file mode 100644
index 0000000..9563d03
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/one_node_training.yml
@@ -0,0 +1,51 @@
+---
+- name: Distributed Training Setup
+  hosts: one_node
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['one_node'] | length }}"
+    master_addr: "{{ hostvars[groups['one_node'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['one_node'].index(inventory_hostname) }}"
+
+    - name: Training on one node in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 1000 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+
diff --git a/Classification/resnet50/3_1node_training/run_one_node_training.sh b/Classification/resnet50/3_1node_training/run_one_node_training.sh
new file mode 100755
index 0000000..ab5ebee
--- /dev/null
+++ b/Classification/resnet50/3_1node_training/run_one_node_training.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini one_node_training.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"
diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini
index 01027fd..c51a8a2 100644
--- a/Classification/resnet50/inventory.ini
+++ b/Classification/resnet50/inventory.ini
@@ -1,3 +1,5 @@
+[one_node]
+of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no'
 [all]
 of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no'
 of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no'

From ecdbbbb449e36cda53ad0a005478e5e03a68835a Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:31:33 +0000
Subject: [PATCH 20/26] update

---
 Classification/resnet50/3_1node_training/README.md     | 2 +-
 .../resnet50/3_1node_training/one_node_training.yml    | 2 +-
 Classification/resnet50/README.md                      | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Classification/resnet50/3_1node_training/README.md b/Classification/resnet50/3_1node_training/README.md
index 02203a4..fb6a81f 100644
--- a/Classification/resnet50/3_1node_training/README.md
+++ b/Classification/resnet50/3_1node_training/README.md
@@ -1,6 +1,6 @@
 # run_one_node_training.sh 使用说明
 
-`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+`run_one_node_training.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动单节点上的训练。此脚本支持通过参数指定 Docker 镜像和源目录。
 
 ## 用法
 
diff --git a/Classification/resnet50/3_1node_training/one_node_training.yml b/Classification/resnet50/3_1node_training/one_node_training.yml
index 9563d03..6709ac7 100644
--- a/Classification/resnet50/3_1node_training/one_node_training.yml
+++ b/Classification/resnet50/3_1node_training/one_node_training.yml
@@ -1,5 +1,5 @@
 ---
-- name: Distributed Training Setup
+- name: One Node Training Setup
   hosts: one_node
   vars:
     device_num_per_node: 8
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 6cdb99e..86aa36a 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -20,6 +20,10 @@
 │   ├── dist_training.yml # 用于分布式训练的 ansible playbook
 │   └── README.md # 说明文件
 │   └── run_dist_training.sh # 分布式训练执行脚本
+├── 3_1node_training # 在一个节点上训练,用于获得基准
+│   ├── one_node_training.yml # 单节点训练playbook
+│   ├── README.md # 说明文件
+│   └── run_one_node_training.sh # 单节点训练执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```

From b77913ebae50fc778e5406aecfff757c44d5678e Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:34:10 +0000
Subject: [PATCH 21/26] profiling

---
 Classification/resnet50/4_profiling/README.md | 39 ++++++++++++++
 .../resnet50/4_profiling/profiling.yml        | 52 +++++++++++++++++++
 .../resnet50/4_profiling/run_profiling.sh     | 15 ++++++
 Classification/resnet50/README.md             |  4 ++
 4 files changed, 110 insertions(+)
 create mode 100644 Classification/resnet50/4_profiling/README.md
 create mode 100644 Classification/resnet50/4_profiling/profiling.yml
 create mode 100755 Classification/resnet50/4_profiling/run_profiling.sh

diff --git a/Classification/resnet50/4_profiling/README.md b/Classification/resnet50/4_profiling/README.md
new file mode 100644
index 0000000..90b314e
--- /dev/null
+++ b/Classification/resnet50/4_profiling/README.md
@@ -0,0 +1,39 @@
+# profiling.sh 使用说明
+
+`profiling.sh` 是一个 Bash 脚本,用于运行 `ansible-playbook` 命令来启动分布式训练同时采集性能相关信息。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./profiling.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行:
+
+```bash
+./profiling.sh
+```
+
+2. 指定 Docker 镜像运行:
+
+```bash
+./profiling.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行:
+
+```bash
+./profiling.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数,脚本将使用默认的 Docker 镜像和源目录。
+
diff --git a/Classification/resnet50/4_profiling/profiling.yml b/Classification/resnet50/4_profiling/profiling.yml
new file mode 100644
index 0000000..5969fef
--- /dev/null
+++ b/Classification/resnet50/4_profiling/profiling.yml
@@ -0,0 +1,52 @@
+---
+- name: Distributed Profiling Setup
+  hosts: all
+  vars:
+    device_num_per_node: 8
+    num_nodes: "{{ groups['all'] | length }}"
+    master_addr: "{{ hostvars[groups['all'][0]].ansible_host }}"
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+  tasks:
+    - name: Set node rank
+      set_fact:
+        node_rank: "{{ groups['all'].index(inventory_hostname) }}"
+
+    - name: distributed training in Docker container
+      command: >
+        docker run --rm --gpus all
+        --runtime=nvidia --privileged
+        --network host --ipc=host
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+        nsys profile --stats=true \
+        python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 100 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output
+
+    - name: Display output
+      debug:
+        var: output.stdout
+
diff --git a/Classification/resnet50/4_profiling/run_profiling.sh b/Classification/resnet50/4_profiling/run_profiling.sh
new file mode 100755
index 0000000..d51843f
--- /dev/null
+++ b/Classification/resnet50/4_profiling/run_profiling.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+DOCKER_IMAGE="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+SRC="/share_nfs/k85/models/Vision/classification/image/resnet50"
+
+if [ -n "$1" ]; then
+    DOCKER_IMAGE="$1"
+fi
+
+if [ -n "$2" ]; then
+    SRC="$2"
+fi
+
+# 运行 ansible-playbook 命令
+ansible-playbook -i ../inventory.ini profiling.yml -e "docker_image=${DOCKER_IMAGE}" -e "src=${SRC}"
diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index 86aa36a..acc3769 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -24,6 +24,10 @@
 │   ├── one_node_training.yml # 单节点训练playbook
 │   ├── README.md # 说明文件
 │   └── run_one_node_training.sh # 单节点训练执行脚本
+├── 4_profiling # 使用nsys采集性能相关信息
+│   ├── profiling.yml # profiling ansible playbook
+│   ├── README.md # 说明文件
+│   └── run_profiling.sh # 采集信息执行脚本
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
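`profiling.yml` above wraps the same launch command in `nsys profile --stats=true`, so each node prints a summary table and also writes a report file into the container's working directory, which is the NFS-mounted `src` path. A sketch of how the reports might be inspected afterwards from the master node (assumptions: `nsys` is installed where this runs, and the default `report*.nsys-rep` naming is in effect; adjust the path and file names to whatever the run actually produced):

```bash
# Hypothetical post-processing of the nsys reports left on the shared directory.
cd /share_nfs/k85/models/Vision/classification/image/resnet50
ls -lh report*.nsys-rep        # one report per profiled launch
nsys stats report1.nsys-rep    # re-generate the summary tables for a single report
```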
From 2257c7a4bd2fb3cd117170ab4331cc77b5849b85 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 2 Aug 2024 03:42:31 +0000
Subject: [PATCH 22/26] update

---
 Classification/resnet50/README.md | 115 ++++++++++++++++++++++------
 1 file changed, 93 insertions(+), 22 deletions(-)

diff --git a/Classification/resnet50/README.md b/Classification/resnet50/README.md
index acc3769..33b62e8 100644
--- a/Classification/resnet50/README.md
+++ b/Classification/resnet50/README.md
@@ -1,37 +1,108 @@
 # 使用Ansible在多节点环境分布式训练
 
-文件目录
+## 目录结构
 
 ```
 .
 ├── 0_dist_ssh_key # 分发 SSH 公钥到各个节点
-│   ├── distribute_ssh_key.yml # ansible playbook
-│   ├── dist_ssh_key.sh # 执行脚本
-│   ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置
-│   ├── README.md # 说明文件
-│   └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用
-├── 1_get_docker_image # 各个节点获取 docker 镜像
-│   ├── load_and_tag_docker_image.yml # 导入镜像 ansible playbook
-│   ├── load.sh # 导入镜像执行脚本
-│   ├── pull_docker_image.yml # 拉取镜像 ansible playbook
-│   ├── pull.sh # 拉取镜像执行脚本
-│   └── README.md # 说明文件
+│ ├── distribute_ssh_key.yml # Ansible playbook
+│ ├── dist_ssh_key.sh # 执行脚本
+│ ├── inventory.ini # 仅用于分发公钥的主机清单文件,需要根据实际情况配置
+│ ├── README.md # 说明文件
+│ └── vars.yml # 初始未加密的用户密码文件,需经过配置并加密后使用
+├── 1_get_docker_image # 各个节点获取 Docker 镜像
+│ ├── load_and_tag_docker_image.yml # 导入镜像 Ansible playbook
+│ ├── load.sh # 导入镜像执行脚本
+│ ├── pull_docker_image.yml # 拉取镜像 Ansible playbook
+│ ├── pull.sh # 拉取镜像执行脚本
+│ └── README.md # 说明文件
 ├── 2_distributed_training # 分布式训练
-│   ├── dist_training.yml # 用于分布式训练的 ansible playbook
-│   └── README.md # 说明文件
-│   └── run_dist_training.sh # 分布式训练执行脚本
+│ ├── dist_training.yml # 用于分布式训练的 Ansible playbook
+│ ├── run_dist_training.sh # 分布式训练执行脚本
+│ └── README.md # 说明文件
 ├── 3_1node_training # 在一个节点上训练,用于获得基准
-│   ├── one_node_training.yml # 单节点训练playbook
-│   ├── README.md # 说明文件
-│   └── run_one_node_training.sh # 单节点训练执行脚本
-├── 4_profiling # 使用nsys采集性能相关信息
-│   ├── profiling.yml # profiling ansible playbook
-│   ├── README.md # 说明文件
-│   └── run_profiling.sh # 采集信息执行脚本
+│ ├── one_node_training.yml # 单节点训练 Ansible playbook
+│ ├── run_one_node_training.sh # 单节点训练执行脚本
+│ └── README.md # 说明文件
+├── 4_profiling # 使用 nsys 采集性能相关信息
+│ ├── profiling.yml # Profiling Ansible playbook
+│ ├── run_profiling.sh # 采集信息执行脚本
+│ └── README.md # 说明文件
 ├── inventory.ini # 主机清单文件,需要根据实际情况配置
 └── README.md # 说明文件
 ```
+
+## 分步说明
+
+### 0_dist_ssh_key
+
+该目录用于分发 SSH 公钥到各个节点。
+
+- `distribute_ssh_key.yml`: Ansible playbook,用于分发公钥。
+- `dist_ssh_key.sh`: 执行分发公钥的脚本。
+- `inventory.ini`: 主机清单文件,需要根据实际情况配置。
+- `vars.yml`: 初始未加密的用户密码文件,需经过配置并加密后使用。
+
+### 1_get_docker_image
+
+该目录用于在各个节点上获取 Docker 镜像。
+
+- `load_and_tag_docker_image.yml`: Ansible playbook,用于导入 Docker 镜像并设置标签。
+- `load.sh`: 导入镜像执行脚本。
+- `pull_docker_image.yml`: Ansible playbook,用于拉取 Docker 镜像。
+- `pull.sh`: 拉取镜像执行脚本。
+
+### 2_distributed_training
+
+该目录用于执行分布式训练。
+
+- `dist_training.yml`: 用于分布式训练的 Ansible playbook。
+- `run_dist_training.sh`: 分布式训练执行脚本。
+
+### 3_1node_training
+
+该目录用于在一个节点上训练,以获得基准。
+
+- `one_node_training.yml`: 单节点训练 Ansible playbook。
+- `run_one_node_training.sh`: 单节点训练执行脚本。
+
+### 4_profiling
+
+该目录用于使用 nsys 采集性能相关信息。
+
+- `profiling.yml`: Profiling Ansible playbook。
+- `run_profiling.sh`: 采集信息执行脚本。
+
+## 使用方法
+
+1. **分发 SSH 公钥**:
+```sh
+cd 0_dist_ssh_key
+./dist_ssh_key.sh
+```
+
+2. **获取 Docker 镜像**:
+```sh
+cd 1_get_docker_image
+./pull.sh  # 或者 ./load.sh
+```
+
+3. **执行分布式训练**:
+```sh
+cd 2_distributed_training
+./run_dist_training.sh [docker_image] [src]
+```
+
+4. **在一个节点上训练**:
+```sh
+cd 3_1node_training
+./run_one_node_training.sh
+```
+
+5. **采集性能相关信息**:
+```sh
+cd 4_profiling
+./run_profiling.sh
+```
+
+注意:在运行这些脚本之前,请确保已经正确配置了 `inventory.ini` 文件中的主机信息。

From b82da5b6db15c04070ee5ab7d99fed64150389d7 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Mon, 5 Aug 2024 04:07:28 +0000
Subject: [PATCH 23/26] fix

---
 Classification/resnet50/0_dist_ssh_key/README.md       | 5 +++++
 .../resnet50/0_dist_ssh_key/distribute_ssh_key.yml     | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md
index 856df74..54797c8 100644
--- a/Classification/resnet50/0_dist_ssh_key/README.md
+++ b/Classification/resnet50/0_dist_ssh_key/README.md
@@ -1,4 +1,9 @@
 # 使用 Ansible 将 SSH 公钥分发到多个目标主机
+## 0. 安装Ansible
+
+```bash
+pip install ansible-vault
+```
 
 ## 1. 创建变量文件并加密
 
diff --git a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
index 95147c2..685bcee 100644
--- a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
+++ b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
@@ -16,4 +16,4 @@
     authorized_key:
       user: "{{ ansible_user }}"
       state: present
-      key: "{{ lookup('file', '/home/xiexuan/.ssh/id_rsa.pub') }}"
+      key: "{{ lookup('file', '/home/用户名/.ssh/id_rsa.pub') }}"

From d25bb4a57351dffe3ea30a8a99e28891643921dc Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Mon, 5 Aug 2024 04:09:39 +0000
Subject: [PATCH 24/26] update

---
 Classification/resnet50/1_get_docker_image/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md
index 99709fd..21e1813 100644
--- a/Classification/resnet50/1_get_docker_image/README.md
+++ b/Classification/resnet50/1_get_docker_image/README.md
@@ -1,5 +1,7 @@
 # 拉取或导入镜像
 
+注: 用户需要有各台机器的docker权限
+
 ## 拉取镜像
 
 适用于直接从 dockerhub 拉取镜像。

From 5aa4ac12c4accd1f323cabbef859cd704e3f3f4a Mon Sep 17 00:00:00 2001
From: XIE Xuan
Date: Mon, 5 Aug 2024 15:21:47 +0800
Subject: [PATCH 25/26] Update README.md

---
 Classification/resnet50/0_dist_ssh_key/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md
index 54797c8..a7e7be7 100644
--- a/Classification/resnet50/0_dist_ssh_key/README.md
+++ b/Classification/resnet50/0_dist_ssh_key/README.md
@@ -1,4 +1,6 @@
 # 使用 Ansible 将 SSH 公钥分发到多个目标主机
+image
+
 ## 0. 安装Ansible
 
 ```bash

From 9b292186c43830d4d1a2496a149699d70149f74b Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Wed, 7 Aug 2024 07:06:13 +0000
Subject: [PATCH 26/26] update

---
 .../resnet50/2_distributed_training/dist_training.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
index cf20099..c3f9dd5 100644
--- a/Classification/resnet50/2_distributed_training/dist_training.yml
+++ b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -37,8 +37,6 @@
             --use-fp16 \
             --metric-local False \
             --metric-train-acc True \
-            --fuse-bn-relu \
-            --fuse-bn-add-relu \
             --use-gpu-decode \
             --channel-last \
             --skip-eval
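The series ends with patch 26 trimming `--fuse-bn-relu` and `--fuse-bn-add-relu` from the launch command. After changes like this, a cheap cluster-wide sanity check that every node can still start the container and see its GPUs helps separate environment problems from training-script problems. A sketch only, assuming it is run from the directory containing `inventory.ini` and reusing the image name from the playbooks:

```bash
# Hypothetical check: list the GPUs visible inside the training container on every node.
ansible all -i inventory.ini -m shell \
  -a "docker run --rm --gpus all oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8 nvidia-smi -L"
```

Each host should report the expected eight GPUs before a full `run_dist_training.sh` or `run_profiling.sh` pass is attempted.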