From 7e58cce9a6365d85dfbc2026d6a07da6c8de3437 Mon Sep 17 00:00:00 2001 From: wangna11 Date: Mon, 20 Mar 2023 08:17:02 +0000 Subject: [PATCH] add PETR for 3d --- .gitmodules | 4 + OtherFrame/3d/PyTorch/petr/README.md | 87 +++++++++++++++++++ OtherFrame/3d/PyTorch/petr/run_PyTorch.sh | 24 +++++ .../3d/PyTorch/petr/scripts/PrepareEnv.sh | 64 ++++++++++++++ .../3d/PyTorch/petr/scripts/analysis_log.py | 53 +++++++++++ .../3d/PyTorch/petr/scripts/run_benchmark.sh | 62 +++++++++++++ 6 files changed, 294 insertions(+) create mode 100644 OtherFrame/3d/PyTorch/petr/README.md create mode 100644 OtherFrame/3d/PyTorch/petr/run_PyTorch.sh create mode 100644 OtherFrame/3d/PyTorch/petr/scripts/PrepareEnv.sh create mode 100644 OtherFrame/3d/PyTorch/petr/scripts/analysis_log.py create mode 100644 OtherFrame/3d/PyTorch/petr/scripts/run_benchmark.sh diff --git a/.gitmodules b/.gitmodules index b134053793..c8e4c49b3e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -125,3 +125,7 @@ [submodule "OtherFrame/ocr/TensorFlow/models/EAST"] path = OtherFrame/ocr/TensorFlow/models/EAST url = https://github.com/argman/EAST.git +[submodule "OtherFrame/3d/PyTorch/petr/models/petr"] + path = OtherFrame/3d/PyTorch/petr/models/petr + url = https://github.com/wangna11BD/PETR.git + branch = main \ No newline at end of file diff --git a/OtherFrame/3d/PyTorch/petr/README.md b/OtherFrame/3d/PyTorch/petr/README.md new file mode 100644 index 0000000000..4d142bc647 --- /dev/null +++ b/OtherFrame/3d/PyTorch/petr/README.md @@ -0,0 +1,87 @@ +# PyTorch 生成模型 性能复现 +## 目录 + +``` +├── README.md # 说明文档 +├── run_PyTorch.sh # 执行入口,包括环境搭建、测试获取所有生成模型的训练性能 +├── scripts/PrepareEnv.sh # PyTorch和PETR运行环境搭建、训练数据下载 +├── scripts/analysis_log.py # 分析训练的log得到训练性能的数据 +├── scripts/run_benchmark.sh # 执行实体,测试单个生成模型的训练性能 +└── models # 提供竞品PyTorch框架的repo +``` + +## 环境介绍 +### 物理机环境 +- 单机(单卡、4卡、8卡) + - 系统:CentOS release 7.5 (Final) + - GPU:Tesla V100-SXM2-32GB * 8 + - CPU:Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz * 80 + - CUDA、cudnn 
Version: cuda10.2-cudnn7 + +### Docker 镜像 + +- **镜像版本**: `registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7` +- **PyTorch 版本**: `1.9.1` +- **CUDA 版本**: `10.2` +- **cuDnn 版本**: `7` + +## 测试步骤 + +```bash +bash run_PyTorch.sh # 创建容器,在该标准环境中测试模型 +``` + +如果在docker内部按住torch等框架耗时很久,可以设置代理。下载测试数据的时候,需要关闭代理,否则下载耗时很久。 + +脚本内容,如: + +```bash +#!/usr/bin/env bash +ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7"; +docker pull ${ImageName} +run_cmd="cp /workspace/scripts/PrepareEnv.sh ./; + bash PrepareEnv.sh; + cd /workspace/models/mmedi; + cp -r /workspace/mmedi_benchmark_configs ./; + cp /workspace/scripts/run_benchmark.sh ./; + cp /workspace/scripts/analysis_log.py ./; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs32 sp fp32 32 300 4; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs64 sp fp32 64 300 4; + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs32 mp fp32 32 300 4; + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs64 mp fp32 64 300 4; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs4 sp fp32 4 300 3; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs64 sp fp32 64 300 3; + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs4 mp fp32 4 300 3; + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs64 mp fp32 64 300 3; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs2 sp fp32 2 300 4; + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs4 sp fp32 4 300 4; + CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs2 mp fp32 2 300 4; + CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs4 mp fp32 4 300 4; + " +nvidia-docker run --name test_torch_gan -i \ + --net=host \ + --shm-size=128g \ + -v $PWD:/workspace \ + ${ImageName} /bin/bash -c "${run_cmd}" +nvidia-docker stop test_torch_gan +nvidia-docker rm test_torch_gan +``` + +## 输出 + 
执行完成后，在当前目录会产出3D检测模型训练性能数据的文件，比如`petr_sp_bs1_fp32_1_speed`等文件，内容如下所示。

```bash
{
"log_file": "/workspace/models/petr/petr_sp_bs1_fp32_1_speed", \ # log 目录，创建规范见PrepareEnv.sh
"model_name": "petr_sp_bs1", \ # 模型case名，创建规范：repoName_模型名_bs${bs_item}_${fp_item}
"mission_name": "3D检测", \ # 模型case所属任务名称，具体可参考scripts/config.ini
"direction_id": 0, \ # 模型case所属方向id，0:CV|1:NLP|2:Rec 具体可参考benchmark/scripts/config.ini
"run_mode": "sp", \ # 单卡：sp|多卡：mp
"index": 1, \ # 速度验证默认为1
"gpu_num": 1, \ # 1|8
"FINAL_RESULT": 75.655, \ # 速度计算后的平均值，需要skip掉不稳定的前几步值
"JOB_FAIL_FLAG": 0, \ # 该模型case运行0：成功|1：失败
"UNIT": "images/s" \ # 速度指标的单位
}
```
#!/usr/bin/env bash
# Common environment setup for the PETR benchmark: create log directories,
# pin python/pip, install PyTorch + mmcv/mmdet/mmseg/mmdet3d, and stage the
# training data. Runs inside the benchmark container; working dir: /workspace.
echo "*******prepare benchmark***********"

################################# create log directories
export BENCHMARK_ROOT=/workspace
log_date=$(date "+%Y.%m%d.%H%M%S")
# NOTE(review): label says 1.9.0 but the wheel installed below is 1.9.1 —
# kept as-is because the string only feeds the log-directory name; confirm.
frame=pytorch1.9.0
cuda_version=10.2
save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/

if [[ -d "${save_log_dir}" ]]; then
    rm -rf "${save_log_dir}"
fi
# run_benchmark.sh reads TRAIN_LOG_DIR to decide where raw logs go.
export TRAIN_LOG_DIR=${save_log_dir}/train_log
mkdir -p "${TRAIN_LOG_DIR}"

log_path=${TRAIN_LOG_DIR}

################################# pin python3.7/pip3.7 as "python"/"pip"
rm -rf run_env
mkdir run_env
ln -s "$(which python3.7)" run_env/python
ln -s "$(which pip3.7)" run_env/pip
export PATH=/workspace/run_env:${PATH}

################################# install frameworks
pip install -U pip
pip --version
pip install torch==1.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html
pip install torchvision==0.10.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html

# mmcv-full wheel must match the torch/cuda combination installed above.
pip install mmcv-full==1.4.0 -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html

cd /workspace/models
git clone https://github.com/open-mmlab/mmdetection.git
cd /workspace/models/mmdetection
git checkout v2.24.1
pip install -r requirements/build.txt
python setup.py develop

pip install mmsegmentation==0.20.2

cd /workspace/models
git clone https://github.com/open-mmlab/mmdetection3d.git
cd /workspace/models/mmdetection3d
git checkout v0.17.1
pip install -r requirements/build.txt
python setup.py develop

# PETR expects an mmdetection3d checkout next to it plus a ckpts directory.
cd /workspace/models/petr
mkdir -p ckpts
ln -s /workspace/models/mmdetection3d /workspace/models/petr/mmdetection3d

################################# prepare training data
mkdir -p data
# nuScenes is too large to download on every run: copy the dataset into
# ./data and symlink it to /data/Dataset/nuScenes, e.g.
# cp -r /nuscenes_dataste_root data/
# ln -s /nuscenes_dataste_root /data/Dataset/nuScenes

echo "*******prepare benchmark end***********"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
"""Parse a PETR (mmdetection3d) training log and emit benchmark speed data as JSON."""

import re
import sys
import json


def analyze(model_name, batch_size, log_file, res_log_file):
    """Compute average ips (images/s) from a training log and write a result JSON.

    Args:
        model_name: benchmark case name, e.g. "petr_sp_fp32_bs1".
        batch_size: per-iteration batch size (string or number).
        log_file: path of the raw training log to parse.
        res_log_file: path the one-line JSON summary is written to.
    """
    # Device list header, e.g. "GPU 0,1,2,3:". Restrict the capture to
    # digits/commas: a greedy r"GPU (.*):" would run to the LAST colon of the
    # whole joined log text.
    gpu_ids_pat = re.compile(r"GPU ([\d,]+):")
    # Per-iteration timing, e.g. "time: 0.500, data_time: 0.010". Non-greedy so
    # every iteration yields its own match after the lines are joined below
    # (the greedy original produced one huge capture and crashed float()).
    time_pat = re.compile(r"time: (.*?), data_time")

    with open(log_file) as f:
        logs = ";".join(f.readlines())
    gpu_ids_res = gpu_ids_pat.findall(logs)
    time_res = time_pat.findall(logs)

    fail_flag = 0
    run_mode = ""
    gpu_num = 0
    ips = 0

    # The first iteration carries warm-up cost and is skipped, so strictly
    # more than skip_num samples are required (also avoids division by zero).
    skip_num = 1
    if not gpu_ids_res or len(time_res) <= skip_num:
        fail_flag = 1
    else:
        # Count the listed device ids. (The old int(last_char) parsing
        # reported 7 instead of 8 for "0,1,2,3,4,5,6,7".)
        gpu_num = len(gpu_ids_res[0].split(","))
        run_mode = "sp" if gpu_num == 1 else "mp"

        times = [float(t) for t in time_res[skip_num:]]
        avg_time = sum(times) / len(times)
        ips = float(batch_size) * round(1 / avg_time, 3)

    info = {"log_file": log_file, "model_name": model_name, "mission_name": "3D检测",
            "direction_id": 0, "run_mode": run_mode, "index": 1, "gpu_num": gpu_num,
            "FINAL_RESULT": ips, "JOB_FAIL_FLAG": fail_flag, "UNIT": "images/s"}
    with open(res_log_file, "w") as of:
        of.write(json.dumps(info))


if __name__ == "__main__":
    if len(sys.argv) != 5:
        # Usage must list all four required arguments (batch_size was missing).
        print("Usage: " + sys.argv[0] +
              " model_name batch_size path/to/log/file path/to/res/log/file")
        sys.exit(1)

    analyze(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
#!/usr/bin/env bash
set -xe

# Test training benchmark for one PETR model configuration.
#
# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_name} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers}

# Parse positional parameters and derive log-file names from the visible GPUs.
function _set_params(){
    model_name=${1:-"model_name"}
    run_mode=${2:-"sp"}            # sp (single GPU) or mp (multi GPU)
    fp_item=${3:-"fp32"}           # fp32 or fp16
    batch_size=${4:-"2"}
    max_iter=${5:-"100"}
    num_workers=${6:-"3"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}

    # Count devices by splitting CUDA_VISIBLE_DEVICES on commas.
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}
    res_log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}_speed
}

# Turn the raw log into the *_speed JSON and publish both files to /workspace.
function _analysis_log(){
    python analysis_log.py "${model_name}" "${batch_size}" "${log_file}" "${res_log_file}"
    cp "${log_file}" /workspace
    cp "${res_log_file}" /workspace
}

function _train(){
    echo "Train ${model_name} on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

    train_config="projects/configs/petrv2/${model_name}.py"
    train_options="work_dirs/${model_name}/"

    case ${run_mode} in
    sp) train_cmd="./tools/dist_train.sh ${train_config} 1 --work-dir ${train_options}" ;;
    mp) train_cmd="./tools/dist_train.sh ${train_config} 8 --work-dir ${train_options}" ;;
    *) echo "choose run_mode(sp or mp)"; exit 1 ;;
    esac

    # ${train_cmd} is intentionally unquoted: it must word-split into a command.
    # Use `if ! cmd` instead of `cmd; if [ $? -ne 0 ]`: under `set -e` the
    # original form exited before the failure branch could ever run.
    if ! timeout 15m ${train_cmd} > "${log_file}" 2>&1; then
        echo -e "${model_name}, FAIL"
        export job_fail_flag=1
    else
        echo -e "${model_name}, SUCCESS"
        export job_fail_flag=0
    fi

    # Multi-GPU launchers may write per-worker logs; keep worker 0's log.
    if [[ ${run_mode} = "mp" && -d mylog ]]; then
        rm "${log_file}"
        cp mylog/workerlog.0 "${log_file}"
    fi

    _analysis_log

    # Reap stray trainer processes; guard so cleanup can never fail the script.
    # (The old `kill -9 $(ps -ef|grep python|awk ...)` also targeted the grep
    # itself and aborted under `set -e` whenever no python process existed.)
    pgrep -f python | xargs -r kill -9 || true
}

_set_params "$@"
_train