From 30c3ac55f46f8d59aee317e54aecd8ef5074c865 Mon Sep 17 00:00:00 2001
From: Ang Wang
Date: Thu, 2 Jan 2025 17:29:38 +0800
Subject: [PATCH] Optimize the accuracy benchmark workflow (#44)

* Schedule the daily accuracy benchmark at 18:00 UTC (2:00 AM Beijing
  time) instead of 19:00 UTC.
* Run the benchmark inside the $UT_IMAGE container. The secrets are
  written to a .env file that is handed to `docker run --env-file` and
  removed afterwards, even when the benchmark step fails.
* Add `set -e` to fastchat.sh, llama.sh and run.sh.
* fastchat.sh: set NUM_GPUS_TOTAL to 4 and detect fschat with `grep -q`.
* run.sh: upload the results under a per-month OSS prefix, then exit 1
  when a metric is not numeric or when the torch and acc train losses
  differ by more than 1e-2.
* Dockerfile.base: fix the malformed `rm -rf` of the apt lists.
---
 .github/workflows/accuracy_benchmark.yml | 41 ++++++++++++++++++------
 benchmarks/accuracy/fastchat.sh          |  7 +++--
 benchmarks/accuracy/llama.sh             |  2 ++
 benchmarks/accuracy/run.sh               | 37 +++++++++++++++++++++++++++++++---
 docker/Dockerfile.base                   |  2 +-
 5 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/accuracy_benchmark.yml b/.github/workflows/accuracy_benchmark.yml
index 16c2473..abadc34 100644
--- a/.github/workflows/accuracy_benchmark.yml
+++ b/.github/workflows/accuracy_benchmark.yml
@@ -3,8 +3,8 @@ name: Daily Accuracy Benchmark
 on:
   workflow_dispatch:
   schedule:
-    # Runs daily at 3:00 AM, Beijing time.
-    - cron: '0 19 * * *' # This is UTC time
+    # Runs daily at 2:00 AM, Beijing time.
+    - cron: '0 18 * * *' # This is UTC time
 
 jobs:
   accuracy_benchmark:
@@ -14,12 +14,35 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v3
 
+      - name: Create .env file
+        run: |
+          echo "OSS_AK_ID=${{ secrets.OSS_AK_ID }}" > .env  # '>' truncates a .env left over from an earlier run
+          echo "OSS_AK_SECRET=${{ secrets.OSS_AK_SECRET }}" >> .env
+          echo "OSS_ENDPOINT=${{ secrets.OSS_ENDPOINT }}" >> .env
+          echo "M6_TENANT=${{ secrets.M6_TENANT }}" >> .env
+          echo "MIT_SPIDER_TOKEN=${{ secrets.MIT_SPIDER_TOKEN }}" >> .env
+          echo "MIT_SPIDER_URL=${{ secrets.MIT_SPIDER_URL }}" >> .env
+
       - name: Perform the accuracy benchmark
-        run: cd benchmarks/accuracy && bash ./run.sh
+        run: |
+          docker pull "$UT_IMAGE"
+          echo 'Running accuracy benchmark...'
+          docker run \
+            -v "$PWD:$PWD" \
+            -w "$PWD" \
+            --net host \
+            --ipc host \
+            --shm-size 80G \
+            -t --rm \
+            --gpus all \
+            --env-file .env \
+            "$UT_IMAGE" bash -c ' \
+              git config --global --add safe.directory "$PWD" && \
+              pip install -e . && \
+              cd benchmarks/accuracy && NPROC_PER_NODE=4 bash ./run.sh'
         env:
-          OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
-          OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
-          OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
-          M6_TENANT: ${{ secrets.M6_TENANT }}
-          MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
-          MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
+          UT_IMAGE: ${{ secrets.UT_IMAGE }}
+
+      - name: Clean up .env
+        if: always() # run even when the benchmark step failed, so the secrets file is always removed
+        run: rm -f .env
diff --git a/benchmarks/accuracy/fastchat.sh b/benchmarks/accuracy/fastchat.sh
index 334b34e..907723f 100644
--- a/benchmarks/accuracy/fastchat.sh
+++ b/benchmarks/accuracy/fastchat.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ "$#" -ne 1 ]; then
     echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** $0 "
     echo "You must provide exactly 1 parameters."
@@ -17,7 +19,7 @@ fi
 
 MODEL_DIR=$(realpath $1)
 MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S")
-NUM_GPUS_TOTAL=1
+NUM_GPUS_TOTAL=4
 JUDGMENT_PARALLEL=4
 
 function install_fastchat {
@@ -25,8 +27,7 @@ function install_fastchat {
         git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
     fi
 
-    output=$(python -m pip list | grep fschat)
-    if [[ -n $output ]]; then
+    if python -m pip list | grep -q fschat; then
         echo "All requirements are installed."
     else
         echo "Install requirements ..."
diff --git a/benchmarks/accuracy/llama.sh b/benchmarks/accuracy/llama.sh
index 6ecc301..267f5a2 100755
--- a/benchmarks/accuracy/llama.sh
+++ b/benchmarks/accuracy/llama.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [[ $# -ne 2 && $# -ne 3 ]]; then
     echo "Usage: $0 [checkpiont_output_dir]"
     echo " local_model_dir: Path to the local directory where the model will be saved."
diff --git a/benchmarks/accuracy/run.sh b/benchmarks/accuracy/run.sh
index 94bbd12..9591180 100755
--- a/benchmarks/accuracy/run.sh
+++ b/benchmarks/accuracy/run.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ "$#" -eq 1 ]; then
     MODEL_DIR=$(realpath "$1")
 elif [ "$#" -eq 0 ]; then
@@ -25,6 +27,7 @@ ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log"
 TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log"
 ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log"
 RES_LOG_FILE="$RES_FOLDER/result.log"
+OSS_BUCKET_PATH="oss://pai-devel/benchmark/accuracy/$(date +'%Y-%m')/$TIMESTAMP"
 
 mkdir -p $RES_FOLDER
 
@@ -55,14 +58,18 @@ function upload_to_oss {
             curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
         fi
         ossutil config -e ${OSS_ENDPOINT} -i ${OSS_AK_ID} -k ${OSS_AK_SECRET}
-        ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
+        ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" "$RES_FOLDER" "$OSS_BUCKET_PATH"
     else
         echo "No oss information found. Skip uploading to oss."
     fi
 }
 
 
-function collect_and_show_results {
+is_numeric() { # succeeds only if $1 is digits with an optional ".digits" part (no sign, no exponent)
+    [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
+}
+
+function collect_and_upload_results {
     # Collect and compare the results
     ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
     TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
@@ -100,9 +107,31 @@ function collect_and_show_results {
         echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
         echo -e "${BLUE}==========================================================${RESET}"
     } | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
+
+    upload_to_oss # runs before the checks below, so the results are uploaded even when they exit 1
+
+    # Check the results: exit 1 if any of these values is not a plain number
+    if ! is_numeric "$TORCH_TRAIN_LOSS" || \
+       ! is_numeric "$TORCH_TRAIN_RUNTIME" || \
+       ! is_numeric "$TORCH_TRAIN_STEPS_PER_SECOND" || \
+       ! is_numeric "$ACC_TRAIN_LOSS" || \
+       ! is_numeric "$ACC_TRAIN_RUNTIME" || \
+       ! is_numeric "$ACC_TRAIN_STEPS_PER_SECOND" || \
+       ! is_numeric "$ORIG_SCORE" || \
+       ! is_numeric "$TORCH_SCORE" || \
+       ! is_numeric "$ACC_SCORE"; then
+        echo "Error: One or more variables are not numeric." >&2
+        exit 1
+    fi
+
+    LOSS_DIFF=$(echo "$TORCH_TRAIN_LOSS - $ACC_TRAIN_LOSS" | bc -l)
+    LOSS_DIFF_ABS=$(echo "${LOSS_DIFF#-}" | bc -l) # ${LOSS_DIFF#-} drops a leading minus sign
+    if (( $(echo "$LOSS_DIFF_ABS > 0.01" | bc -l) )); then
+        echo "Error: The difference between ACC_TRAIN_LOSS and TORCH_TRAIN_LOSS exceeds 1e-2." >&2
+        exit 1
+    fi
 }
 
 do_train
 do_evaluation
-collect_and_show_results
-upload_to_oss
+collect_and_upload_results
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index fe80ef7..2565f1e 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -79,7 +79,7 @@ ENV TERM xterm
 RUN apt-get update \
     && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends libnl-3-dev libnl-route-3-dev libnl-3-200 libnl-route-3-200 iproute2 udev dmidecode ethtool \
     && apt-get clean \
-    && rm -rf /rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/*
 
 RUN cd /tmp/ && \
     wget http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/rdma/nic-libs-mellanox-rdma-5.2-2/nic-lib-rdma-core-installer-ubuntu.tar.gz && \