From 30c3ac55f46f8d59aee317e54aecd8ef5074c865 Mon Sep 17 00:00:00 2001
From: Ang Wang <wangang.wa@alibaba-inc.com>
Date: Thu, 2 Jan 2025 17:29:38 +0800
Subject: [PATCH] Optimize the accuracy benchmark workflow (#44)

---
 .github/workflows/accuracy_benchmark.yml | 40 ++++++++++++++++++------
 benchmarks/accuracy/fastchat.sh          |  7 +++--
 benchmarks/accuracy/llama.sh             |  2 ++
 benchmarks/accuracy/run.sh               | 37 +++++++++++++++++++---
 docker/Dockerfile.base                   |  2 +-
 5 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/accuracy_benchmark.yml b/.github/workflows/accuracy_benchmark.yml
index 16c2473..abadc34 100644
--- a/.github/workflows/accuracy_benchmark.yml
+++ b/.github/workflows/accuracy_benchmark.yml
@@ -3,8 +3,8 @@ name: Daily Accuracy Benchmark
 on:
   workflow_dispatch:
   schedule:
-    # Runs daily at 3:00 AM, Beijing time.
-    - cron: '0 19 * * *' # This is UTC time
+    # Runs daily at 2:00 AM, Beijing time.
+    - cron: '0 18 * * *' # This is UTC time
 
 jobs:
   accuracy_benchmark:
@@ -14,12 +14,34 @@ jobs:
     - name: Checkout code
       uses: actions/checkout@v3
 
+    - name: Create .env file
+      run: |
+        echo "OSS_AK_ID=${{ secrets.OSS_AK_ID }}" >> .env
+        echo "OSS_AK_SECRET=${{ secrets.OSS_AK_SECRET }}" >> .env
+        echo "OSS_ENDPOINT=${{ secrets.OSS_ENDPOINT }}" >> .env
+        echo "M6_TENANT=${{ secrets.M6_TENANT }}" >> .env
+        echo "MIT_SPIDER_TOKEN=${{ secrets.MIT_SPIDER_TOKEN }}" >> .env
+        echo "MIT_SPIDER_URL=${{ secrets.MIT_SPIDER_URL }}" >> .env
+
     - name: Perform the accuracy benchmark
-      run: cd benchmarks/accuracy && bash ./run.sh
+      run: |
+        docker pull $UT_IMAGE
+        echo 'Running accuracy benchmark...'
+        docker run \
+          -v $PWD:$PWD \
+          -w $PWD \
+          --net host \
+          --ipc host \
+          --shm-size 80G \
+          -t --rm \
+          --gpus all \
+          --env-file .env \
+          $UT_IMAGE bash -c ' \
+            git config --global --add safe.directory $PWD && \
+            pip install -e . && \
+            cd benchmarks/accuracy && NPROC_PER_NODE=4 bash ./run.sh'
       env:
-        OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
-        OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
-        OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
-        M6_TENANT: ${{ secrets.M6_TENANT }}
-        MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
-        MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
+        UT_IMAGE: ${{ secrets.UT_IMAGE }}
+    - name: Clean up .env
+      if: always()
+      run: rm -f .env
diff --git a/benchmarks/accuracy/fastchat.sh b/benchmarks/accuracy/fastchat.sh
index 334b34e..907723f 100644
--- a/benchmarks/accuracy/fastchat.sh
+++ b/benchmarks/accuracy/fastchat.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ "$#" -ne 1 ]; then
   echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** $0 <local_model_dir>"
   echo "You must provide exactly 1 parameters."
@@ -17,7 +19,7 @@ fi
 
 MODEL_DIR=$(realpath $1)
 MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S")
-NUM_GPUS_TOTAL=1
+NUM_GPUS_TOTAL=4
 JUDGMENT_PARALLEL=4
 
 function install_fastchat {
@@ -25,8 +27,7 @@ function install_fastchat {
     git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
   fi
 
-  output=$(python -m pip list | grep fschat)
-  if [[ -n $output ]]; then
+  if python -m pip list | grep -q fschat; then
     echo "All requirements are installed."
   else
     echo "Install requirements ..."
diff --git a/benchmarks/accuracy/llama.sh b/benchmarks/accuracy/llama.sh
index 6ecc301..267f5a2 100755
--- a/benchmarks/accuracy/llama.sh
+++ b/benchmarks/accuracy/llama.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [[ $# -ne 2 && $# -ne 3 ]]; then
   echo "Usage: $0 <local_model_dir> <use_torchacc> [checkpiont_output_dir]"
   echo "  local_model_dir: Path to the local directory where the model will be saved."
diff --git a/benchmarks/accuracy/run.sh b/benchmarks/accuracy/run.sh
index 94bbd12..9591180 100755
--- a/benchmarks/accuracy/run.sh
+++ b/benchmarks/accuracy/run.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ "$#" -eq 1 ]; then
   MODEL_DIR=$(realpath "$1")
 elif [ "$#" -eq 0 ]; then
@@ -25,6 +27,7 @@ ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log"
 TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log"
 ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log"
 RES_LOG_FILE="$RES_FOLDER/result.log"
+OSS_BUCKET_PATH="oss://pai-devel/benchmark/accuracy/$(date +'%Y-%m')/$TIMESTAMP"
 
 mkdir -p $RES_FOLDER
 
@@ -55,14 +58,18 @@ function upload_to_oss {
       curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
     fi
     ossutil config -e ${OSS_ENDPOINT} -i ${OSS_AK_ID} -k ${OSS_AK_SECRET}
-    ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
+    ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER $OSS_BUCKET_PATH
   else
     echo "No oss information found. Skip uploading to oss."
   fi
 }
 
 
-function collect_and_show_results {
+function is_numeric {
+  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]  # unsigned integer or decimal only
+}
+
+function collect_and_upload_results {
   # Collect and compare the results
   ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
   TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
@@ -100,9 +107,31 @@ function collect_and_show_results {
     echo -e "\n${CYAN}More details can be found in    = ${RESET}${RES_FOLDER}"
     echo -e "${BLUE}==========================================================${RESET}"
   } | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
+
+  upload_to_oss
+
+  # Fail the run if any collected metric is missing or not a plain number
+  if ! is_numeric "$TORCH_TRAIN_LOSS" || \
+   ! is_numeric "$TORCH_TRAIN_RUNTIME" || \
+   ! is_numeric "$TORCH_TRAIN_STEPS_PER_SECOND" || \
+   ! is_numeric "$ACC_TRAIN_LOSS" || \
+   ! is_numeric "$ACC_TRAIN_RUNTIME" || \
+   ! is_numeric "$ACC_TRAIN_STEPS_PER_SECOND" || \
+   ! is_numeric "$ORIG_SCORE" || \
+   ! is_numeric "$TORCH_SCORE" || \
+   ! is_numeric "$ACC_SCORE"; then
+    echo "Error: One or more variables are not numeric."
+    exit 1
+  fi
+
+  LOSS_DIFF=$(echo "$TORCH_TRAIN_LOSS - $ACC_TRAIN_LOSS" | bc -l)
+  LOSS_DIFF_ABS=$(echo "${LOSS_DIFF#-}" | bc -l)
+  if (( $(echo "$LOSS_DIFF_ABS > 0.01" | bc -l) )); then
+    echo "Error: The difference between ACC_TRAIN_LOSS and TORCH_TRAIN_LOSS exceeds 1e-2."
+    exit 1
+  fi
 }
 
 do_train
 do_evaluation
-collect_and_show_results
-upload_to_oss
+collect_and_upload_results
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index fe80ef7..2565f1e 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -79,7 +79,7 @@ ENV TERM xterm
 RUN apt-get update \
     && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends libnl-3-dev libnl-route-3-dev libnl-3-200 libnl-route-3-200 iproute2 udev dmidecode ethtool \
     && apt-get clean \
-    && rm -rf /rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/*
 
 RUN cd /tmp/ && \
     wget http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/rdma/nic-libs-mellanox-rdma-5.2-2/nic-lib-rdma-core-installer-ubuntu.tar.gz && \