Skip to content

Commit

Permalink
Optimize the accuracy benchmark workflow (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
anw90 authored Jan 2, 2025
1 parent 75dd4af commit 30c3ac5
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 17 deletions.
40 changes: 31 additions & 9 deletions .github/workflows/accuracy_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ name: Daily Accuracy Benchmark
on:
workflow_dispatch:
schedule:
# Runs daily at 3:00 AM, Beijing time.
- cron: '0 19 * * *' # This is UTC time
# Runs daily at 2:00 AM, Beijing time.
- cron: '0 18 * * *' # This is UTC time

jobs:
accuracy_benchmark:
Expand All @@ -14,12 +14,34 @@ jobs:
- name: Checkout code
uses: actions/checkout@v3

- name: Create .env file
  # Writes the credentials the benchmark needs into .env, which is later
  # handed to the container through `docker run --env-file .env`.
  # The secrets are exposed to the script as environment variables rather
  # than spliced into the script text with ${{ }}: a secret containing a
  # quote, `$`, backtick or backslash would otherwise be re-interpreted by
  # the shell and could break (or alter) the command.
  env:
    OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
    OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
    OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
    M6_TENANT: ${{ secrets.M6_TENANT }}
    MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
    MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
  run: |
    # Append (as before) one KEY=value line per secret.
    {
      echo "OSS_AK_ID=${OSS_AK_ID}"
      echo "OSS_AK_SECRET=${OSS_AK_SECRET}"
      echo "OSS_ENDPOINT=${OSS_ENDPOINT}"
      echo "M6_TENANT=${M6_TENANT}"
      echo "MIT_SPIDER_TOKEN=${MIT_SPIDER_TOKEN}"
      echo "MIT_SPIDER_URL=${MIT_SPIDER_URL}"
    } >> .env
- name: Perform the accuracy benchmark
run: cd benchmarks/accuracy && bash ./run.sh
run: |
docker pull $UT_IMAGE
echo 'Running accuracy benchmark...'
docker run \
-v $PWD:$PWD \
-w $PWD \
--net host \
--ipc host \
--shm-size 80G \
-t --rm \
--gpus all \
--env-file .env \
$UT_IMAGE bash -c ' \
git config --global --add safe.directory $PWD && \
pip install -e . && \
cd benchmarks/accuracy && NPROC_PER_NODE=4 bash ./run.sh'
env:
OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
M6_TENANT: ${{ secrets.M6_TENANT }}
MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
UT_IMAGE: ${{ secrets.UT_IMAGE }}

- name: Clean up .env
  # Always remove the secrets file, including when the benchmark step
  # fails: by default a step is skipped once an earlier step has failed,
  # which would leave the credentials in the workspace.
  if: always()
  run: rm -f .env
7 changes: 4 additions & 3 deletions benchmarks/accuracy/fastchat.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

if [ "$#" -ne 1 ]; then
echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** $0 <local_model_dir>"
echo "You must provide exactly 1 parameters."
Expand All @@ -17,16 +19,15 @@ fi

MODEL_DIR=$(realpath $1)
MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S")
NUM_GPUS_TOTAL=1
NUM_GPUS_TOTAL=4
JUDGMENT_PARALLEL=4

function install_fastchat {
if [[ ! -d "FastChat_TorchAcc" ]]; then
git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
fi

output=$(python -m pip list | grep fschat)
if [[ -n $output ]]; then
if python -m pip list | grep -q fschat; then
echo "All requirements are installed."
else
echo "Install requirements ..."
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/accuracy/llama.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

if [[ $# -ne 2 && $# -ne 3 ]]; then
echo "Usage: $0 <local_model_dir> <use_torchacc> [checkpiont_output_dir]"
echo " local_model_dir: Path to the local directory where the model will be saved."
Expand Down
37 changes: 33 additions & 4 deletions benchmarks/accuracy/run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

if [ "$#" -eq 1 ]; then
MODEL_DIR=$(realpath "$1")
elif [ "$#" -eq 0 ]; then
Expand All @@ -25,6 +27,7 @@ ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log"
TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log"
ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log"
RES_LOG_FILE="$RES_FOLDER/result.log"
OSS_BUCKET_PATH="oss://pai-devel/benchmark/accuracy/$(date +'%Y-%m')/$TIMESTAMP"

mkdir -p $RES_FOLDER

Expand Down Expand Up @@ -55,14 +58,18 @@ function upload_to_oss {
curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
fi
ossutil config -e ${OSS_ENDPOINT} -i ${OSS_AK_ID} -k ${OSS_AK_SECRET}
ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER $OSS_BUCKET_PATH
else
echo "No oss information found. Skip uploading to oss."
fi
}


function collect_and_show_results {
# Succeed (exit status 0) only when the first argument is an unsigned decimal
# number: one or more digits, optionally followed by a dot and more digits.
# Empty strings, signs, exponents and any other text are rejected.
function is_numeric {
    local decimal_re='^[0-9]+([.][0-9]+)?$'
    [[ "$1" =~ $decimal_re ]]
}

function collect_and_upload_results {
# Collect and compare the results
ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
Expand Down Expand Up @@ -100,9 +107,31 @@ function collect_and_show_results {
echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
echo -e "${BLUE}==========================================================${RESET}"
} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)

upload_to_oss

# Check the results
# Every collected metric must pass is_numeric before it is used in the
# arithmetic below; a value that does not pass aborts the script with a
# non-zero exit status.
# The chain ends directly with `; then`: a trailing `|| \ ` would make bash
# try to run a command named " " whenever all metrics are numeric.
if ! is_numeric "$TORCH_TRAIN_LOSS" || \
    ! is_numeric "$TORCH_TRAIN_RUNTIME" || \
    ! is_numeric "$TORCH_TRAIN_STEPS_PER_SECOND" || \
    ! is_numeric "$ACC_TRAIN_LOSS" || \
    ! is_numeric "$ACC_TRAIN_RUNTIME" || \
    ! is_numeric "$ACC_TRAIN_STEPS_PER_SECOND" || \
    ! is_numeric "$ORIG_SCORE" || \
    ! is_numeric "$TORCH_SCORE" || \
    ! is_numeric "$ACC_SCORE"; then
    echo "Error: One or more variables are not numeric."
    exit 1
fi

# Fail when the two training losses differ by more than 0.01.
# bc -l is used because bash arithmetic only handles integers.
LOSS_DIFF=$(echo "$TORCH_TRAIN_LOSS - $ACC_TRAIN_LOSS" | bc -l)
# Dropping a leading '-' from the difference yields its absolute value.
LOSS_DIFF_ABS=$(echo "${LOSS_DIFF#-}" | bc -l)
# bc prints 1 when the comparison holds and 0 otherwise; (( )) is true for non-zero.
if (( $(echo "$LOSS_DIFF_ABS > 0.01" | bc -l) )); then
echo "Error: The difference between ACC_TRAIN_LOSS and TORCH_TRAIN_LOSS exceeds 1e-2."
exit 1
fi
}

do_train
do_evaluation
collect_and_show_results
upload_to_oss
collect_and_upload_results
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ ENV TERM xterm
RUN apt-get update \
&& apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends libnl-3-dev libnl-route-3-dev libnl-3-200 libnl-route-3-200 iproute2 udev dmidecode ethtool \
&& apt-get clean \
&& rm -rf /rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/*

RUN cd /tmp/ && \
wget http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/rdma/nic-libs-mellanox-rdma-5.2-2/nic-lib-rdma-core-installer-ubuntu.tar.gz && \
Expand Down

0 comments on commit 30c3ac5

Please sign in to comment.