Commit aca16b3

more functional tests
Signed-off-by: Terry Kong <[email protected]>
terrykong committed Nov 4, 2024
1 parent 446f7f0 commit aca16b3
Showing 4 changed files with 46 additions and 23 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -92,6 +92,8 @@ jobs:
- ppo-llama3-pp2-reshard
- dpo-llama3
- rm-llama3
- dpo-mixtral-ep
- dpo-mixtral-sp
with:
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
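
Note: the two new matrix entries register the Mixtral DPO cases with the CI job. Assuming, as the files added below suggest, that each matrix name maps to a script of the same name under tests/functional/test_cases/, the runner effectively does something like the following (illustrative sketch only; the real invocation lives in the workflow this job calls):

# Sketch of the assumed matrix-name -> test-script mapping (not the actual CI code)
for test_case in dpo-mixtral-ep dpo-mixtral-sp; do
    bash tests/functional/test_cases/${test_case}.sh
done
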
42 changes: 19 additions & 23 deletions tests/functional/dpo.sh
@@ -5,16 +5,11 @@ cd $SCRIPT_DIR
set -eoux pipefail

export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1
export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0}

KL=${KL:-0.1}
#LR=${LR:-9e-7}
GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')

TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl

@@ -27,7 +22,7 @@ mkdir -p $RESULTS_DIR
GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_dpo_test
PROJECT=dpo_test

# START HETEROGENEOUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
@@ -44,38 +39,39 @@ dpo() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
trainer.devices=2 \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
++model.global_batch_size=${GBS} \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
exp_manager.create_checkpoint_callback=False \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
++model.global_batch_size=4 \
++model.micro_batch_size=1 \
++model.mcore_gpt=true \
++model.megatron_amp_O2=true \
++model.dpo.ref_policy_kl_penalty=${KL} \
++model.dpo.ref_policy_kl_penalty=0.1 \
++model.dpo.log_prob_forward_micro_batch_size=1 \
++model.dpo.average_log_probs=false \
++model.dpo.sft_loss_weight=0.1 \
++model.dpo.preference_loss_weight=1.0 \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
exp_manager.create_checkpoint_callback=False \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
model.data.num_workers=2 \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
trainer.dpo.max_steps=3 \
trainer.dpo.val_check_interval=3 \
trainer.dpo.limit_val_batches=8 \
trainer.dpo.save_interval=0 \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all
"$@"
}

log_file=$(mktemp /tmp/dpo-log-XXXXXX)
dpo | tee $log_file
dpo "$@" | tee $log_file
12 changes: 12 additions & 0 deletions tests/functional/test_cases/dpo-mixtral-ep.sh
@@ -0,0 +1,12 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
bash ../dpo.sh \
++model.optim.name=mcore_distributed_optim \
++model.expert_model_parallel_size=2 \
2>&1 | tee $(basename $0 .sh).log

13 changes: 13 additions & 0 deletions tests/functional/test_cases/dpo-mixtral-sp.sh
@@ -0,0 +1,13 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
bash ../dpo.sh \
++model.optim.name=mcore_distributed_optim \
++model.tensor_model_parallel_size=2 \
++model.expert_model_parallel_size=1 \
++model.sequence_parallel=True \
2>&1 | tee $(basename $0 .sh).log
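
Note: the two cases differ only in which parallelism dimension they exercise. dpo-mixtral-ep shards the MoE experts across the two ranks (expert parallelism), while dpo-mixtral-sp pairs tensor parallelism of 2 with Megatron-style sequence parallelism, which splits activations along the sequence dimension within the tensor-parallel group and therefore needs TP > 1. A further variant would follow the same pattern (hypothetical file name and overrides):

#!/bin/bash
# tests/functional/test_cases/dpo-mixtral-tp.sh -- hypothetical example that
# forwards extra Hydra overrides to the shared dpo.sh driver.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/path/to/tiny-mixtral.nemo \
bash ../dpo.sh \
    ++model.optim.name=mcore_distributed_optim \
    ++model.tensor_model_parallel_size=2 \
    2>&1 | tee $(basename $0 .sh).log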
