Commit aca16b3

more functional tests
Signed-off-by: Terry Kong <[email protected]>
terrykong committed Nov 4, 2024
1 parent 446f7f0 commit aca16b3
Showing 4 changed files with 46 additions and 23 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -92,6 +92,8 @@ jobs:
- ppo-llama3-pp2-reshard
- dpo-llama3
- rm-llama3
- dpo-mixtral-ep
- dpo-mixtral-sp
with:
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
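
Note: the two new matrix entries register the Mixtral DPO cases with the CI job. Assuming, as the files added below suggest, that each matrix name maps to a script of the same name under tests/functional/test_cases/, the runner effectively does something like the following (illustrative sketch only; the real invocation lives in the workflow this job calls):

# Sketch of the assumed matrix-name -> test-script mapping (not the actual CI code)
for test_case in dpo-mixtral-ep dpo-mixtral-sp; do
    bash tests/functional/test_cases/${test_case}.sh
done
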
42 changes: 19 additions & 23 deletions tests/functional/dpo.sh
@@ -5,16 +5,11 @@ cd $SCRIPT_DIR
set -eoux pipefail

export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1
export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0}

KL=${KL:-0.1}
#LR=${LR:-9e-7}
GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')

TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl

@@ -27,7 +22,7 @@ mkdir -p $RESULTS_DIR
GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_dpo_test
PROJECT=dpo_test

# START HETEROGENEOUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
@@ -44,38 +39,39 @@ dpo() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
trainer.devices=2 \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
++model.global_batch_size=${GBS} \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
exp_manager.create_checkpoint_callback=False \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
++model.global_batch_size=4 \
++model.micro_batch_size=1 \
++model.mcore_gpt=true \
++model.megatron_amp_O2=true \
++model.dpo.ref_policy_kl_penalty=${KL} \
++model.dpo.ref_policy_kl_penalty=0.1 \
++model.dpo.log_prob_forward_micro_batch_size=1 \
++model.dpo.average_log_probs=false \
++model.dpo.sft_loss_weight=0.1 \
++model.dpo.preference_loss_weight=1.0 \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
exp_manager.create_checkpoint_callback=False \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
model.data.num_workers=2 \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
trainer.dpo.max_steps=3 \
trainer.dpo.val_check_interval=3 \
trainer.dpo.limit_val_batches=8 \
trainer.dpo.save_interval=0 \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all
"$@"
}

log_file=$(mktemp /tmp/dpo-log-XXXXXX)
dpo | tee $log_file
dpo "$@" | tee $log_file
12 changes: 12 additions & 0 deletions tests/functional/test_cases/dpo-mixtral-ep.sh
@@ -0,0 +1,12 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
bash ../dpo.sh \
++model.optim.name=mcore_distributed_optim \
++model.expert_model_parallel_size=2 \
2>&1 | tee $(basename $0 .sh).log

13 changes: 13 additions & 0 deletions tests/functional/test_cases/dpo-mixtral-sp.sh
@@ -0,0 +1,13 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
bash ../dpo.sh \
++model.optim.name=mcore_distributed_optim \
++model.tensor_model_parallel_size=2 \
++model.expert_model_parallel_size=1 \
++model.sequence_parallel=True \
2>&1 | tee $(basename $0 .sh).log
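
Note: the two cases differ only in which parallelism dimension they exercise. dpo-mixtral-ep shards the MoE experts across the two ranks (expert parallelism), while dpo-mixtral-sp pairs tensor parallelism of 2 with Megatron-style sequence parallelism, which splits activations along the sequence dimension within the tensor-parallel group and therefore needs TP > 1. A further variant would follow the same pattern (hypothetical file name and overrides):

#!/bin/bash
# tests/functional/test_cases/dpo-mixtral-tp.sh -- hypothetical example that
# forwards extra Hydra overrides to the shared dpo.sh driver.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

set -eoux pipefail

PRETRAINED_CHECKPOINT_NEMO_FILE=/path/to/tiny-mixtral.nemo \
bash ../dpo.sh \
    ++model.optim.name=mcore_distributed_optim \
    ++model.tensor_model_parallel_size=2 \
    2>&1 | tee $(basename $0 .sh).log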
