diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 97ca07da4..2e87f6dc1 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -92,6 +92,8 @@ jobs:
           - ppo-llama3-pp2-reshard
           - dpo-llama3
           - rm-llama3
+          - dpo-mixtral-ep
+          - dpo-mixtral-sp
     with:
       RUNNER: self-hosted-azure
       # Fairly aggresive timeout that all functional tests should try to adhere to
diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index 3bfe3b8db..3064b5a6d 100644
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -5,16 +5,11 @@ cd $SCRIPT_DIR
 set -eoux pipefail
 
 export NCCL_ALGO=Tree
-export NVTE_APPLY_QK_LAYER_SCALING=1
+export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0}
 
-KL=${KL:-0.1}
-#LR=${LR:-9e-7}
-GBS=${GBS:-4}
 PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}
-#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')
-
 TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl
 VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl
@@ -27,7 +22,7 @@ mkdir -p $RESULTS_DIR
 GPFS=$(git rev-parse --show-toplevel)
 
 # W&B Logging
-PROJECT=llama3_dpo_test
+PROJECT=dpo_test
 
 # START HETEROGENEUS JOB 3
 CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
@@ -44,38 +39,39 @@ dpo() {
 export CUDA_VISIBLE_DEVICES=0,1
 export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
 export HYDRA_FULL_ERROR=1
-mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
+torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
     --config-path=${CONF_DIR} \
     --config-name=${CONF_NAME} \
     trainer.num_nodes=1 \
     trainer.devices=2 \
-    ++model.data.data_impl=jsonl \
-    ++model.data.seq_length=128 \
-    ++model.global_batch_size=${GBS} \
+    pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
+    exp_manager.create_checkpoint_callback=False \
+    exp_manager.explicit_log_dir=${RESULTS_DIR} \
+    ++model.tensor_model_parallel_size=1 \
+    ++model.pipeline_model_parallel_size=1 \
+    ++model.global_batch_size=4 \
     ++model.micro_batch_size=1 \
     ++model.mcore_gpt=true \
     ++model.megatron_amp_O2=true \
-    ++model.dpo.ref_policy_kl_penalty=${KL} \
+    ++model.dpo.ref_policy_kl_penalty=0.1 \
     ++model.dpo.log_prob_forward_micro_batch_size=1 \
     ++model.dpo.average_log_probs=false \
     ++model.dpo.sft_loss_weight=0.1 \
     ++model.dpo.preference_loss_weight=1.0 \
-    pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
-    "model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
-    exp_manager.create_checkpoint_callback=False \
+    ++model.activations_checkpoint_granularity=full \
+    ++model.activations_checkpoint_method=uniform \
+    ++model.activations_checkpoint_num_layers=1 \
+    ++model.dist_ckpt_load_strictness=log_all \
+    ++model.data.data_impl=jsonl \
+    ++model.data.seq_length=128 \
     model.data.num_workers=2 \
-    ++model.tensor_model_parallel_size=1 \
-    ++model.pipeline_model_parallel_size=1 \
+    "model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
     trainer.dpo.max_steps=3 \
     trainer.dpo.val_check_interval=3 \
     trainer.dpo.limit_val_batches=8 \
     trainer.dpo.save_interval=0 \
-    exp_manager.explicit_log_dir=${RESULTS_DIR} \
-    ++model.activations_checkpoint_granularity=full \
-    ++model.activations_checkpoint_method=uniform \
-    ++model.activations_checkpoint_num_layers=1 \
-    ++model.dist_ckpt_load_strictness=log_all
+    "$@"
 }
 
 log_file=$(mktemp /tmp/dpo-log-XXXXXX)
-dpo | tee $log_file
\ No newline at end of file
+dpo "$@" | tee $log_file
diff --git a/tests/functional/test_cases/dpo-mixtral-ep.sh b/tests/functional/test_cases/dpo-mixtral-ep.sh
new file mode 100755
index 000000000..752a1cb12
--- /dev/null
+++ b/tests/functional/test_cases/dpo-mixtral-ep.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
+set -eoux pipefail
+
+PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+bash ../dpo.sh \
+  ++model.optim.name=mcore_distributed_optim \
+  ++model.expert_model_parallel_size=2 \
+  2>&1 | tee $(basename $0 .sh).log
+
diff --git a/tests/functional/test_cases/dpo-mixtral-sp.sh b/tests/functional/test_cases/dpo-mixtral-sp.sh
new file mode 100755
index 000000000..389b1c32b
--- /dev/null
+++ b/tests/functional/test_cases/dpo-mixtral-sp.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
+set -eoux pipefail
+
+PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+bash ../dpo.sh \
+  ++model.optim.name=mcore_distributed_optim \
+  ++model.tensor_model_parallel_size=2 \
+  ++model.expert_model_parallel_size=1 \
+  ++model.sequence_parallel=True \
+  2>&1 | tee $(basename $0 .sh).log
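
Usage note: with this change, dpo.sh pins its defaults inline and forwards any extra command-line arguments to the Hydra overrides via "$@", so a new functional test only needs to set PRETRAINED_CHECKPOINT_NEMO_FILE and append its own overrides, as the two new test cases above do. A minimal sketch of such a wrapper follows; the checkpoint path and log file name are placeholders, not part of this change:

    #!/bin/bash
    # Hypothetical local invocation of the shared DPO test harness.
    # PRETRAINED_CHECKPOINT_NEMO_FILE must point at an existing .nemo checkpoint.
    PRETRAINED_CHECKPOINT_NEMO_FILE=/path/to/tiny-model.nemo \
    bash tests/functional/dpo.sh \
      ++model.optim.name=mcore_distributed_optim \
      ++model.tensor_model_parallel_size=2 \
      2>&1 | tee my-dpo-test.log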