-
Notifications
You must be signed in to change notification settings - Fork 0
/
sbatch_train.sh
87 lines (60 loc) · 2.51 KB
/
sbatch_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Slurm batch script that trains FourCastNet on a single GPU node.
# Change the paths of the job's stdout (--output) and stderr (--error) logs to your own directory.
#SBATCH --output=/scratch/gilbreth/gupt1075/train_fourcastnet_24.out
#SBATCH --error=/scratch/gilbreth/gupt1075/train_fourcastnet_24.err
# The file name of this submission file, so it's easier to track jobs
# filename: sbatch_train.sh
# Resources: one node, 24 tasks on that node, one GPU, 24 hours of wall-clock time.
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=24
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00
# partner queue has a 24-hour limit
# -A selects the account to charge; -C restricts the job to nodes carrying
# either the v100 or the a100 feature ("|" means OR in a constraint expression).
#SBATCH -A gdsp-k
#SBATCH -C "v100|a100"
# Job name, it will show up when you track this job
#SBATCH -J fourcast_train_job
# NOTE(review): no mail directives are set in this header. To receive email about
# job begin, end, or fail status, add sbatch's mail-type and mail-user options
# with your own address.
# To submit the job via command line:$ sbatch sbatch_train.sh
# To check status of the submitted job:$ squeue -u yourUserID
# Reset the module environment and clear PYTHONPATH before loading the
# toolchain this job needs.
module --force purge
unset PYTHONPATH
# Python distribution plus the CUDA / cuDNN pair.
module load anaconda/5.3.1-py37
module load cuda/11.7.0
module load cudnn/cuda-11.7_8.6
# Add the group's module tree to the search path, then load helpers from it.
module use /depot/gdsp/etc/modules
module load utilities monitor
module load rcac
# Record the loaded modules in the job log.
module list
# Not referenced again in this script; presumably read by tooling outside
# it -- TODO confirm they are still needed.
export PRECXX11ABI=1
export CUDA="11.7"
# PYTHONPATH was unset above, so this prints an empty line unless a module
# set it again; quoted so the value is printed verbatim.
echo "$PYTHONPATH"
# The message below used to expand $now without ever assigning it, so the
# timestamp was always blank. Capture the current date first.
now=$(date)
echo "Current date completed loading modules: $now"
# # track per-core GPU load
# monitor gpu percent --all-cores > ./gpu-percent.log &
# GPU_PID=$!
# # track GPU memory usage
# monitor gpu memory > ./gpu-memory.log &
# GPU_MEM_PID=$!
# # track per-core CPU load
# monitor cpu percent --all-cores > ./cpu-percent.log &
# CPU_PID=$!
# # track CPU memory usage (kept in its own variable: reusing MEM_PID here would overwrite the GPU memory monitor's PID)
# monitor cpu memory > ./cpu-memory.log &
# CPU_MEM_PID=$!
# Loading anaconda environment
source /apps/spack/gilbreth/apps/anaconda/5.3.1-py37-gcc-4.8.5-7vvmykn/etc/profile.d/conda.sh
conda activate pytorch
# SECURITY: a Weights & Biases API key used to be hard-coded on this line.
# A key that has been committed to a repository must be treated as leaked:
# revoke it in the W&B account settings and create a new one.
# The key is now taken from the environment (export WANDB_API_KEY in the
# shell before running sbatch; sbatch forwards the submitting environment by
# default) or, failing that, from a private key file. The file location can
# be overridden with WANDB_API_KEY_FILE.
wandb_key_file="${WANDB_API_KEY_FILE:-${HOME:-}/.wandb_api_key}"
if [[ -z "${WANDB_API_KEY:-}" && -f "${wandb_key_file}" && -r "${wandb_key_file}" ]]; then
  # $(<file) reads the file without spawning cat and strips the trailing newline.
  WANDB_API_KEY=$(<"${wandb_key_file}")
fi
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  export WANDB_API_KEY
else
  # Deliberately not fatal: training may not log to W&B at all, or W&B may
  # find credentials stored by an earlier login -- verify for your setup.
  echo "warning: WANDB_API_KEY is not set and ${wandb_key_file} is not readable" >&2
fi
unset wandb_key_file
# Change this directory to where you save the model-related files such as train.py
# cd /scratch/gilbreth/wwtung/FourCastNet/
# The path below is relative, so it resolves against the job's working
# directory (with the cd above commented out, that is wherever the job
# starts -- by default the directory sbatch was invoked from).
# NOTE(review): export_DDP_vars.sh is not visible here; presumably it exports
# the distributed-training variables -- verify they match the single python
# process launched below.
source ./export_DDP_vars.sh
# Training configuration: YAML file, the config section inside it, and the run id.
config_file=/scratch/gilbreth/gupt1075/FourCastNet/config/AFNO.yaml
config='afno_backbone_finetune'
run_num='2'
export HDF5_USE_FILE_LOCKING=FALSE
export NCCL_NET_GDR_LEVEL=PHB
# Assign and export separately so a failing hostname is not masked by the
# exit status of export.
MASTER_ADDR=$(hostname)
export MASTER_ADDR
# Prepend the anaconda lib directory. Append the previous value only when it
# is non-empty: a trailing ":" leaves an empty entry, which the dynamic
# loader treats as the current directory.
export LD_LIBRARY_PATH="/apps/spack/gilbreth/apps/anaconda/5.3.1-py37-gcc-4.8.5-7vvmykn/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# Trace the launch command into the job's stderr log.
set -x
# Kept as the last command so the job's exit status is the training exit status.
python /scratch/gilbreth/gupt1075/FourCastNet/train.py --enable_amp --yaml_config="${config_file}" --config="${config}" --run_num="${run_num}"
# python /scratch/gilbreth/gupt1075/FourCastNet/train.py --enable_amp --yaml_config="/scratch/gilbreth/gupt1075/FourCastNet/config/AFNO.yaml" --config="afno_backbone_finetune" --run_num="3"