-
Notifications
You must be signed in to change notification settings - Fork 8
/
submit_train.sh
executable file
·118 lines (96 loc) · 2.73 KB
/
submit_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/bin/bash
#
# Default settings for generating and submitting the training jobs.
# ./bin/parse_options.sh is sourced right after this section.
# NOTE(review): presumably it lets command-line flags override the values
# below -- confirm against that file.

# ---- conda ----------------------------------------------------------------
# Install prefix; "<conda>/bin/activate <conda_env>" is sourced in every job.
conda="/share/mini1/sw/std/python/anaconda3-2019.07/v3.7"
conda_env="torch_1.9"

# ---- config selection -----------------------------------------------------
# dataset, ling, spk, pros, dec and vocoder are joined with "_" to form the
# model name and the config file name (configs/<joined>.yaml).
dataset="libritts"

ling="vqwav2vec"
#   other values: conformerppg, contentvec100, whisperppgsmall

spk="uttdvec"
#   other values: uttecapatdnn

pros="ppgvcf0"
#   other values: fs2pitchenergy

dec="gradtts"
#   other values: fs2, vits, diffwave, tacoar, tacomol

vocoder="ppgvchifigan"
#   other values: none, bigvgan

# ---- experiment location --------------------------------------------------
# Outputs go to <exp_dir>/<model name>/<exp_name>.
exp_name="libritts_train_0"
#   other values: vctk_no16fp_split
exp_dir="exp"

# ---- scheduler resources --------------------------------------------------
njobs=48   # number of job scripts generated and submitted
ngpus=4    # passed to submitjob as -g<ngpus>
slots=16   # passed to submitjob as -M<slots>

# Passed to submitjob as "-l gputype=<gputypes>".
#   other values:
#     "GeForceRTX3060|GeForceRTX3090"
#     "GeForceRTX3090"
#     "GeForceGTXTITANX|GeForceGTX1080Ti|GeForceRTX3060"
gputypes="GeForceGTX1080Ti"
# Source the option parser; abort if it is missing or returns non-zero.
# NOTE(review): presumably this lets command-line flags override the defaults
# assigned earlier in this script -- confirm against bin/parse_options.sh.
. ./bin/parse_options.sh || exit 1

# Names and paths derived from the selected components.
model_name="${dataset}_${ling}_${spk}_${pros}_${dec}_${vocoder}"
exp="$exp_dir/$model_name/$exp_name"
config="configs/${model_name}.yaml"

# Refuse to continue without the config for this component combination.
if [[ ! -e "$config" ]]; then
  echo "can't find config file $config" >&2
  exit 1
fi

# create exp dir
# mkdir -p is a no-op for directories that already exist, so no separate
# existence test is needed; stop if any of them cannot be created.
job_dir="$exp/scripts"
log_dir="$exp/logs"
mkdir -p -- "$exp" "$job_dir" "$log_dir" || exit 1

# Copy the config into the experiment directory. A copy that is already
# there is left untouched.
exp_config="$exp/$(basename -- "$config")"
if [[ ! -e "$exp_config" ]]; then
  cp -- "$config" "$exp_config" || exit 1
fi
# Generate njobs job scripts under $job_dir, submit each one with submitjob,
# and record the returned job ids in $job_dir/kill_all.sh as a single
# "qdel <id> <id> ..." command.
#
# Every job after the first is submitted with "-w <previous job id>".
# NOTE(review): presumably -w makes the job wait for the previous one, so the
# jobs run back to back -- confirm against submitjob's documentation.
#
# Earlier single-job submission, kept for reference:
#jid=$(submitjob -m 10000 -g${ngpus} -M${slots} -o -l gputype=$gputypes -eo $log_dir/train.log ./bin/train.sh | grep -E [0-9]+)
jid=""
jobs_to_kill="qdel"
submit_failed=0

# create following jobs
for ((n = 0; n < njobs; n++)); do
  job="$job_dir/train${n}.sh"

  # Unquoted here-doc: plain $var is expanded now, while generating the
  # script; \$ defers expansion to job run time; a trailing \\ becomes a
  # literal line-continuation backslash in the generated script.
  cat <<EOF > "$job"
#!/bin/bash
source "$conda/bin/activate" "$conda_env"
export CUDA_HOME=/share/mini1/sw/std/cuda/cuda11.3/x86_64/
export LD_LIBRARY_PATH="\${CUDA_HOME}/lib64:\${LD_LIBRARY_PATH}"
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=virbr0

exp="$exp"
ckpt_dir="$exp/ckpt/"

# Most recently modified checkpoint; empty when the directory is missing or
# holds no .pth file yet (ls errors are silenced for that case).
ckpt=\$(ls -t "\${ckpt_dir}"/*.pth 2>/dev/null | head -n 1)

if [ -z "\${ckpt}" ]; then
  # Nothing to resume from: start without -p.
  python train.py \\
    -c "$exp_config" \\
    -e "$exp_name" \\
    -l "$exp_dir" \\
    -m "$model_name"
else
  echo "resume from \${ckpt}"
  python train.py \\
    -p "\${ckpt}" \\
    -c "$exp_config" \\
    -e "$exp_name" \\
    -l "$exp_dir" \\
    -m "$model_name"
fi
EOF
  chmod +x "$job"

  # All jobs share one log file.
  log="$log_dir/train.log"

  # Same argument order as before; -w is added only once a previous job id
  # is known.
  submit_args=(-m 10000)
  if [[ -n "$jid" ]]; then
    submit_args+=(-w "$jid")
  fi
  submit_args+=("-g${ngpus}" "-M${slots}" -o -l "gputype=$gputypes" -eo "$log" "$job")

  # Keep the output line(s) that contain digits as the job id.
  jid=$(submitjob "${submit_args[@]}" | grep -E '[0-9]+')

  # Without a job id the next submission would go out without -w; stop
  # instead of silently dropping the dependency.
  if [[ -z "$jid" ]]; then
    echo "submitjob returned no job id for $job; not submitting further jobs" >&2
    submit_failed=1
    break
  fi

  jobs_to_kill+=" $jid"
  echo "submit $jid job $job log $log"
done

# (Re)write the kill script for whatever was submitted, even after a failure,
# so already-queued jobs can still be removed.
kill_script="$job_dir/kill_all.sh"
{
  printf '%s\n' '#!/bin/bash'
  printf '%s\n' "$jobs_to_kill"
} > "$kill_script"
chmod +x "$kill_script"

if (( submit_failed )); then
  exit 1
fi