#!/bin/bash
#SBATCH --output=./logs/%j.out
#SBATCH --error=./logs/%j.out
#SBATCH --time=120:00:00
#SBATCH -n 1
#SBATCH --cpus-per-task=16
#SBATCH --mem-per-cpu=5000
#SBATCH --tmp=250000
#SBATCH --gpus=rtx_2080_ti:6
#SBATCH --open-mode=truncate
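# Submitted with `sbatch train.sh` (assumed standard usage). The directives above
# request 1 task with 16 CPUs (5000 MB RAM each), 250 GB of node-local scratch
# ($TMPDIR), 6 RTX 2080 Ti GPUs, and a 120 h time limit; stdout/stderr go to
# ./logs/<jobid>.out (the ./logs directory must already exist, SLURM does not
# create it) and are truncated if the job is requeued.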
trap "echo sigterm received, exiting!" SIGTERM
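# Name of the preprocessed h5 dataset directory under /cluster/scratch/zhejzhan
# (see the copy commands below).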
DATASET_DIR="h5_womd_sim_agent"
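# Training entry point: Hydra overrides select the model config, the wandb
# logger settings, the dataset location on local scratch, and the output
# directory for this run.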
run () {
python -u src/run.py \
model=traffic_bots \
loggers.wandb.entity="YOUR_ENTITY" \
loggers.wandb.name="refactored" \
loggers.wandb.project="trafficbots_release" \
datamodule.data_dir=${TMPDIR}/datasets \
hydra.run.dir='/cluster/scratch/zhejzhan/logs/${now:%Y-%m-%d}/${now:%H-%M-%S}'
}
# To resume from a previous run, add the following Hydra override to the
# python command inside run() above:
# resume.checkpoint=YOUR_WANDB_RUN_NAME:latest \
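# Activate the conda environment that provides the training dependencies.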
source /cluster/project/cvl/zhejzhan/apps/miniconda3/etc/profile.d/conda.sh
conda activate traffic_bots
echo Running on host: `hostname`
echo In directory: `pwd`
echo Starting on: `date`
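# Stage the h5 files on the node-local scratch disk ($TMPDIR, requested via
# --tmp above) so training reads from fast local storage instead of the shared
# filesystem.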
echo START copying data: `date`
mkdir $TMPDIR/datasets
cp /cluster/scratch/zhejzhan/$DATASET_DIR/training.h5 $TMPDIR/datasets/
cp /cluster/scratch/zhejzhan/$DATASET_DIR/validation.h5 $TMPDIR/datasets/
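# Optional sanity check (not part of the original script): fail fast if either
# h5 file did not make it onto local scratch, instead of spending the GPU
# allocation on a run that will crash at data loading.
for f in training.h5 validation.h5; do
  if [ ! -f "$TMPDIR/datasets/$f" ]; then
    echo "ERROR: $TMPDIR/datasets/$f is missing, aborting." >&2
    exit 1
  fi
done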
echo DONE copying: `date`
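# Print the resolved run() definition into the log for reproducibility.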
type run
echo START: `date`
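# Launch training in the background and wait for it, so the SIGTERM trap above
# can fire while the job is running instead of being deferred until the
# foreground command finishes.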
run &
wait
echo DONE: `date`
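# Archive this job's SLURM log under ./logs/slurm.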
mkdir -p ./logs/slurm
mv ./logs/$SLURM_JOB_ID.out ./logs/slurm/$SLURM_JOB_ID.out
echo finished at: `date`
exit 0