#!/bin/bash
# Detect `python3` command.
# This workaround addresses a common issue:
# `python` points to `python2`, which is deprecated.
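# `compgen -c` lists every command name the current shell can run (builtins,
# functions, aliases, and executables on PATH), so the `grep "^python3$"`
# below succeeds only if an exact `python3` command is available.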
export PYTHONS
export RVAL
PYTHONS=$(compgen -c | grep "^python3$")
# `$?` is a built-in variable in bash, which is the exit status of the most
# recently-executed command; by convention, 0 means success and anything else
# indicates failure.
RVAL=$?
if [[ $RVAL -eq 0 ]]; then  # if `python3` exists
    PYTHON="python3"
else
    PYTHON="python"
fi
# Help message.
if [[ $# -lt 2 ]]; then
    echo "This script helps launch a distributed training job on the local machine."
    echo
    echo "Usage: $0 GPUS COMMAND [ARGS]"
    echo
    echo "Example: $0 8 ddpm [--help]"
    echo
    echo "To enable multi-node training, one can reuse this script" \
        "by simply setting the following environment variables on each" \
        "machine:"
    echo " MASTER_IP: The IP address of the master node."
    echo " MASTER_PORT: The port of the master node."
    echo " NODE_SIZE: Number of nodes (machines) used for training."
    echo " NODE_RANK: Node rank of the current machine."
    echo
    echo "NOTE: In multi-node training, \`GPUS\` refers to the number" \
        "of GPUs on each local machine, i.e., GPUs per node."
    echo
    echo "Example of using 16 GPUs on 2 machines (i.e., 8 GPUs each):"
    echo
    echo " On machine 0: MASTER_IP=node_0_ip MASTER_PORT=node_0_port" \
        "NODE_SIZE=2 NODE_RANK=0 $0 8 ddpm [--help]"
    echo " On machine 1: MASTER_IP=node_0_ip MASTER_PORT=node_0_port" \
        "NODE_SIZE=2 NODE_RANK=1 $0 8 ddpm [--help]"
    echo
    echo "Detailed instructions on available commands:"
    echo "--------------------------------------------------"
    ${PYTHON} ./main.py --help
    echo
    exit 0
fi
GPUS=$1
COMMAND=$2
# Help message for a particular command.
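# `${*: -1}` expands to the last positional argument (the space before `-1`
# distinguishes it from the `${var:-default}` syntax), so the help text is
# printed when no arguments follow COMMAND or when the last one is `--help`.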
if [[ $# -lt 3 || ${*: -1} == "--help" ]]; then
    echo "Detailed instructions on the arguments for command \`${COMMAND}\`:"
    echo "--------------------------------------------------"
    ${PYTHON} ./main.py ${COMMAND} --help
    echo
    exit 0
fi
# Switch memory allocator if available.
# Search order: jemalloc.so -> tcmalloc.so.
# According to https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html,
# allocators that reuse memory as much as possible can give better performance
# than the default malloc.
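# `ldconfig -p` prints cached shared libraries as "libname (flags) => /path";
# splitting on spaces and keeping the field that contains "/" extracts the
# library path, and `head -n 1` takes the first match (if any).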
JEMALLOC=$(ldconfig -p | grep -i "libjemalloc.so$" | tr " " "\n" | grep "/" \
    | head -n 1)
TCMALLOC=$(ldconfig -p | grep -i "libtcmalloc.so.4$" | tr " " "\n" | grep "/" \
    | head -n 1)
if [ -n "$JEMALLOC" ]; then # if found the path to libjemalloc.so
echo "Switch memory allocator to jemalloc."
export LD_PRELOAD=$JEMALLOC:$LD_PRELOAD
elif [ -n "$TCMALLOC" ]; then # if found the path to libtcmalloc.so.4
echo "Switch memory allocator to tcmalloc."
export LD_PRELOAD=$TCMALLOC:$LD_PRELOAD
fi
# Get an available port for launching distributed training.
# Credit to https://superuser.com/a/1293762.
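# `seq 49152 65535` enumerates the dynamic/ephemeral port range, `ss -Htan`
# lists the local ports of all existing TCP sockets, and `comm -23` keeps the
# ports that appear only in the first list (i.e., unused ones);
# `shuf | head -n 1` then picks one of them at random.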
export DEFAULT_FREE_PORT
DEFAULT_FREE_PORT=$(comm -23 <(seq 49152 65535 | sort) \
    <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) \
    | shuf | head -n 1)
MASTER_IP=${MASTER_IP:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-$DEFAULT_FREE_PORT}
NODE_SIZE=${NODE_SIZE:-1}
NODE_RANK=${NODE_RANK:-0}
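# Launch the job: `torch.distributed.launch` spawns `GPUS` processes on this
# node (one per GPU) and uses MASTER_IP:MASTER_PORT for rendezvous, giving a
# total world size of NODE_SIZE * GPUS processes.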
${PYTHON} -m torch.distributed.launch \
    --master_addr=${MASTER_IP} \
    --master_port=${MASTER_PORT} \
    --nnodes=${NODE_SIZE} \
    --node_rank=${NODE_RANK} \
    --nproc_per_node=${GPUS} \
    ./main.py \
    ${COMMAND} \
    "${@:3}" \
    || exit 1  # Stop the script if the Python process exits with an error.