forked from meta-llama/llama
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Example_multi-node.slurm
44 lines (35 loc) · 1.6 KB
/
Example_multi-node.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
#SBATCH --job-name=inf-70b-2nodes
#SBATCH --time 2:00:00
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=10
#SBATCH --mem 400G
#SBATCH --nodes=2
# SBATCH --gpus-per-task=1
#SBATCH --partition=gpuq
#SBATCH --gres=gpu:A30:4
#SBATCH --output=slurm_70b.out
#SBATCH -q bonus
#
# Runs example_chat_completion.py and then example_text_completion.py with
# torchrun across the 2 nodes requested above, 4 worker processes per node.
#
# The "# SBATCH --gpus-per-task=1" line above is deliberately disabled: the
# space after '#' keeps Slurm from parsing it as a directive.
# All directives must stay above the first executable line of this script.

# Worker processes torchrun starts on each node; matches --gres=gpu:A30:4.
readonly PROCS_PER_NODE=4
# Script that defines the `micromamba` shell function.
readonly MICROMAMBA_INIT=/stornext/Home/data/allstaff/i/iskander.j/micromamba/etc/profile.d/micromamba.sh
# Directory holding the checkpoints and the tokenizer model.
readonly MODEL_ROOT=/vast/projects/RCP/WEHI-llama2

# Prints a message to stderr and aborts the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Rendezvous endpoint: the first 10.11.x.y address of the node running this
# batch script. Each address is matched as a whole, with escaped dots, so
# look-alikes such as 110.11.0.1 or 10.110.0.1 are rejected, and only one
# address is kept even if several interfaces sit on that subnet (a multi-line
# value would corrupt --rdzv_endpoint).
# An earlier approach read the address of bond1.1330 via ifconfig instead.
MASTER_ADDR=$(hostname -I | tr -s ' ' '\n' \
  | grep -E '^10\.11\.[0-9]+\.[0-9]+$' | head -n 1)
MASTER_PORT=29500
# Fail fast: an empty address would otherwise yield the endpoint ":29500".
[[ -n "${MASTER_ADDR}" ]] \
  || die "no 10.11.x.y address found on $(hostname); cannot set MASTER_ADDR"
export MASTER_ADDR MASTER_PORT
echo "MASTER_ADDR=${MASTER_ADDR}, MASTER_PORT=${MASTER_PORT}"

# NCCL: log warnings only (use NCCL_DEBUG=INFO when debugging), use the
# bond1.1330 interface for socket traffic, and disable the InfiniBand
# transport.
export NCCL_DEBUG=WARN NCCL_SOCKET_IFNAME=bond1.1330 NCCL_IB_DISABLE=1

# Activate the "llm" environment; without it torchrun cannot be relied on.
# shellcheck source=/dev/null
source "${MICROMAMBA_INIT}" || die "cannot source ${MICROMAMBA_INIT}"
micromamba activate llm || die "cannot activate micromamba environment 'llm'"

#######################################
# Launches one script under torchrun via srun, using the c10d rendezvous
# backend at MASTER_ADDR:MASTER_PORT.
# Globals:   MASTER_ADDR, MASTER_PORT, PROCS_PER_NODE (read),
#            SLURM_JOB_NUM_NODES (read; falls back to 2 when unset)
# Arguments: $1 - script to run; remaining - arguments for that script
# Returns:   exit status of srun
#######################################
launch() {
  # $RANDOM is expanded once here, in the batch shell, so every srun task
  # receives the same rendezvous id for this launch.
  srun torchrun --rdzv_id "${RANDOM}" --rdzv_backend c10d \
    --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
    --nproc-per-node "${PROCS_PER_NODE}" \
    --nnodes "${SLURM_JOB_NUM_NODES:-2}" \
    "$@"
}

### the commands to run
# Both examples are always attempted; the job exits non-zero if either one
# failed, so a failed first run is no longer hidden by a successful second.
status=0

launch example_chat_completion.py \
  --ckpt_dir "${MODEL_ROOT}/llama-2-70b-chat/" \
  --tokenizer_path "${MODEL_ROOT}/tokenizer.model" \
  --max_seq_len 512 --max_batch_size 6 || status=$?

launch example_text_completion.py \
  --max_seq_len 128 --max_batch_size 4 \
  --ckpt_dir "${MODEL_ROOT}/llama-2-70b/" \
  --tokenizer_path "${MODEL_ROOT}/tokenizer.model" || status=$?

exit "${status}"