
[iluvatar] add TG200 llama3_8B test case #780

Merged (2 commits, Feb 6, 2025)

Changes from 1 commit
5 changes: 4 additions & 1 deletion training/benchmarks/llama3_8B/megatron/run_pretraining.py
@@ -109,7 +109,10 @@ def parse_args():
 time_per_step = float(steptime) / 1000

 whole_tps = 512 * 8192 / time_per_step
-chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
+if args.vendor=="iluvatar":
+    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes) * 2
+else:
+    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
 print("System tokens per second: ", whole_tps)
 print("Tokens/p/s: ", chip_tps)
 print("MFU: ", chip_tps * 8000000000.0 * 6 / theoryflops)
63 changes: 1 addition & 62 deletions training/iluvatar/docker_image/megatron/Dockerfile
@@ -1,63 +1,2 @@
-FROM ubuntu:20.04
+FROM flagperf:bi200
 
-# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source
-# ADD sources.list /etc/apt/
-
-RUN /bin/bash -c "source /root/.bashrc"
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PATH /root/miniconda/bin:$PATH
-
-RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
-RUN apt-get update -y
-RUN apt-get install -y --fix-missing \
-apt-utils \
-sudo \
-openssh-server \
-vim \
-git \
-curl \
-wget \
-tree \
-perl \
-kmod \
-make \
-pciutils \
-build-essential \
-python3.10-dev \
-python3-pip \
-libjpeg-dev \
-zlib1g-dev \
-unzip \
-cmake \
-bzip2 \
-cabextract \
-iputils-ping \
-pbzip2 \
-pv \
-numactl \
-ninja-build \
-gcc-7 \
-g++-7 \
-libncursesw5
-
-# Configure anaconda
-RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_4.10.3-Linux-x86_64.sh && \
-bash ./Miniconda3-py310_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
-/root/miniconda/bin/conda clean -tipsy && \
-ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
-echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
-echo "conda activate base" >> ~/.bashrc && \
-conda config --set always_yes yes --set changeps1 no && \
-echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
-echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc
-
-RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"
-
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7
-
-RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
-
-ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
-ENV PATH="/usr/local/corex/bin:${PATH}"
-ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
7 changes: 7 additions & 0 deletions (new config file; path not shown in this view)
@@ -0,0 +1,7 @@
+tokenizer_path = "/home/zhiyuan/test/llama3-8b/tokenizer.model"
+localbs = 1 # micro-batch-size
+train_steps = 300 ## number of training iterations
+theoryflops = 276000000000000.0 ## the test environment is power-limited, so a lower clock frequency is set; this caps the theoretical peak FLOPS below the officially advertised figure
+megatron_path = "/usr/local/lib/python3.10/dist-packages/megatron" # needs to be aligned with the Dockerfile. In NGC torch, it's /workspace/ + Megatron-LM
+tensor_parallel = 1
+pipeline_parallel = 4
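A hedged cross-check (not part of the PR) of how these settings decompose the single-node, 16-rank TG-V200 case; the global batch of 512 comes from run_pretraining.py above, and the micro batch is the `localbs` set here.

```python
# Sketch: decomposing 1 node x 16 ranks under the parallelism settings above.
nnodes, nproc_per_node = 1, 16
tensor_parallel, pipeline_parallel = 1, 4
global_batch, micro_batch = 512, 1    # 512 from run_pretraining.py; localbs above

world_size = nnodes * nproc_per_node                                  # 16
data_parallel = world_size // (tensor_parallel * pipeline_parallel)   # 4
grad_accum = global_batch // (data_parallel * micro_batch)            # 128
print(world_size, data_parallel, grad_accum)
```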
17 changes: 17 additions & 0 deletions training/iluvatar/llama3_8B-megatron/config/training_adapter.sh
@@ -0,0 +1,17 @@
echo "[Prompt] iluvatar adaption is not NULL, for other Vendors"
export PYTHONPATH=/usr/local/lib/python3.10/dist-packages
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_NET_SHARED_BUFFERS=0
export NCCL_ALGO=Ring
export OMP_NUM_THREADS=4
export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
VENDOR_ARGS=" \
--transformer-impl transformer_engine \
--use-distributed-optimizer \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--no-create-attention-mask-in-dataloader \
--use-legacy-models \
--num-layers-per-stage 1 7 2 9 1 7 \
--disable-bias-linear \
"
2 changes: 1 addition & 1 deletion training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
 '''Cluster configs'''

 # Hosts to run the benchmark. Each item is an IP address or a hostname.
-HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]
+HOSTS = ["localhost"]
Collaborator: Please do not modify the example hosts.

 # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
 HOSTS_PORTS = ["2222"]
9 changes: 6 additions & 3 deletions training/run_benchmarks/config/test_conf.py
@@ -3,7 +3,7 @@

 # Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads, metax and dcu.
 # We will run benchmarks in training/<vendor>
-VENDOR = "nvidia"
+VENDOR = "iluvatar"
Collaborator: Please do not modify the example vendor.

 # Accelerator options for docker. TODO FIXME support more accelerators.
 # possible value of ACCE_CONTAINER_OPT are:

@@ -25,7 +25,8 @@
 # " --device=/dev/infiniband --device=/dev/dri --device=/dev/mxcd --group-add video"
 # dcu:
 # "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video"
-ACCE_CONTAINER_OPT = " --gpus all"
+ACCE_CONTAINER_OPT = " --privileged --pid=host --ipc=host --cap-add=ALL -v /dev:/dev -v /lib/modules:/lib/modules -v /mnt:/mnt -v /usr/src:/usr/src -v /home:/home "
Collaborator: Same as above.


 # XXX_VISIBLE_DEVICE item name in env
 # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
 # CUDA_VISIBLE_DEVICES for nvidia, iluvatar

@@ -42,7 +43,8 @@
 # The path that flagperf deploy in the cluster.
 # Users must set FLAGPERF_PATH to where flagperf deploy
 # You can assume the preset "/home/FlagPerf/training" points to Null
-FLAGPERF_PATH = "/home/FlagPerf/training"
+FLAGPERF_PATH = "/home/zhiyuan/FlagPerf/training"
Collaborator: Same as above.


 # Set log path on the host here.
 FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/"

@@ -169,6 +171,7 @@
# "llava1.5_13b:deepspeed-torch:BI-V150:1:16:1": "/raid/dataset/llava1.5_13b",
# "mixtral_8x7B:megatron:BI-V150:4:16:1": "/raid/dataset/mixtral_8x7B", ##单机测试
# "mixtral_8x7B:megatron:BI-V150:1:16:1": "/raid/dataset/mixtral_8x7B", ##四机测试
"llama3_8B:megatron:TG-V200:1:16:1": "/datasets", ##单机测试

# mthreads cases
# "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet",
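For readers new to FlagPerf, one plausible reading of the case key (an assumption inferred from the surrounding comments, not documented in this PR) is model:framework:chip:nnodes:nproc_per_node:repeat:

```python
# Hedged parse of a FlagPerf case key; field meanings are assumed, not documented.
case = "llama3_8B:megatron:TG-V200:1:16:1"
model, framework, chip, nnodes, nproc_per_node, repeat = case.split(":")
print(model, framework, chip, int(nnodes), int(nproc_per_node), int(repeat))
# -> llama3_8B megatron TG-V200 1 16 1 : one node, 16 ranks, run once
```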