
[iluvatar] add TG200 llama3_8B test case #780

Merged (2 commits, Feb 6, 2025)

Changes from 1 commit
5 changes: 4 additions & 1 deletion training/benchmarks/llama3_8B/megatron/run_pretraining.py
@@ -109,7 +109,10 @@ def parse_args():
 time_per_step = float(steptime) / 1000

 whole_tps = 512 * 8192 / time_per_step
-chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
+if args.vendor=="iluvatar":
+    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes) * 2
+else:
+    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
 print("System tokens per second: ", whole_tps)
 print("Tokens/p/s: ", chip_tps)
 print("MFU: ", chip_tps * 8000000000.0 * 6 / theoryflops)
63 changes: 1 addition & 62 deletions training/iluvatar/docker_image/megatron/Dockerfile
@@ -1,63 +1,2 @@
-FROM ubuntu:20.04
+FROM flagperf:bi200
 
-# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source
-# ADD sources.list /etc/apt/
-
-RUN /bin/bash -c "source /root/.bashrc"
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PATH /root/miniconda/bin:$PATH
-
-RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
-RUN apt-get update -y
-RUN apt-get install -y --fix-missing \
-apt-utils \
-sudo \
-openssh-server \
-vim \
-git \
-curl \
-wget \
-tree \
-perl \
-kmod \
-make \
-pciutils \
-build-essential \
-python3.10-dev \
-python3-pip \
-libjpeg-dev \
-zlib1g-dev \
-unzip \
-cmake \
-bzip2 \
-cabextract \
-iputils-ping \
-pbzip2 \
-pv \
-numactl \
-ninja-build \
-gcc-7 \
-g++-7 \
-libncursesw5
-
-# Configure anaconda
-RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_4.10.3-Linux-x86_64.sh && \
-bash ./Miniconda3-py310_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
-/root/miniconda/bin/conda clean -tipsy && \
-ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
-echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
-echo "conda activate base" >> ~/.bashrc && \
-conda config --set always_yes yes --set changeps1 no && \
-echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
-echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc
-
-RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"
-
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7
-
-RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
-
-ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
-ENV PATH="/usr/local/corex/bin:${PATH}"
-ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
7 changes: 7 additions & 0 deletions (new config file; path not shown in this view)
@@ -0,0 +1,7 @@
+tokenizer_path = "/home/zhiyuan/test/llama3-8b/tokenizer.model"
+localbs = 1 # micro-batch-size
+train_steps = 300 ## number of training iterations
+theoryflops = 276000000000000.0 ## the test environment is power-limited, so a lower clock frequency is set; this caps the theoretical peak FLOPS below the officially advertised figure
+megatron_path = "/usr/local/lib/python3.10/dist-packages/megatron" # needs to be aligned with the Dockerfile. In NGC torch, it's /workspace/ + Megatron-LM
+tensor_parallel = 1
+pipeline_parallel = 4
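A hedged cross-check (not part of the PR) of how these settings decompose the single-node, 16-rank TG-V200 case; the global batch of 512 comes from run_pretraining.py above, and the micro batch is the `localbs` set here.

```python
# Sketch: decomposing 1 node x 16 ranks under the parallelism settings above.
nnodes, nproc_per_node = 1, 16
tensor_parallel, pipeline_parallel = 1, 4
global_batch, micro_batch = 512, 1    # 512 from run_pretraining.py; localbs above

world_size = nnodes * nproc_per_node                                  # 16
data_parallel = world_size // (tensor_parallel * pipeline_parallel)   # 4
grad_accum = global_batch // (data_parallel * micro_batch)            # 128
print(world_size, data_parallel, grad_accum)
```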
17 changes: 17 additions & 0 deletions training/iluvatar/llama3_8B-megatron/config/training_adapter.sh
@@ -0,0 +1,17 @@
echo "[Prompt] iluvatar adaption is not NULL, for other Vendors"
export PYTHONPATH=/usr/local/lib/python3.10/dist-packages
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_NET_SHARED_BUFFERS=0
export NCCL_ALGO=Ring
export OMP_NUM_THREADS=4
export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
VENDOR_ARGS=" \
--transformer-impl transformer_engine \
--use-distributed-optimizer \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--no-create-attention-mask-in-dataloader \
--use-legacy-models \
--num-layers-per-stage 1 7 2 9 1 7 \
--disable-bias-linear \
"
2 changes: 1 addition & 1 deletion training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
 '''Cluster configs'''

 # Hosts to run the benchmark. Each item is an IP address or a hostname.
-HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]
+HOSTS = ["localhost"]
Collaborator: Please do not modify the example hosts.

 # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
 HOSTS_PORTS = ["2222"]
9 changes: 6 additions & 3 deletions training/run_benchmarks/config/test_conf.py
@@ -3,7 +3,7 @@

 # Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads, metax and dcu.
 # We will run benchmarks in training/<vendor>
-VENDOR = "nvidia"
+VENDOR = "iluvatar"
Collaborator: Please do not modify the example vendor.

 # Accelerator options for docker. TODO FIXME support more accelerators.
 # possible value of ACCE_CONTAINER_OPT are:

@@ -25,7 +25,8 @@
 # " --device=/dev/infiniband --device=/dev/dri --device=/dev/mxcd --group-add video"
 # dcu:
 # "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video"
-ACCE_CONTAINER_OPT = " --gpus all"
+ACCE_CONTAINER_OPT = " --privileged --pid=host --ipc=host --cap-add=ALL -v /dev:/dev -v /lib/modules:/lib/modules -v /mnt:/mnt -v /usr/src:/usr/src -v /home:/home "
Collaborator: Same as above.


 # XXX_VISIBLE_DEVICE item name in env
 # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
 # CUDA_VISIBLE_DEVICES for nvidia, iluvatar

@@ -42,7 +43,8 @@
 # The path that flagperf deploy in the cluster.
 # Users must set FLAGPERF_PATH to where flagperf deploy
 # You can assume the preset "/home/FlagPerf/training" points to Null
-FLAGPERF_PATH = "/home/FlagPerf/training"
+FLAGPERF_PATH = "/home/zhiyuan/FlagPerf/training"
Collaborator: Same as above.


 # Set log path on the host here.
 FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/"

@@ -169,6 +171,7 @@
# "llava1.5_13b:deepspeed-torch:BI-V150:1:16:1": "/raid/dataset/llava1.5_13b",
# "mixtral_8x7B:megatron:BI-V150:4:16:1": "/raid/dataset/mixtral_8x7B", ##单机测试
# "mixtral_8x7B:megatron:BI-V150:1:16:1": "/raid/dataset/mixtral_8x7B", ##四机测试
"llama3_8B:megatron:TG-V200:1:16:1": "/datasets", ##单机测试

# mthreads cases
# "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet",
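For readers new to FlagPerf, one plausible reading of the case key (an assumption inferred from the surrounding comments, not documented in this PR) is model:framework:chip:nnodes:nproc_per_node:repeat:

```python
# Hedged parse of a FlagPerf case key; field meanings are assumed, not documented.
case = "llama3_8B:megatron:TG-V200:1:16:1"
model, framework, chip, nnodes, nproc_per_node, repeat = case.split(":")
print(model, framework, chip, int(nnodes), int(nproc_per_node), int(repeat))
# -> llama3_8B megatron TG-V200 1 16 1 : one node, 16 ranks, run once
```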