diff --git a/training/benchmarks/llama3_8B/megatron/run_pretraining.py b/training/benchmarks/llama3_8B/megatron/run_pretraining.py
index ff64936e6..810abf2d0 100644
--- a/training/benchmarks/llama3_8B/megatron/run_pretraining.py
+++ b/training/benchmarks/llama3_8B/megatron/run_pretraining.py
@@ -109,7 +109,10 @@ def parse_args():
     time_per_step = float(steptime) / 1000
     whole_tps = 512 * 8192 / time_per_step
-    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
+    if args.vendor == "iluvatar":
+        chip_tps = whole_tps / (args.nproc_per_node * args.nnodes) * 2
+    else:
+        chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
     print("System tokens per second: ", whole_tps)
     print("Tokens/p/s: ", chip_tps)
     print("MFU: ", chip_tps * 8000000000.0 * 6 / theoryflops)
diff --git a/training/iluvatar/docker_image/megatron/Dockerfile b/training/iluvatar/docker_image/megatron/Dockerfile
index a58fe6269..0f83e55ad 100755
--- a/training/iluvatar/docker_image/megatron/Dockerfile
+++ b/training/iluvatar/docker_image/megatron/Dockerfile
@@ -1,63 +1,2 @@
-FROM ubuntu:20.04
+FROM flagperf:bi200
 
-# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source
-# ADD sources.list /etc/apt/
-
-RUN /bin/bash -c "source /root/.bashrc"
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PATH /root/miniconda/bin:$PATH
-RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
-RUN apt-get update -y
-RUN apt-get install -y --fix-missing \
-    apt-utils \
-    sudo \
-    openssh-server \
-    vim \
-    git \
-    curl \
-    wget \
-    tree \
-    perl \
-    kmod \
-    make \
-    pciutils \
-    build-essential \
-    python3.10-dev \
-    python3-pip \
-    libjpeg-dev \
-    zlib1g-dev \
-    unzip \
-    cmake \
-    bzip2 \
-    cabextract \
-    iputils-ping \
-    pbzip2 \
-    pv \
-    numactl \
-    ninja-build \
-    gcc-7 \
-    g++-7 \
-    libncursesw5
-
-# Configure anaconda
-RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_4.10.3-Linux-x86_64.sh && \
-    bash ./Miniconda3-py310_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
-    /root/miniconda/bin/conda clean -tipsy && \
-    ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
-    echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
-    echo "conda activate base" >> ~/.bashrc && \
-    conda config --set always_yes yes --set changeps1 no && \
-    echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
-    echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc
-
-RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"
-
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7
-
-RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
-
-ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
-ENV PATH="/usr/local/corex/bin:${PATH}"
-ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
diff --git a/training/iluvatar/llama3_8B-megatron/config/config_TG-V200x1x16.py b/training/iluvatar/llama3_8B-megatron/config/config_TG-V200x1x16.py
new file mode 100755
index 000000000..0e0cf537c
--- /dev/null
+++ b/training/iluvatar/llama3_8B-megatron/config/config_TG-V200x1x16.py
@@ -0,0 +1,7 @@
+tokenizer_path = "/home/zhiyuan/test/llama3-8b/tokenizer.model"
+localbs = 1  # micro-batch-size
+train_steps = 300  # number of training iterations
+theoryflops = 276000000000000.0  # the test environment is power-limited, so a lower core clock is used; this theoretical peak FLOPS is below the officially advertised figure
+megatron_path = "/usr/local/lib/python3.10/dist-packages/megatron"  # must be aligned with the Dockerfile; in the NGC torch image it is /workspace/ + Megatron-LM
+tensor_parallel = 1
+pipeline_parallel = 4
diff --git a/training/iluvatar/llama3_8B-megatron/config/training_adapter.sh b/training/iluvatar/llama3_8B-megatron/config/training_adapter.sh
new file mode 100755
index 000000000..e6e658aa8
--- /dev/null
+++ b/training/iluvatar/llama3_8B-megatron/config/training_adapter.sh
@@ -0,0 +1,17 @@
+echo "[Prompt] iluvatar adaption is not NULL, for other Vendors"
+export PYTHONPATH=/usr/local/lib/python3.10/dist-packages
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_NET_SHARED_BUFFERS=0
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=4
+export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
+VENDOR_ARGS=" \
+    --transformer-impl transformer_engine \
+    --use-distributed-optimizer \
+    --use-flash-attn \
+    --untie-embeddings-and-output-weights \
+    --no-create-attention-mask-in-dataloader \
+    --use-legacy-models \
+    --num-layers-per-stage 1 7 2 9 1 7 \
+    --disable-bias-linear \
+"
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index a6ef8b029..2bae03dec 100755
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -26,6 +26,7 @@
 # dcu:
 #   "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video"
 ACCE_CONTAINER_OPT = " --gpus all"
+
 # XXX_VISIBLE_DEVICE item name in env
 # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
 #   CUDA_VISIBLE_DEVICES for nvidia, iluvatar
@@ -43,6 +44,7 @@
 # Users must set FLAGPERF_PATH to where flagperf deploy
 # You can assume the preset "/home/FlagPerf/training" points to Null
 FLAGPERF_PATH = "/home/FlagPerf/training"
+
 # Set log path on the host here.
 FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/"
@@ -169,6 +171,7 @@
     # "llava1.5_13b:deepspeed-torch:BI-V150:1:16:1": "/raid/dataset/llava1.5_13b",
     # "mixtral_8x7B:megatron:BI-V150:4:16:1": "/raid/dataset/mixtral_8x7B",  # single-node test
     # "mixtral_8x7B:megatron:BI-V150:1:16:1": "/raid/dataset/mixtral_8x7B",  # four-node test
+    "llama3_8B:megatron:TG-V200:1:16:1": "/datasets",  # single-node test
     # mthreads cases
     # "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet",
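
For context on the metric change above, the following is a minimal, self-contained sketch (not part of the patch) of the arithmetic the modified run_pretraining.py performs for the single-node TG-V200 case added in test_conf.py (16 processes, 1 node) with the power-limited theoryflops from config_TG-V200x1x16.py. The function name and the example step time are illustrative only.

# Sketch of the throughput/MFU computation in the patched run_pretraining.py:
# global batch 512, sequence length 8192, 8B parameters, 6 FLOPs per parameter per token.
GLOBAL_BATCH = 512
SEQ_LEN = 8192
PARAMS = 8_000_000_000
THEORYFLOPS = 276_000_000_000_000.0  # power-limited TG-V200 value from config_TG-V200x1x16.py
NPROC_PER_NODE, NNODES = 16, 1       # from the "llama3_8B:megatron:TG-V200:1:16:1" case


def throughput_metrics(steptime_ms: float, vendor: str = "iluvatar"):
    time_per_step = steptime_ms / 1000
    whole_tps = GLOBAL_BATCH * SEQ_LEN / time_per_step
    chip_tps = whole_tps / (NPROC_PER_NODE * NNODES)
    if vendor == "iluvatar":
        chip_tps *= 2  # vendor-specific scaling introduced by this patch
    mfu = chip_tps * PARAMS * 6 / THEORYFLOPS
    return whole_tps, chip_tps, mfu


# Hypothetical step time of 30 s per iteration, purely for illustration:
print(throughput_metrics(30_000))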