From 707377b3cd7908ff8623bfde891b80f532c50af9 Mon Sep 17 00:00:00 2001 From: Nilesh M Negi Date: Sun, 22 Sep 2024 03:53:16 -0500 Subject: [PATCH] Add Dockerfile to build rccl and rccl-tests (#1011) * [BUILD] Add Dockerfile for RCCL and RCCL-Tests Signed-off-by: nileshnegi * Update docker/Dockerfile.ubuntu Typo for LD_LIBRARY_PATH Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> * Update docker/Dockerfile.ubuntu use `-b` for `git clone` instead of additional `git checkout` Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> * Update docker/Dockerfile.ubuntu Signed-off-by: nileshnegi --------- Signed-off-by: nileshnegi Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> --- README.md | 37 +++++++++++-- docker/Dockerfile.ubuntu | 109 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 docker/Dockerfile.ubuntu diff --git a/README.md b/README.md index 71983b255..2fe91d0a5 100644 --- a/README.md +++ b/README.md @@ -96,11 +96,42 @@ $ make package $ sudo dpkg -i *.deb ``` -RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so. +RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`. + +## Docker build + +Assuming you have docker installed on your system: + +#### To build the docker image : + +By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch). +```shell +$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull . 
+``` + +The base docker image, rccl repo, and rccl-tests repo can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image: +```shell +$ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --pull . +``` + +#### To start an interactive docker container on a system with AMD GPUs : + +```shell +$ docker run -it --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined rccl-tests /bin/bash +``` + +#### To run rccl-tests (all_reduce_perf) on 8 AMD GPUs (inside the docker container) : + +```shell +$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 +``` + +For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests. + ## Enabling peer-to-peer transport -In order to enable peer-to-peer access on machines with PCIe-connected GPUs, the HSA environment variable HSA_FORCE_FINE_GRAIN_PCIE=1 is required to be set, on top of requiring GPUs that support peer-to-peer access and proper large BAR addressing support. +In order to enable peer-to-peer access on machines with PCIe-connected GPUs, the HSA environment variable `HSA_FORCE_FINE_GRAIN_PCIE=1` is required to be set, on top of requiring GPUs that support peer-to-peer access and proper large BAR addressing support.
## Tests @@ -111,7 +142,7 @@ rccl unit test names are now of the format: CollectiveCall.[Type of test] -Filtering of rccl unit tests should be done with environment variable and by passing the --gtest_filter command line flag, for example: +Filtering of rccl unit tests should be done with environment variable and by passing the `--gtest_filter` command line flag, for example: ```shell UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
new file mode 100644
index 000000000..1866ba169
--- /dev/null
+++ b/docker/Dockerfile.ubuntu
@@ -0,0 +1,109 @@
+## base docker image
+## ROCM_IMAGE_NAME and ROCM_IMAGE_TAG can be overridden with --build-arg
+ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
+ARG ROCM_IMAGE_TAG=latest
+FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"
+
+## scratch space; WORKDIR creates the directory when it is missing
+WORKDIR /workspace
+
+## install dependencies (package list sorted, one per line; apt lists are
+## removed in the same layer that created them)
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        chrpath \
+        curl \
+        doxygen \
+        gfortran \
+        git \
+        lcov \
+        libbfd-dev \
+        libboost-all-dev \
+        liblzma-dev \
+        libncursesw5-dev \
+        libnuma-dev \
+        libnuma1 \
+        libomp-dev \
+        libomp5 \
+        libpthread-stubs0-dev \
+        libssl-dev \
+        libzstd-dev \
+        lshw \
+        make \
+        ninja-build \
+        pkg-config \
+        python3-dev \
+        python3-pip \
+        python3-setuptools \
+        python3-tk \
+        python3-venv \
+        python3-yaml \
+        rocm-cmake \
+        unzip \
+        wget \
+        xz-utils \
+        zip \
+        zlib1g-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+## install CMake 3.28.0 into /usr from the upstream self-extracting installer
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \
+    && chmod +x cmake-3.28.0-linux-x86_64.sh \
+    && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \
+    && rm cmake-3.28.0-linux-x86_64.sh
+
+## number of parallel make jobs for the UCX, OpenMPI and RCCL-Tests builds
+ARG BUILD_JOBS=16
+
+## Install UCX
+## built out-of-tree from the release tarball; sources are removed afterwards
+ENV UCX_INSTALL_PREFIX=/opt/ucx
+RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \
+    && mkdir -p ucx \
+    && tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \
+    && cd ucx \
+    && mkdir build \
+    && cd build \
+    && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=/opt/rocm \
+    && make -j"${BUILD_JOBS}" install \
+    && cd ../.. \
+    && rm -rf ucx ucx-1.16.0.tar.gz
+
+## Install OpenMPI
+## configured against the UCX install above
+ENV MPI_INSTALL_PREFIX=/opt/ompi
+RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \
+    && mkdir -p ompi4 \
+    && tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \
+    && cd ompi4 \
+    && mkdir build \
+    && cd build \
+    && ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \
+    && make -j"${BUILD_JOBS}" install \
+    && cd ../.. \
+    && rm -rf ompi4 openmpi-4.1.6.tar.gz
+
+## building RCCL
+## repo/branch ARGs sit next to their use so overriding them does not
+## invalidate the cached dependency layers above
+ARG RCCL_REPO=https://github.com/ROCm/rccl
+ARG RCCL_BRANCH=develop
+ENV RCCL_INSTALL_PREFIX=/opt/rocm
+RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl \
+    && cd ./rccl \
+    && ./install.sh -t --prefix=${RCCL_INSTALL_PREFIX}
+
+## building RCCL-Tests
+ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
+ARG RCCL_TESTS_BRANCH=develop
+RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
+    && cd ./rccl-tests \
+    && make MPI=1 MPI_HOME=${MPI_INSTALL_PREFIX} NCCL_HOME=${RCCL_INSTALL_PREFIX} -j"${BUILD_JOBS}"
+
+## set environment variables
+## the inherited LD_LIBRARY_PATH is appended only when set, avoiding a trailing ':'
+ENV PATH="${RCCL_INSTALL_PREFIX}/bin:${MPI_INSTALL_PREFIX}/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}/lib:${MPI_INSTALL_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"