Skip to content

Commit

Permalink
Update the dockerfile base image to cuda-dl-base (#1248)
Browse files Browse the repository at this point in the history
Update the base docker image to `cuda-dl-base`

---------

Co-authored-by: STEFANO BOSISIO <[email protected]>
  • Loading branch information
Steboss and STEFANO BOSISIO authored Jan 27, 2025
1 parent e57ade9 commit 1a13844
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 326 deletions.
75 changes: 7 additions & 68 deletions .github/container/Dockerfile.base
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
ARG GIT_USER_NAME="JAX Toolbox"
ARG GIT_USER_EMAIL=jax@nvidia.com
ARG CLANG_VERSION=18
ARG JAX_TOOLBOX_REF

###############################################################################
## Obtain GCP's NCCL TCPx plugin
###############################################################################

FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64

# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
FROM ubuntu AS tcpx-installer-arm64
RUN <<"OUTEREOF" bash -ex
mkdir -p /scripts /var/lib/tcpx/lib64
echo '#!/bin/bash' > /scripts/container_entry.sh
chmod +x /scripts/container_entry.sh
OUTEREOF

FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
RUN /scripts/container_entry.sh install

###############################################################################
## Build base image
###############################################################################
Expand Down Expand Up @@ -153,50 +136,18 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*

###############################################################################
## Install TCPx
###############################################################################

ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}

###############################################################################
## Install the latest versions of Nsight Systems and Nsight Compute
###############################################################################

ADD install-nsight.sh /usr/local/bin
RUN install-nsight.sh

###############################################################################
## Install cuDNN
## Symlink for cuDNN
###############################################################################

ADD install-cudnn.sh /usr/local/bin
RUN install-cudnn.sh
ADD symlnk-cudnn.sh /usr/local/bin
RUN symlnk-cudnn.sh

###############################################################################
## Install NCCL
## Symlink for NCCL
###############################################################################

ADD install-nccl.sh /usr/local/bin
RUN install-nccl.sh

###############################################################################
## RoCE and InfiniBand support
###############################################################################

ADD install-ofed.sh /usr/local/bin
RUN install-ofed.sh

##############################################################################
## Amazon EFA support (need to run it inside container separately)
##############################################################################

ADD --chmod=777 \
install-efa.sh \
test-aws-efa.sh \
/usr/local/bin/
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
ENV PATH=/opt/amazon/efa/bin:${PATH}
ADD symlnk-nccl.sh /usr/local/bin
RUN symlnk-nccl.sh

##############################################################################
## NCCL sanity check utility
Expand All @@ -207,18 +158,6 @@ ADD nccl-sanity-check.cu /opt
RUN install-nccl-sanity-check.sh
ADD jax-nccl-test parallel-launch /usr/local/bin/

###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################

COPY check-shm.sh /opt/nvidia/entrypoint.d/

###############################################################################
## Add the GCP - TCPX check to the entrypoint.
###############################################################################

# TODO(chaserileyroberts): Reenable once fully tested on GCP.
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/

###############################################################################
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems
Expand Down
19 changes: 0 additions & 19 deletions .github/container/check-shm.sh

This file was deleted.

72 changes: 0 additions & 72 deletions .github/container/install-cudnn.sh

This file was deleted.

37 changes: 0 additions & 37 deletions .github/container/install-efa.sh

This file was deleted.

58 changes: 0 additions & 58 deletions .github/container/install-nccl.sh

This file was deleted.

18 changes: 0 additions & 18 deletions .github/container/install-nsight.sh

This file was deleted.

42 changes: 0 additions & 42 deletions .github/container/install-ofed.sh

This file was deleted.

41 changes: 41 additions & 0 deletions .github/container/symlnk-cudnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Build a "link farm" for cuDNN under /opt/nvidia/cudnn.
#
# Every regular file or symlink shipped by the installed libcudnn* Debian
# packages is symlinked into a prefix with plain include/ and lib/ directories.
# This is useful to pass to XLA to avoid it fetching its own copy of cuDNN.
#
# Exit status:
#   1  if the prefix directory already exists, or if no libcudnn* package is
#      installed; any failing command also aborts the script (set -e).

set -ex

CUDNN_MAJOR_VERSION=9

prefix=/opt/nvidia/cudnn
if [[ -d "${prefix}" ]]; then
  # NOTE(review): the message says "Skipping" but the script exits non-zero,
  # which fails a Docker RUN step. Behaviour kept as-is; confirm the intent.
  echo "Skipping link farm creation" >&2
  exit 1
fi

# Debian multiarch directory name, e.g. x86_64-linux-gnu
arch=$(uname -m)-linux-gnu

# Names of all installed ("ii") packages matching libcudnn*
libcudnn_pkgs=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
if [[ -z "${libcudnn_pkgs}" ]]; then
  echo "No libcudnn packages installed." >&2
  exit 1
fi

# Read the file list line by line so that paths are never word-split or
# glob-expanded by the shell. ${libcudnn_pkgs} is deliberately left unquoted:
# it holds one package name per line and must expand to separate arguments.
while IFS= read -r cudnn_file; do
  # Ignore empty lines in the listing (the original word-splitting loop
  # dropped them implicitly).
  [[ -n "${cudnn_file}" ]] || continue

  # Real files and symlinks are linked into $prefix
  if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
    # Strip the leading /usr/ so the path can be re-rooted under $prefix
    nosysprefix="${cudnn_file#"/usr/"}"
    # include/x86_64-linux-gnu -> include/
    noarchinclude="${nosysprefix/#"include/${arch}"/include}"
    # cudnn_v9.h -> cudnn.h
    noverheader="${noarchinclude/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
    # lib/x86_64-linux-gnu -> lib/
    noarchlib="${noverheader/#"lib/${arch}"/lib}"
    link_name="${prefix}/${noarchlib}"
    link_dir=$(dirname "${link_name}")
    mkdir -p "${link_dir}"
    # No -f: a name collision makes ln fail and, under set -e, aborts the script
    ln -s "${cudnn_file}" "${link_name}"
  else
    echo "Skipping ${cudnn_file}"
  fi
done < <(dpkg -L ${libcudnn_pkgs} | sort -u)
Loading

0 comments on commit 1a13844

Please sign in to comment.