-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update the dockerfile base image to cuda-dl-base (#1248)
Update the base docker image to `cuda-dl-base` --------- Co-authored-by: STEFANO BOSISIO <[email protected]>
- Loading branch information
Showing
10 changed files
with
82 additions
and
326 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,10 @@ | ||
# syntax=docker/dockerfile:1-labs | ||
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04 | ||
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 | ||
ARG GIT_USER_NAME="JAX Toolbox" | ||
ARG [email protected] | ||
ARG CLANG_VERSION=18 | ||
ARG JAX_TOOLBOX_REF | ||
|
||
############################################################################### | ||
## Obtain GCP's NCCL TCPx plugin | ||
############################################################################### | ||
|
||
FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64 | ||
|
||
# make a stub arm64 container because GCP does not provide an arm64 version of the plugin | ||
FROM ubuntu AS tcpx-installer-arm64 | ||
RUN <<"OUTEREOF" bash -ex | ||
mkdir -p /scripts /var/lib/tcpx/lib64 | ||
echo '#!/bin/bash' > /scripts/container_entry.sh | ||
chmod +x /scripts/container_entry.sh | ||
OUTEREOF | ||
|
||
FROM tcpx-installer-${TARGETARCH} AS tcpx-installer | ||
RUN /scripts/container_entry.sh install | ||
|
||
############################################################################### | ||
## Build base image | ||
############################################################################### | ||
|
@@ -153,50 +136,18 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1 | |
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/* | ||
|
||
############################################################################### | ||
## Install TCPx | ||
############################################################################### | ||
|
||
ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64 | ||
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH} | ||
|
||
############################################################################### | ||
## Install the latest versions of Nsight Systems and Nsight Compute | ||
############################################################################### | ||
|
||
ADD install-nsight.sh /usr/local/bin | ||
RUN install-nsight.sh | ||
|
||
############################################################################### | ||
## Install cuDNN | ||
## Symlink for cuDNN | ||
############################################################################### | ||
|
||
ADD install-cudnn.sh /usr/local/bin | ||
RUN install-cudnn.sh | ||
ADD symlnk-cudnn.sh /usr/local/bin | ||
RUN symlnk-cudnn.sh | ||
|
||
############################################################################### | ||
## Install NCCL | ||
## Symlink for NCCL | ||
############################################################################### | ||
|
||
ADD install-nccl.sh /usr/local/bin | ||
RUN install-nccl.sh | ||
|
||
############################################################################### | ||
## RoCE and InfiniBand support | ||
############################################################################### | ||
|
||
ADD install-ofed.sh /usr/local/bin | ||
RUN install-ofed.sh | ||
|
||
############################################################################## | ||
## Amazon EFA support (need to run it inside container separately) | ||
############################################################################## | ||
|
||
ADD --chmod=777 \ | ||
install-efa.sh \ | ||
test-aws-efa.sh \ | ||
/usr/local/bin/ | ||
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH} | ||
ENV PATH=/opt/amazon/efa/bin:${PATH} | ||
ADD symlnk-nccl.sh /usr/local/bin | ||
RUN symlnk-nccl.sh | ||
|
||
############################################################################## | ||
## NCCL sanity check utility | ||
|
@@ -207,18 +158,6 @@ ADD nccl-sanity-check.cu /opt | |
RUN install-nccl-sanity-check.sh | ||
ADD jax-nccl-test parallel-launch /usr/local/bin/ | ||
|
||
############################################################################### | ||
## Add the systemcheck to the entrypoint. | ||
############################################################################### | ||
|
||
COPY check-shm.sh /opt/nvidia/entrypoint.d/ | ||
|
||
############################################################################### | ||
## Add the GCP - TCPX check to the entrypoint. | ||
############################################################################### | ||
|
||
# TODO(chaserileyroberts): Reenable once fully tested on GCP. | ||
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/ | ||
|
||
############################################################################### | ||
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems | ||
|
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#!/bin/bash
#
# Create a prefix (/opt/nvidia/cudnn) with include/ and lib/ directories containing
# symlinks to the files shipped by the installed libcudnn* Debian packages; this is
# useful to pass to XLA to avoid it fetching its own copy of cuDNN.
#
# Exit status:
#   0  the link farm was created
#   1  the prefix directory already exists, or no libcudnn package is installed
#   non-zero for any other failing command (because of `set -e`)

set -ex

CUDNN_MAJOR_VERSION=9

prefix=/opt/nvidia/cudnn
if [[ -d "${prefix}" ]]; then
  # NOTE(review): the message says "Skipping" but the script exits non-zero, which
  # aborts a Dockerfile RUN step. Behaviour is kept as-is; confirm whether an
  # existing prefix should really be treated as an error.
  echo "Skipping link farm creation"
  exit 1
fi

arch=$(uname -m)-linux-gnu
# Keep only the packages that dpkg reports in the installed ("ii") state.
libcudnn_pkgs=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
if [[ -z "${libcudnn_pkgs}" ]]; then
  echo "No libcudnn packages installed."
  exit 1
fi

# Read the package file list one path per line rather than word-splitting a command
# substitution, so paths are never split on whitespace or glob-expanded.
# ${libcudnn_pkgs} is intentionally unquoted: it holds one package name per line and
# must expand to one argument per package.
while IFS= read -r cudnn_file; do
  # Ignore blank lines, as the word-splitting loop this replaces did implicitly.
  [[ -n "${cudnn_file}" ]] || continue
  # Real files and symlinks are linked into $prefix
  if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
    # Strip the leading /usr/ so the path can be re-rooted under $prefix
    nosysprefix="${cudnn_file#"/usr/"}"
    # include/x86_64-linux-gnu -> include/
    noarchinclude="${nosysprefix/#"include/${arch}"/include}"
    # cudnn_v9.h -> cudnn.h
    noverheader="${noarchinclude/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
    # lib/x86_64-linux-gnu -> lib/
    noarchlib="${noverheader/#"lib/${arch}"/lib}"
    link_name="${prefix}/${noarchlib}"
    link_dir=$(dirname "${link_name}")
    mkdir -p "${link_dir}"
    ln -s "${cudnn_file}" "${link_name}"
  else
    # Directories and anything else that is not a file or symlink are not linked.
    echo "Skipping ${cudnn_file}"
  fi
done < <(dpkg -L ${libcudnn_pkgs} | sort -u)
Oops, something went wrong.