From 6b737c7119a62faf5d7683a3721eec0a3e0c3a07 Mon Sep 17 00:00:00 2001 From: dafeliton Date: Wed, 18 Dec 2024 18:36:27 -0800 Subject: [PATCH] rewrite scipy, fix rstudio --- images/rstudio-notebook/Dockerfile | 9 ++- images/scipy-ml-notebook/Dockerfile | 118 +++++++++++++--------------- 2 files changed, 60 insertions(+), 67 deletions(-) diff --git a/images/rstudio-notebook/Dockerfile b/images/rstudio-notebook/Dockerfile index ff32beed..7d5142b4 100644 --- a/images/rstudio-notebook/Dockerfile +++ b/images/rstudio-notebook/Dockerfile @@ -7,17 +7,22 @@ USER root # Ubuntu 22 setup with v 2024.04.2-764 ## Follow instructions at https://www.rstudio.com/products/rstudio/download-server/ +## https://posit.co/code-signing/ - find latest pubkey here ENV RSTUDIO_PKG=rstudio-server-2024.09.1-394-amd64.deb ENV RSTUDIO_URL=https://download2.rstudio.org/server/jammy/amd64/${RSTUDIO_PKG} +ENV RSTUDIO_PUBKEY=51C0B5BB19F92D60 ## rstudio installation expects R to live in /usr/bin, /bin/, etc. RUN ln -s /opt/conda/bin/R /usr/bin/R && \ apt-get update && \ - apt-get -qq install -y apt-utils gdebi-core dpkg-sig && \ + apt-get -qq install -y apt-utils gdebi-core wget gnupg && \ wget ${RSTUDIO_URL} && \ + gpg --keyserver keys.openpgp.org --recv-keys ${RSTUDIO_PUBKEY} && \ + gpg --verify ${RSTUDIO_PKG} && \ gdebi -n ${RSTUDIO_PKG} && \ rm -f ${RSTUDIO_PKG} && \ - echo '/opt/conda/lib/R/lib' > /etc/ld.so.conf.d/r.conf && /sbin/ldconfig -v && \ + echo '/opt/conda/lib/R/lib' > /etc/ld.so.conf.d/r.conf && \ + /sbin/ldconfig -v && \ apt-get clean && rm -rf /var/lib/apt/lists/* && \ rm -f /usr/bin/R && \ chmod -R g=u /var/lib/rstudio-server && \ diff --git a/images/scipy-ml-notebook/Dockerfile b/images/scipy-ml-notebook/Dockerfile index 97e3635a..11a26415 100644 --- a/images/scipy-ml-notebook/Dockerfile +++ b/images/scipy-ml-notebook/Dockerfile @@ -3,23 +3,21 @@ FROM ghcr.io/ucsd-ets/datascience-notebook:${BASE_TAG} USER root -# tensorflow, pytorch stable versions -# https://pytorch.org/get-started/previous-versions/ -# https://www.tensorflow.org/install/source#linux - -# Python/Mamba deps -## Package versions -## tf 2.13 does not work with torch 2.2.1. Both require conflicting versions of typing-extensions -ARG CUDA_VERSION=12.1 CUDNN_VERSION=8.9.2.26 LIBNVINFER=7.2.2 LIBNVINFER_MAJOR_VERSION=7 \ - TENSORFLOW_VERSION=2.17.0 KERAS_VERSION=3.5.0 TENSORRT_VERSION=8.6.1 TORCH_VERSION=2.3.1 \ - PROTOBUF_VERSION=3.20.3 +# Package versions (adjust as needed) +ARG CUDA_VERSION=12.1 +ARG CUDNN_VERSION=8.9.7.29 +ARG TENSORFLOW_VERSION=2.17.0 +ARG KERAS_VERSION=3.5.0 +ARG TENSORRT_VERSION=8.6.1 +ARG TORCH_VERSION=2.3.1 +ARG PROTOBUF_VERSION=3.20.3 # apt deps RUN apt-get update && \ - apt-get install -y \ - libtinfo5 build-essential && \ + apt-get install -y libtinfo5 build-essential && \ apt-get clean && rm -rf /var/lib/apt/lists/* -## Symbolic link for Stata 17 dependency on libncurses5 + +# Symbolic link for Stata 17 dependency on libncurses5 RUN ln -s libncurses.so.6 /usr/lib/x86_64-linux-gnu/libncurses.so.5 # Jupyter setup @@ -37,82 +35,72 @@ ADD manual_tests /opt/manual_tests RUN chmod 777 /etc/datahub-profile.d/*.sh /tmp/activate.sh -# cudnn (TBD) -#RUN apt update && apt install -y wget && \ -# wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/libcudnn8_8.9.6.50-1+cuda11.8_amd64.deb && \ -# dpkg -i libcudnn8_8.9.6.50-1+cuda11.8_amd64.deb && \ -# rm libcudnn8_8.9.6.50-1+cuda11.8_amd64.deb && \ -# apt-get clean && \ -# rm -rf /var/lib/apt/lists/* - +# Switch to non-root user for installing packages via mamba/pip USER jovyan # Install nvdashboard for GPU monitoring -RUN mamba install -c rapidsai-nightly -c conda-forge jupyterlab-nvdashboard - -# CUDA setup w/mamba -## TODO: Investigate this command, seems to duplicate cuda packages for nvidia (pypi + conda-forge). -# cuda-toolkit is a skeleton package on CUDA 12, unlike CUDA <= 11 -RUN mamba install -c "nvidia/label/cuda-12.1.1" cuda-nvcc \ - cuda-toolkit=$CUDA_VERSION \ - cuda-version=$CUDA_VERSION \ +RUN mamba install -c rapidsai-nightly -c conda-forge jupyterlab-nvdashboard && \ + mamba clean -a -y + +# Install CUDA toolkit, NCCL, cuDNN via Conda +RUN mamba install -c "nvidia/label/cuda-12.1.1" \ + cuda-nvcc \ + cuda-toolkit=${CUDA_VERSION} \ + cuda-version=${CUDA_VERSION} \ nccl \ + cudnn=${CUDNN_VERSION} \ -y && \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER && \ mamba clean -a -y -# Install scipy pip packages -## install protobuf to avoid weird base type error. seems like if we don't then it'll be installed twice. -## https://github.com/spesmilo/electrum/issues/7825 -## pip cache purge didnt work here for some reason. -RUN pip install --no-cache-dir protobuf==$PROTOBUF_VERSION -## cuda-python installed to have parity with tensorflow and cudnn -## Install pillow<7 due to dependency issue https://github.com/pytorch/vision/issues/1712 -## tensorrt installed to fix not having libnvinfer that has caused tensorflow issues. -RUN pip install opencv-contrib-python-headless \ - opencv-python && \ - fix-permissions $CONDA_DIR && \ +# Install protobuf via pip to ensure a specific version +RUN pip install --no-cache-dir protobuf==${PROTOBUF_VERSION} + +# Install other Python packages that are simpler via pip +RUN pip install --no-cache-dir opencv-contrib-python-headless opencv-python && \ + fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER && \ pip cache purge +# Install common packages via conda-forge RUN mamba install -c conda-forge pyqt pycocotools pillow scapy && \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER && \ mamba clean --all -# Install CUDA/Torch/Tensorflow/Keras w/pip -# TF Compatibility Matrix: https://www.tensorflow.org/install/source?hl=en#gpu -## no purge required but no-cache-dir is used. pip purge will actually break the build here! -## Beware of potentially needing to update these if we update the drivers. -## Check tensorrt_env_vars.sh if you have to bump tensorrt! -RUN pip install nvidia-cudnn-cu12==$CUDNN_VERSION torch==$TORCH_VERSION torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 && \ - pip install tensorflow==$TENSORFLOW_VERSION tensorflow-datasets tensorrt==$TENSORRT_VERSION keras==$KERAS_VERSION tf-keras==$TENSORFLOW_VERSION && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER && \ - mamba clean -a -y && \ - pip cache purge - -RUN pip install transformers datasets accelerate huggingface-hub timm && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER && \ - mamba clean -a -y && \ - pip cache purge +# Install PyTorch and GPU support from Conda +# Use pytorch & nvidia channels to ensure proper CUDA integration +RUN mamba install pytorch==${TORCH_VERSION} torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER && \ + mamba clean -a -y + +# Install TensorFlow, Keras, and TF datasets from conda-forge if available +# Note: Check if these versions are available and GPU-accelerated on conda-forge. +RUN mamba install tensorflow==${TENSORFLOW_VERSION} keras==${KERAS_VERSION} tensorflow-datasets -c conda-forge -y && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER && \ + mamba clean -a -y + +# Additional ML packages via pip +RUN pip install --no-cache-dir transformers datasets accelerate huggingface-hub timm && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER && \ + mamba clean -a -y && \ + pip cache purge USER $NB_UID:$NB_GID ENV PATH=${PATH}:/usr/local/nvidia/bin:/opt/conda/bin # CUDA fixes for CONDA -## Copy libdevice file to the required path RUN mkdir -p $CONDA_DIR/lib/nvvm/libdevice && \ cp $CONDA_DIR/nvvm/libdevice/libdevice.10.bc $CONDA_DIR/lib/nvvm/libdevice/ - #CUDA 11: cp $CONDA_DIR/lib/libdevice.10.bc $CONDA_DIR/lib/nvvm/libdevice/ -# TensorRT fix for tensorflow -## https://github.com/tensorflow/tensorflow/issues/61468 (could not find TensorRT) -## This will most definitely have to be changed after 8.6.1... -RUN ln -s /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer_plugin.so.8 /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer_plugin.so.$TENSORRT_VERSION && \ - ln -s /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer.so.8 /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer.so.$TENSORRT_VERSION +# TensorRT fix for TensorFlow (if needed) +# Adjust paths as necessary, depending on how tensorrt is installed. +#RUN ln -s /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer_plugin.so.8 /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer_plugin.so.${TENSORRT_VERSION} && \ +# ln -s /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer.so.8 /opt/conda/lib/python3.11/site-packages/tensorrt_libs/libnvinfer.so.${TENSORRT_VERSION} # Run datahub scripts -RUN . /tmp/activate.sh +RUN . /tmp/activate.sh \ No newline at end of file