From 2ad3512466c45f4c9ee5c10539a20ff295583331 Mon Sep 17 00:00:00 2001
From: clee2000 <44682903+clee2000@users.noreply.github.com>
Date: Tue, 26 Mar 2024 19:02:15 -0700
Subject: [PATCH] Build docker image (#2542)

Downloading pip dependencies takes 2-4 min, but downloading the new docker image takes +1min from the pytorch one
Not sure how downloading all dependencies to a new image takes in comparison to pulling docker image

Docker image scripts are copied from pytorch, I tried to remove stuff but there's definitely a lot left over
---
Review notes (text between the three-dash line and the first diff header is
not part of the commit message):

 * Line breaks and indentation were rebuilt from a whitespace-flattened copy
   of this patch. Hunk line counts were cross-checked against the @@ headers,
   but the indentation of context lines in build-tutorials.yml is a best
   guess -- TODO confirm against the target tree before applying.
 * Blob ids on the "index" lines were NOT recomputed for files whose added
   content changed below.
 * Fixes made while rebuilding:
   . docker-build.yml: the path filters named docker-builds.yml, a file this
     patch does not add; they now name docker-build.yml so edits to the
     workflow trigger it.
   . common_utils.sh: helpers forward arguments with "$@" instead of
     unquoted $*, so arguments containing spaces or globs survive intact.
   . Dockerfile: ENV uses the key=value form (the space-separated form is
     legacy); resulting environment is unchanged.
   . .ci/docker/requirements.txt: "nbformat>==5.9.2" -> "nbformat>=5.9.2"
     and a comment typo; the deleted-file hunk keeps the old text so it
     still matches the tree.
   . build-tutorials.yml (second job): dropped the leftover
     'docker exec -u root -i ... bash' that had no input, matching the
     first job; hunk header and totals adjusted accordingly.

 .ci/docker/Dockerfile                  | 23 +++++++++
 .ci/docker/build.sh                    | 24 +++++++++
 .ci/docker/common/common_utils.sh      | 26 ++++++++++
 .ci/docker/common/install_base.sh      | 47 +++++++++++++++++
 .ci/docker/common/install_conda.sh     | 54 ++++++++++++++++++++
 .ci/docker/common/install_docs_reqs.sh | 21 ++++++++
 .ci/docker/requirements.txt            | 70 +++++++++++++++++++++++++
 .github/workflows/build-tutorials.yml  | 60 +++++++---------------
 .github/workflows/docker-build.yml     | 59 +++++++++++++++++++++
 .jenkins/build.sh                      |  7 ---
 requirements.txt                       | 71 +-------------------------
 11 files changed, 343 insertions(+), 119 deletions(-)
 create mode 100644 .ci/docker/Dockerfile
 create mode 100755 .ci/docker/build.sh
 create mode 100644 .ci/docker/common/common_utils.sh
 create mode 100644 .ci/docker/common/install_base.sh
 create mode 100644 .ci/docker/common/install_conda.sh
 create mode 100644 .ci/docker/common/install_docs_reqs.sh
 create mode 100644 .ci/docker/requirements.txt
 create mode 100644 .github/workflows/docker-build.yml
 mode change 100644 => 120000 requirements.txt

diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile
new file mode 100644
index 0000000000..e77f472eb8
--- /dev/null
+++ b/.ci/docker/Dockerfile
@@ -0,0 +1,23 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies (so that this step can be cached separately)
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
+RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
+
+# Install conda and other packages
+ENV ANACONDA_PYTHON_VERSION=3.10
+ENV CONDA_CMAKE=yes
+ENV DOCS=yes
+ENV PATH=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY ./requirements.txt /opt/conda/
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements.txt
+
+CMD ["bash"]
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
new file mode 100755
index 0000000000..31f42fdbd8
--- /dev/null
+++ b/.ci/docker/build.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+IMAGE_NAME="$1"
+shift
+
+export UBUNTU_VERSION="20.04"
+
+export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}"
+echo "Building ${IMAGE_NAME} Docker image"
+
+docker build \
+  --no-cache \
+  --progress=plain \
+  -f Dockerfile \
+  --build-arg BASE_IMAGE="${BASE_IMAGE}" \
+  "$@" \
+  .
diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh
new file mode 100644
index 0000000000..03cb5cbafc
--- /dev/null
+++ b/.ci/docker/common/common_utils.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Work around bug where devtoolset replaces sudo and breaks it.
+as_ci_user() {
+  # NB: unsetting the environment variables works around a conda bug
+  # https://github.com/conda/conda/issues/6576
+  # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
+  # NB: This must be run from a directory that the user has access to,
+  # works around https://github.com/conda/conda-package-handling/pull/34
+  sudo -E -H env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" "$@"
+}
+
+conda_install() {
+  # Ensure that the install command don't upgrade/downgrade Python
+  # This should be called as
+  #   conda_install pkg1 pkg2 ... [-c channel]
+  as_ci_user conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" "$@"
+}
+
+conda_run() {
+  as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output "$@"
+}
+
+pip_install() {
+  as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off "$@"
+}
diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
new file mode 100644
index 0000000000..7fcb81ffea
--- /dev/null
+++ b/.ci/docker/common/install_base.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+
+set -ex
+
+install_ubuntu() {
+  # Install common dependencies
+  apt-get update
+  # TODO: Some of these may not be necessary
+  apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake=3.16* \
+    curl \
+    git \
+    wget \
+    sudo \
+    vim \
+    jq \
+    vim \
+    unzip \
+    gdb \
+    rsync \
+    libssl-dev \
+    p7zip-full \
+    libglfw3 \
+    libglfw3-dev \
+    sox \
+    libsox-dev \
+    libsox-fmt-all
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
new file mode 100644
index 0000000000..4ef67e0c18
--- /dev/null
+++ b/.ci/docker/common/install_conda.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+set -ex
+
+# Optionally install conda
+if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
+  BASE_URL="https://repo.anaconda.com/miniconda"
+
+  MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
+  MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)
+
+  CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
+
+  mkdir -p /opt/conda
+
+  source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+  pushd /tmp
+  wget -q "${BASE_URL}/${CONDA_FILE}"
+  # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
+  as_ci_user bash "${CONDA_FILE}" -b -f -p "/opt/conda"
+  popd
+
+  # NB: Don't do this, rely on the rpath to get it right
+  #echo "/opt/conda/lib" > /etc/ld.so.conf.d/conda-python.conf
+  #ldconfig
+  sed -e 's|PATH="\(.*\)"|PATH="/opt/conda/bin:\1"|g' -i /etc/environment
+  export PATH="/opt/conda/bin:$PATH"
+
+  # Ensure we run conda in a directory that the user has write access to
+  pushd /opt/conda
+
+  # Prevent conda from updating to 4.14.0, which causes docker build failures
+  # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d
+  # Uncomment the below when resolved to track the latest conda update
+  # as_ci_user conda update -y -n base conda
+
+  # Install correct Python version
+  as_ci_user conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION"
+
+  # Use conda cmake in some cases. Conda cmake will be newer than our supported
+  # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
+  # following builds that we know should use conda. Specifically, Ubuntu bionic
+  # and focal cannot find conda mkl with stock cmake, so we need a cmake from conda
+  conda_install cmake
+
+  # Install pip packages
+  pip_install -r /opt/conda/requirements.txt
+
+  apt-get update
+  apt-get -y install expect-dev
+
+  popd
+fi
diff --git a/.ci/docker/common/install_docs_reqs.sh b/.ci/docker/common/install_docs_reqs.sh
new file mode 100644
index 0000000000..541c9976ad
--- /dev/null
+++ b/.ci/docker/common/install_docs_reqs.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+set -ex
+
+apt-get update
+apt-get install -y gpg-agent
+
+curl --retry 3 -sL https://deb.nodesource.com/setup_20.x | sudo -E bash -
+sudo apt-get install -y nodejs
+
+curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
+echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
+
+apt-get update
+apt-get install -y --no-install-recommends yarn
+yarn global add katex --prefix /usr/local
+
+sudo apt-get -y install doxygen
+
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
new file mode 100644
index 0000000000..843362dd09
--- /dev/null
+++ b/.ci/docker/requirements.txt
@@ -0,0 +1,70 @@
+# --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version.
+# Refer to ./jenkins/build.sh for tutorial build instructions
+
+sphinx==5.0.0
+sphinx-gallery==0.11.1
+sphinx_design
+nbsphinx
+docutils==0.16
+sphinx-copybutton
+pypandoc==1.12
+pandocfilters
+markdown
+tqdm==4.66.1
+numpy==1.24.4
+matplotlib
+librosa
+torch
+torchvision
+torchtext
+torchdata
+networkx
+PyHamcrest
+bs4
+awscliv2==2.1.1
+flask
+spacy==3.4.1
+ray[tune]==2.7.2
+tensorboard
+jinja2==3.1.3
+pytorch-lightning
+torchx
+torchrl==0.3.0
+tensordict==0.3.0
+ax-platform
+nbformat>=5.9.2
+datasets
+transformers
+torchmultimodal-nightly # needs to be updated to stable as soon as it's available
+onnx
+onnxscript
+onnxruntime
+
+importlib-metadata==6.8.0
+
+# PyTorch Theme
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+
+ipython
+
+sphinxcontrib.katex
+# to run examples
+boto3
+pandas
+requests
+scikit-image
+scipy==1.11.1
+numba==0.57.1
+pillow==10.2.0
+wget
+gym==0.26.2
+gym-super-mario-bros==7.4.0
+pyopengl
+gymnasium[mujoco]==0.27.0
+timm
+iopath
+pygame==2.1.2
+pycocotools
+semilearn==0.3.2
+torchao==0.0.3
+segment_anything==1.0
diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml
index 809b9ad4bf..dd0022f636 100644
--- a/.github/workflows/build-tutorials.yml
+++ b/.github/workflows/build-tutorials.yml
@@ -33,9 +33,6 @@ jobs:
           - { shard: 15, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
       fail-fast: false
     runs-on: ${{ matrix.runner }}
-    env:
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9"
-      CUDA_VERSION: "9"
     steps:
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
@@ -54,27 +51,21 @@ jobs:
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
         uses: pytorch/test-infra/.github/actions/setup-nvidia@main
 
-      - name: Calculate docker image
-        shell: bash
-        id: docker-image
-        run: |
-          set -ex
-
-          # for some reason, pip installs it in a different place than what is looked at in the py file
-          pip3 install requests==2.26
-          pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py)
-
-          echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}"
+      - name: Calculate/build docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: tutorials
 
       - name: Pull docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: ${{ steps.docker-image.outputs.docker-image }}
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
       - name: Build
         shell: bash
         env:
-          DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }}
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
           NUM_WORKERS: ${{ matrix.num_shards }}
           WORKER_ID: ${{ matrix.shard }}
           COMMIT_ID: ${{ github.sha }}
@@ -95,16 +86,13 @@ jobs:
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --tty \
             --detach \
-            --user jenkins \
             --shm-size=2gb \
             --name="${container_name}" \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
+            -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \
+            -w /var/lib/workspace \
             "${DOCKER_IMAGE}"
           )
 
-          echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash
-
           docker exec -t "${container_name}" sh -c ".jenkins/build.sh"
 
       - name: Teardown Linux
@@ -116,9 +104,6 @@ jobs:
     needs: worker
     runs-on: [self-hosted, linux.2xlarge]
     environment: ${{ github.ref == 'refs/heads/main' && 'pytorchbot-env' || '' }}
-    env:
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9"
-      CUDA_VERSION: "9"
     steps:
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
@@ -134,27 +119,21 @@ jobs:
       - name: Setup Linux
         uses: pytorch/pytorch/.github/actions/setup-linux@main
 
-      - name: Calculate docker image
-        shell: bash
-        id: docker-image
-        run: |
-          set -ex
-
-          # for some reason, pip installs it in a different place than what is looked at in the py file
-          pip3 install requests==2.26
-          pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py)
-
-          echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}"
+      - name: Calculate/build docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: tutorials
 
       - name: Pull docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: ${{ steps.docker-image.outputs.docker-image }}
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
       - name: Build
         shell: bash
         env:
-          DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }}
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
           NUM_WORKERS: 15
           WORKER_ID: ${{ matrix.shard }}
           COMMIT_ID: ${{ github.sha }}
@@ -177,14 +156,11 @@ jobs:
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --tty \
             --detach \
-            --user jenkins \
             --name="${container_name}" \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
+            -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \
+            -w /var/lib/workspace \
             "${DOCKER_IMAGE}"
           )
 
-          echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash
-
           docker exec -t "${container_name}" sh -c ".jenkins/build.sh"
 
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
new file mode 100644
index 0000000000..6d75d1fc92
--- /dev/null
+++ b/.github/workflows/docker-build.yml
@@ -0,0 +1,59 @@
+name: Docker Build
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-build.yml
+  push:
+    branches:
+      - main
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-build.yml
+
+jobs:
+  docker-build:
+    runs-on: [self-hosted, linux.2xlarge]
+    timeout-minutes: 240
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - docker-image-name: tutorials
+    env:
+      DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tutorials/${{ matrix.docker-image-name }}
+    steps:
+      - name: Clean workspace
+        shell: bash
+        run: |
+          echo "${GITHUB_WORKSPACE}"
+          sudo rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Setup Linux
+        uses: pytorch/test-infra/.github/actions/setup-linux@main
+
+      - name: Build docker image
+        id: build-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ matrix.docker-image-name }}
+          push: true
+
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always()
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 14f29bc223..c7bbb5c250 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -11,9 +11,6 @@ export LANG=C.UTF-8
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 
 # Update root certificates by installing new libgnutls30
-sudo apt-get update || sudo apt-get install libgnutls30
-sudo apt-get update
-sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync
 
 # Install pandoc (does not install from pypi)
 sudo apt-get update
@@ -21,10 +18,6 @@ sudo apt-get install -y pandoc
 
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
-rm -rf src
-# NS: ghstack is not needed to build tutorials and right now it forces importlib to be downgraded to 3.X
-pip uninstall -y ghstack
-pip install --progress-bar off -r $DIR/../requirements.txt
 
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 843362dd09..0000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-# --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version.
-# Refer to ./jenkins/build.sh for tutorial build instructions
-
-sphinx==5.0.0
-sphinx-gallery==0.11.1
-sphinx_design
-nbsphinx
-docutils==0.16
-sphinx-copybutton
-pypandoc==1.12
-pandocfilters
-markdown
-tqdm==4.66.1
-numpy==1.24.4
-matplotlib
-librosa
-torch
-torchvision
-torchtext
-torchdata
-networkx
-PyHamcrest
-bs4
-awscliv2==2.1.1
-flask
-spacy==3.4.1
-ray[tune]==2.7.2
-tensorboard
-jinja2==3.1.3
-pytorch-lightning
-torchx
-torchrl==0.3.0
-tensordict==0.3.0
-ax-platform
-nbformat>==5.9.2
-datasets
-transformers
-torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
-onnx
-onnxscript
-onnxruntime
-
-importlib-metadata==6.8.0
-
-# PyTorch Theme
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-
-ipython
-
-sphinxcontrib.katex
-# to run examples
-boto3
-pandas
-requests
-scikit-image
-scipy==1.11.1
-numba==0.57.1
-pillow==10.2.0
-wget
-gym==0.26.2
-gym-super-mario-bros==7.4.0
-pyopengl
-gymnasium[mujoco]==0.27.0
-timm
-iopath
-pygame==2.1.2
-pycocotools
-semilearn==0.3.2
-torchao==0.0.3
-segment_anything==1.0
diff --git a/requirements.txt b/requirements.txt
new file mode 120000
index 0000000000..72b541c1eb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+.ci/docker/requirements.txt
\ No newline at end of file