From 6f5d560f4537a875cb186b5e83fa4afa75243c2c Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:49:11 +0000 Subject: [PATCH] build rocm wheels --- .github/workflows/rocm_ci.yml | 20 ++++++--- .github/workflows/rocm_wheels.yml | 67 ++++++++++++++++++++++++++++++ .github/workflows/wheels.yml | 43 +++++++++++++++---- .github/workflows/wheels_build.yml | 46 ++++++++++++++++---- setup.py | 13 ++++-- 5 files changed, 161 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/rocm_wheels.yml diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml index d498bea530..5a57b2b67b 100644 --- a/.github/workflows/rocm_ci.yml +++ b/.github/workflows/rocm_ci.yml @@ -1,20 +1,28 @@ -name: ROCM_CI +name: rocm-ci on: pull_request: types: [labeled, synchronize, reopened] + paths: + - "xformers/**" + - "!xformers/benchmarks/**" + - "!xformers/version.txt" + - ".github/workflows/gpu_test_gh*" + - "tests/**" + - "setup.py" + - "requirements*.txt" + - "third_party/**" workflow_dispatch: - inputs: - logLevel: - description: 'Log level' - required: true - default: 'warning' + push: + branches: + - develop schedule: - cron: "15 1 * * *" jobs: build: runs-on: self-hosted + if: github.repository == 'rocm/xformers' || github.event_name == 'pull_request' container: image: 'rocm/pytorch-nightly:latest' options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G ' diff --git a/.github/workflows/rocm_wheels.yml b/.github/workflows/rocm_wheels.yml new file mode 100644 index 0000000000..ec1f0cb0b1 --- /dev/null +++ b/.github/workflows/rocm_wheels.yml @@ -0,0 +1,67 @@ +name: rocm-wheels + +on: + push: + branches: + - develop + workflow_dispatch: + +jobs: + target_determinator: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + shell: python + run: | + import 
os + import json + environ = os.environ + + PY_VERSIONS = ['3.11'] + TORCH_VERSIONS = ['2.4.0'] + + include = [] + for os in ['ubuntu-alola']: + for python in PY_VERSIONS: + for torch_version in ['2.4.0']: + for toolkit_type, toolkit_short_versions in {'rocm': ["6.0", "6.1"]}.items(): + for toolkit_short_version in toolkit_short_versions: + include.append(dict( + os=os, + python=python, + torch_version=torch_version, + toolkit_type=toolkit_type, + toolkit_short_version=toolkit_short_version, + )) + print(include[-1]) + matrix = {'include': include} + print(json.dumps(matrix)) + with open(environ["GITHUB_OUTPUT"], "a") as fd: + fd.write("matrix="+json.dumps(matrix)) + + build: + needs: target_determinator + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.target_determinator.outputs.matrix) }} + + uses: ./.github/workflows/wheels_build.yml + if: github.repository == 'rocm/xformers' || github.event_name == 'pull_request' + with: + os: ${{ matrix.os }} + python: ${{ matrix.python }} + torch_version: ${{ matrix.torch_version }} + toolkit_type: ${{ matrix.toolkit_type }} + toolkit_short_version: ${{ matrix.toolkit_short_version }} + artifact_tag: ${{ github.run_id }} + + clean: + runs-on: self-hosted + if: ${{ always() }} + needs: [build] + steps: + - name: Remove dangling Docker images + run: | + docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index a40bccd13f..24c74e9383 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -36,14 +36,18 @@ jobs: for os in ['8-core-ubuntu', 'windows-8-core']: for python in PY_VERSIONS: for torch_version in ['2.3.0']: - for cuda_short_version in ["118", "121"]: - include.append(dict( - os=os, - python=python, - torch_version=torch_version, - cuda_short_version=cuda_short_version, - )) - print(include[-1]) + for toolkit_type, toolkit_short_versions in {'cuda': ["118", "121"], 'rocm': ["6.0", 
"6.1"]}.items(): + if os == 'windows-8-core' and toolkit_type == 'rocm': + continue + for toolkit_short_version in toolkit_short_versions: + include.append(dict( + os=os, + python=python, + torch_version=torch_version, + toolkit_type=toolkit_type, + toolkit_short_version=toolkit_short_version, + )) + print(include[-1]) matrix = {'include': include} print(json.dumps(matrix)) with open(environ["GITHUB_OUTPUT"], "a") as fd: @@ -60,7 +64,8 @@ jobs: os: ${{ matrix.os }} python: ${{ matrix.python }} torch_version: ${{ matrix.torch_version }} - cuda_short_version: ${{ matrix.cuda_short_version }} + toolkit_type: ${{ matrix.toolkit_type }} + toolkit_short_version: ${{ matrix.toolkit_short_version }} upload_pip: needs: build @@ -92,3 +97,23 @@ jobs: filter: "*torch2.3.0+cu121*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} + upload_pt_rocm6_0: + needs: build + uses: ./.github/workflows/wheels_upload_s3.yml + with: + aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role" + s3_path: s3://pytorch/whl/rocm6.0/ + aws_s3_cp_extra_args: --acl public-read + filter: "*torch2.4.0+rocm6.0*" + execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} + + upload_pt_rocm6_1: + needs: build + uses: ./.github/workflows/wheels_upload_s3.yml + with: + aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role" + s3_path: s3://pytorch/whl/rocm6.1/ + aws_s3_cp_extra_args: --acl public-read + filter: "*torch2.4.0+rocm6.1*" + execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} + diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 0b398822b3..bfc00ca62d 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -13,7 +13,11 @@ on: required: true type: string description: "Example: 1.13.1" - cuda_short_version: + toolkit_type: + required: true + type: string + description: "Example: 
cuda for cuda, rocm for rocm" + toolkit_short_version: required: true type: string description: "Example: 117 for 11.7" @@ -26,16 +30,20 @@ on: env: # you need at least cuda 5.0 for some of the stuff compiled here. - TORCH_CUDA_ARCH_LIST: "5.0+PTX 6.0 6.1 7.0 7.5 8.0+PTX" + TORCH_CUDA_ARCH_LIST: ${{ contains(inputs.toolkit_type, 'cuda') && join('6.0+PTX 7.0 7.5 8.0+PTX', fromJSON(inputs.toolkit_short_version) >= 118 && ' 9.0a' || '') || '' }} + HIP_ARCHITECTURES: ${{ contains(inputs.toolkit_type, 'rocm') && 'gfx90a gfx942' || '' }} MAX_JOBS: 4 DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc XFORMERS_BUILD_TYPE: "Release" TWINE_USERNAME: __token__ XFORMERS_PACKAGE_FROM: "wheel-${{ github.ref_name }}" + # https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default/ + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: "true" + PYTORCH_INDEX_URL: "https://download.pytorch.org/whl/${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}" jobs: build: - name: ${{ contains(inputs.os, 'ubuntu') && 'ubuntu' || 'win' }}-py${{ inputs.python }}-pt${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }} + name: ${{ contains(inputs.os, 'ubuntu') && 'ubuntu' || 'win' }}-py${{ inputs.python }}-pt${{ inputs.torch_version }}+${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }} runs-on: ${{ inputs.os }} env: # alias for the current python version @@ -54,7 +62,7 @@ jobs: import os import sys print(sys.version) - cushort = "${{ inputs.cuda_short_version }}" + cushort = "${{ inputs.toolkit_short_version }}" TORCH_CUDA_DEFAULT = "121" # pytorch 2.1.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { @@ -62,6 +70,9 @@ jobs: "118": ("11.8.0", 
"https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"), "117": ("11.7.1", "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run"), "116": ("11.6.2", "https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run"), + + "6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/7.9/amdgpu-install-6.0.60002-1.el7.noarch.rpm"), + "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.2/el/7/amdgpu-install-6.1.60102-1.el7.noarch.rpm"), }[cushort] with open(os.environ['GITHUB_OUTPUT'], "r+") as fp: fp.write("CUDA_VERSION=" + full_version + "\n") @@ -70,7 +81,7 @@ jobs: fp.write("TORCH_ORG_S3_PATH=s3://pytorch/whl\n") fp.write("PUBLISH_PYPI=1\n") else: - fp.write("CUDA_VERSION_SUFFIX=+cu" + cushort + "\n") + fp.write("CUDA_VERSION_SUFFIX=+" + ("cu" if "cuda" == "${{ inputs.toolkit_type }}" else "rocm") + cushort + "\n") fp.write("TORCH_ORG_S3_PATH=s3://pytorch/whl/" + cushort + "\n") fp.write("PUBLISH_PYPI=0\n") fp.write("CUDA_INSTALL_SCRIPT=" + install_script + "\n") @@ -80,6 +91,7 @@ jobs: - name: Add H100 if nvcc 11.08+ shell: python + if: contains(inputs.toolkit_type, 'cuda') run: | import os import sys @@ -140,10 +152,12 @@ jobs: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} python: ${{ inputs.python }} - - name: Install dependencies - run: $PY -m pip install wheel setuptools twine -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cuda_short_version }} - - if: runner.os == 'Linux' + name: (Linux) list installed packages + run: | + yum list installed + + - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'cuda') name: (Linux) install cuda run: > yum install wget git prename -y && @@ -151,6 +165,20 @@ jobs: sh ./cuda.run --silent --toolkit && rm ./cuda.run + - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'rocm') + name: (Linux) install rocm + 
run: | + yum install -y libzstd + yum install -y ${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }} + amdgpu-install -y --usecase=rocm --no-dkms + echo "ROCM_PATH=/opt/rocm" >> ${GITHUB_ENV} + echo "PATH=$PATH:/opt/rocm/bin" >> ${GITHUB_ENV} + echo "FORCE_ROCM=1" >> ${GITHUB_ENV} + echo "MAX_JOBS=96" >> ${GITHUB_ENV} + + - name: Install dependencies + run: $PY -m pip install wheel setuptools twine -r requirements.txt --extra-index-url $PYTORCH_INDEX_URL + - name: Build wheel run: | $PY setup.py bdist_wheel -d dist/ -k $PLAT_ARG @@ -160,6 +188,6 @@ jobs: - run: du -h dist/* - uses: actions/upload-artifact@v3 with: - name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }}_${{ inputs.artifact_tag }} + name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}_${{ inputs.artifact_tag }} path: dist/*.whl # Note: it might be helpful to have additional steps that test if the built wheels actually work diff --git a/setup.py b/setup.py index abadb4a17f..c08d31646f 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ import torch from torch.utils.cpp_extension import ( CUDA_HOME, + ROCM_HOME, BuildExtension, CppExtension, CUDAExtension, @@ -401,7 +402,10 @@ def get_extensions(): "--ptxas-options=-O2", "--ptxas-options=-allow-expensive-optimizations=true", ] - elif torch.cuda.is_available() and torch.version.hip: + elif ( + (torch.cuda.is_available() and torch.version.hip) or + (os.getenv("FORCE_ROCM", "0") == "1") + ): disable_hd256_hip_fmha = os.getenv("DISABLE_HD256_HIP_FMHA", "0") if disable_hd256_hip_fmha == "1": source_hip_maxk_256 = [] @@ -411,8 +415,7 @@ def get_extensions(): source_hip = list(set(source_hip) - set(source_hip_maxk_256)) rename_cpp_cu(source_hip) - rocm_home = os.getenv("ROCM_PATH") - hip_version = get_hip_version(rocm_home) + hip_version = get_hip_version(ROCM_HOME) 
source_hip_cu = [] for ff in source_hip: @@ -438,12 +441,14 @@ def get_extensions(): if use_rtn_bf16_convert == "1": cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0"] + arch_list = os.getenv("HIP_ARCHITECTURES", "native").split() + extra_compile_args = { "cxx": ["-O3", "-std=c++17"] + generator_flag, "nvcc": [ "-O3", "-std=c++17", - f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}", + *[f"--offload-arch={arch}" for arch in arch_list], "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", "-DCK_TILE_FMHA_FWD_FAST_EXP2=1",