diff --git a/.github/workflows/rocm_wheels.yml b/.github/workflows/rocm_wheels.yml
new file mode 100644
index 0000000000..9ec31fbf2d
--- /dev/null
+++ b/.github/workflows/rocm_wheels.yml
@@ -0,0 +1,77 @@
+name: rocm-wheels
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths:
+      - "packaging/compute_wheel_version.sh"
+      - ".github/workflows/wheel*"
+      - ".github/actions/setup-windows-runner/action.yml"
+      - "setup.py"
+      - "requirements*.txt"
+  workflow_dispatch:
+    inputs:
+      logLevel:
+        description: 'Log level'
+        required: false
+        default: 'warning'
+
+jobs:
+  target_determinator:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        shell: python
+        run: |
+          import os
+          import json
+          environ = os.environ
+
+          PY_VERSIONS = ['3.11']
+
+          include = []
+          for os in ['ubuntu-alola']:
+            for python in PY_VERSIONS:
+              for torch_version in ['2.4.0']:
+                for toolkit_type, toolkit_short_versions in {'rocm': ["6.1"]}.items():
+                  for toolkit_short_version in toolkit_short_versions:
+                    include.append(dict(
+                      os=os,
+                      python=python,
+                      torch_version=torch_version,
+                      toolkit_type=toolkit_type,
+                      toolkit_short_version=toolkit_short_version,
+                    ))
+                    print(include[-1])
+          matrix = {'include': include}
+          print(json.dumps(matrix))
+          with open(environ["GITHUB_OUTPUT"], "a") as fd:
+            fd.write("matrix="+json.dumps(matrix))
+
+  build:
+    needs: target_determinator
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.target_determinator.outputs.matrix) }}
+
+    uses: ./.github/workflows/wheels_build.yml
+    if: github.repository == 'rocm/xformers' || github.event_name == 'pull_request'
+    with:
+      os: ${{ matrix.os }}
+      python: ${{ matrix.python }}
+      torch_version: ${{ matrix.torch_version }}
+      toolkit_type: ${{ matrix.toolkit_type }}
+      toolkit_short_version: ${{ matrix.toolkit_short_version }}
+
+  clean:
+    runs-on: self-hosted
+    if: ${{ always() }}
+    needs: [build]
+    steps:
+      - name: Remove dangling Docker images
+        run: |
+          docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a40bccd13f..9c29347883 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -35,13 +35,17 @@
           include = []
           for os in ['8-core-ubuntu', 'windows-8-core']:
             for python in PY_VERSIONS:
-              for torch_version in ['2.3.0']:
-                for cuda_short_version in ["118", "121"]:
-                  include.append(dict(
-                    os=os,
-                    python=python,
-                    torch_version=torch_version,
-                    cuda_short_version=cuda_short_version,
-                  ))
-                  print(include[-1])
+              for torch_version in ['2.4.0']:
+                for toolkit_type, toolkit_short_versions in {'cuda': ["118", "121"], 'rocm': ["6.0", "6.1"]}.items():
+                  for toolkit_short_version in toolkit_short_versions:
+                    if os == 'windows-8-core' and toolkit_type == 'rocm':
+                      continue
+                    include.append(dict(
+                      os=os,
+                      python=python,
+                      torch_version=torch_version,
+                      toolkit_type=toolkit_type,
+                      toolkit_short_version=toolkit_short_version,
+                    ))
+                    print(include[-1])
           matrix = {'include': include}
@@ -60,7 +64,8 @@
       os: ${{ matrix.os }}
       python: ${{ matrix.python }}
       torch_version: ${{ matrix.torch_version }}
-      cuda_short_version: ${{ matrix.cuda_short_version }}
+      toolkit_type: ${{ matrix.toolkit_type }}
+      toolkit_short_version: ${{ matrix.toolkit_short_version }}
 
   upload_pip:
     needs: build
@@ -92,3 +97,23 @@
       filter: "*torch2.3.0+cu121*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
 
+  upload_pt_rocm6_0:
+    needs: build
+    uses: ./.github/workflows/wheels_upload_s3.yml
+    with:
+      aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role"
+      s3_path: s3://pytorch/whl/rocm6.0/
+      aws_s3_cp_extra_args: --acl public-read
+      filter: "*torch2.4.0+rocm6.0*"
+      execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
+
+  upload_pt_rocm6_1:
+    needs: build
+    uses: ./.github/workflows/wheels_upload_s3.yml
+    with:
+      aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role"
+      s3_path: s3://pytorch/whl/rocm6.1/
+      aws_s3_cp_extra_args: --acl public-read
+      filter: "*torch2.4.0+rocm6.1*"
+      execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
+
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 0b398822b3..7f61b652f0 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -13,7 +13,11 @@
         required: true
         type: string
         description: "Example: 1.13.1"
-      cuda_short_version:
+      toolkit_type:
+        required: true
+        type: string
+        description: "Example: cuda for cuda, rocm for rocm"
+      toolkit_short_version:
         required: true
         type: string
         description: "Example: 117 for 11.7"
@@ -26,16 +30,20 @@
 
 env:
   # you need at least cuda 5.0 for some of the stuff compiled here.
-  TORCH_CUDA_ARCH_LIST: "5.0+PTX 6.0 6.1 7.0 7.5 8.0+PTX"
+  TORCH_CUDA_ARCH_LIST: ${{ contains(inputs.toolkit_type, 'cuda') && join('6.0+PTX 7.0 7.5 8.0+PTX', fromJSON(inputs.toolkit_short_version) >= 118 && ' 9.0a' || '') || '' }}
+  HIP_ARCHITECTURES: ${{ contains(inputs.toolkit_type, 'rocm') && 'gfx90a gfx942' || '' }}
   MAX_JOBS: 4
   DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc
   XFORMERS_BUILD_TYPE: "Release"
   TWINE_USERNAME: __token__
   XFORMERS_PACKAGE_FROM: "wheel-${{ github.ref_name }}"
+  # https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default/
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: "true"
+  PYTORCH_INDEX_URL: "https://download.pytorch.org/whl/${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}"
 
 jobs:
   build:
-    name: ${{ contains(inputs.os, 'ubuntu') && 'ubuntu' || 'win' }}-py${{ inputs.python }}-pt${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }}
+    name: ${{ contains(inputs.os, 'ubuntu') && 'ubuntu' || 'win' }}-py${{ inputs.python }}-pt${{ inputs.torch_version }}+${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}
     runs-on: ${{ inputs.os }}
     env:
       # alias for the current python version
@@ -54,7 +62,7 @@
           import os
           import sys
           print(sys.version)
-          cushort = "${{ inputs.cuda_short_version }}"
+          cushort = "${{ inputs.toolkit_short_version }}"
           TORCH_CUDA_DEFAULT = "121"  # pytorch 2.1.0
           # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
           full_version, install_script = {
@@ -62,6 +70,9 @@
             "118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"),
             "117": ("11.7.1", "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run"),
             "116": ("11.6.2", "https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run"),
+
+            "6.0": ("6.0.3", "https://repo.radeon.com/amdgpu-install/6.0.3/rhel/7/amdgpu-install-6.0.60003-1.el7.noarch.rpm"),
+            "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.2/el/7/amdgpu-install-6.1.60102-1.el7.noarch.rpm"),
           }[cushort]
           with open(os.environ['GITHUB_OUTPUT'], "r+") as fp:
             fp.write("CUDA_VERSION=" + full_version + "\n")
@@ -70,7 +81,7 @@
fp.write("TORCH_ORG_S3_PATH=s3://pytorch/whl\n") fp.write("PUBLISH_PYPI=1\n") else: - fp.write("CUDA_VERSION_SUFFIX=+cu" + cushort + "\n") + fp.write("CUDA_VERSION_SUFFIX=+" + ("cu" if "cuda" == "${{ inputs.toolkit_type }}" else "rocm") + cushort + "\n") fp.write("TORCH_ORG_S3_PATH=s3://pytorch/whl/" + cushort + "\n") fp.write("PUBLISH_PYPI=0\n") fp.write("CUDA_INSTALL_SCRIPT=" + install_script + "\n") @@ -80,6 +91,7 @@ jobs: - name: Add H100 if nvcc 11.08+ shell: python + if: contains(inputs.toolkit_type, 'cuda') run: | import os import sys @@ -140,10 +152,12 @@ jobs: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} python: ${{ inputs.python }} - - name: Install dependencies - run: $PY -m pip install wheel setuptools twine -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cuda_short_version }} - - if: runner.os == 'Linux' + name: (Linux) list installed packages + run: | + yum list installed + + - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'cuda') name: (Linux) install cuda run: > yum install wget git prename -y && @@ -151,6 +165,18 @@ jobs: sh ./cuda.run --silent --toolkit && rm ./cuda.run + - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'rocm') + name: (Linux) install rocm + run: | + yum install -y libzstd + yum install -y ${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }} + amdgpu-install -y --usecase=rocm --no-dkms + echo "ROCM_HOME=/opt/rocm" >> ${GITHUB_ENV} + echo "PATH=$PATH:$ROCM_HOME/bin" >> ${GITHUB_ENV} + + - name: Install dependencies + run: $PY -m pip install wheel setuptools twine -r requirements.txt --extra-index-url $PYTORCH_INDEX_URL + - name: Build wheel run: | $PY setup.py bdist_wheel -d dist/ -k $PLAT_ARG @@ -160,6 +186,6 @@ jobs: - run: du -h dist/* - uses: actions/upload-artifact@v3 with: - name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }}_${{ inputs.artifact_tag }} + name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}_${{ inputs.artifact_tag }} path: dist/*.whl # Note: it might be helpful to have additional steps that test if the built wheels actually work diff --git a/setup.py b/setup.py index f648706e2b..f61b41e40a 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ import torch from torch.utils.cpp_extension import ( CUDA_HOME, + ROCM_HOME, BuildExtension, CppExtension, CUDAExtension, @@ -411,8 +412,7 @@ def get_extensions(): source_hip = list(set(source_hip) - set(source_hip_maxk_256)) rename_cpp_cu(source_hip) - rocm_home = os.getenv("ROCM_PATH") - hip_version = get_hip_version(rocm_home) + hip_version = get_hip_version(ROCM_HOME) source_hip_cu = [] for ff in source_hip: @@ -438,12 +438,14 @@ def get_extensions(): if use_rtn_bf16_convert == "1": cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0"] + arch_list = os.getenv("HIP_ARCHITECTURES", "native").split() + extra_compile_args = { "cxx": ["-O3", "-std=c++17"] + generator_flag, "nvcc": [ "-O3", "-std=c++17", - f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}", + *[f"--offload-arch={arch}" for arch in arch_list], "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", "-DCK_TILE_FMHA_FWD_FAST_EXP2=1",