From 21c7f1e3a66ae4d0be7c1248028b17c1b9e0f081 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 30 Oct 2024 14:57:50 +0000 Subject: [PATCH 1/8] Fix regular expression wildcard. --- .github/actions/workflow-build/build-workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index 58b818ffe30..84b2fb927f2 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -710,7 +710,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): # Natural sort impl (handles embedded numbers in strings, case insensitive) def natural_sort_key(key): - return [(int(text) if text.isdigit() else text.lower()) for text in re.split('(\d+)', key)] + return [(int(text) if text.isdigit() else text.lower()) for text in re.split('(\\d+)', key)] # Sort the dispatch groups by name: workflow_dispatch_groups = dict(sorted(workflow_dispatch_groups.items(), key=lambda x: natural_sort_key(x[0]))) From 21f965c13a662d9290ffdda1a2babb6feecd6bab Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 30 Oct 2024 15:22:44 +0000 Subject: [PATCH 2/8] Update build-workflow.py to be aware of cuda_ext jobs. --- .github/actions/workflow-build/build-workflow.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index 84b2fb927f2..dd09ac5685d 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -269,6 +269,8 @@ def get_job_type_info(job): result['name'] = job.capitalize() if not 'gpu' in result: result['gpu'] = False + if not 'cuda_ext' in result: + result['cuda_ext'] = False if not 'needs' in result: result['needs'] = None if not 'invoke' in result: @@ -413,13 +415,16 @@ def generate_dispatch_job_image(matrix_job, job_type): ctk = matrix_job['ctk'] host_compiler = generate_dispatch_job_host_compiler(matrix_job, job_type) + job_info = get_job_type_info(job_type) + ctk_suffix = "ext" if job_info['cuda_ext'] else "" + if is_windows(matrix_job): - return f"rapidsai/devcontainers:{devcontainer_version}-cuda{ctk}-{host_compiler}" + return f"rapidsai/devcontainers:{devcontainer_version}-cuda{ctk}{ctk_suffix}-{host_compiler}" if is_nvhpc(matrix_job): return f"rapidsai/devcontainers:{devcontainer_version}-cpp-{host_compiler}" - return f"rapidsai/devcontainers:{devcontainer_version}-cpp-{host_compiler}-cuda{ctk}" + return f"rapidsai/devcontainers:{devcontainer_version}-cpp-{host_compiler}-cuda{ctk}{ctk_suffix}" def generate_dispatch_job_command(matrix_job, job_type): From 4eb152ceee0f1c9622cbc21c4ff21d3afd246a14 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 30 Oct 2024 14:57:34 +0000 Subject: [PATCH 3/8] Update devcontainer infra to build cuda_ext DCs. --- .devcontainer/cuda11.1-gcc6/devcontainer.json | 3 +- .devcontainer/cuda11.1-gcc7/devcontainer.json | 3 +- .devcontainer/cuda11.1-gcc8/devcontainer.json | 3 +- .devcontainer/cuda11.1-gcc9/devcontainer.json | 3 +- .../cuda11.1-llvm9/devcontainer.json | 3 +- .../cuda11.8-gcc11/devcontainer.json | 3 +- .../cuda12.0-gcc10/devcontainer.json | 3 +- .../cuda12.0-gcc11/devcontainer.json | 3 +- .../cuda12.0-gcc12/devcontainer.json | 3 +- .devcontainer/cuda12.0-gcc9/devcontainer.json | 3 +- .../cuda12.0-llvm10/devcontainer.json | 3 +- .../cuda12.0-llvm11/devcontainer.json | 3 +- .../cuda12.0-llvm12/devcontainer.json | 3 +- .../cuda12.0-llvm13/devcontainer.json | 3 +- .../cuda12.0-llvm14/devcontainer.json | 3 +- .../cuda12.0-llvm9/devcontainer.json | 3 +- .../cuda12.5-nvhpc24.7/devcontainer.json | 3 +- .../cuda12.6-gcc10/devcontainer.json | 3 +- .../cuda12.6-gcc11/devcontainer.json | 3 +- .../cuda12.6-gcc12/devcontainer.json | 3 +- .../cuda12.6-gcc13/devcontainer.json | 3 +- .devcontainer/cuda12.6-gcc7/devcontainer.json | 3 +- .devcontainer/cuda12.6-gcc8/devcontainer.json | 3 +- .devcontainer/cuda12.6-gcc9/devcontainer.json | 3 +- .../cuda12.6-llvm10/devcontainer.json | 3 +- .../cuda12.6-llvm11/devcontainer.json | 3 +- .../cuda12.6-llvm12/devcontainer.json | 3 +- .../cuda12.6-llvm13/devcontainer.json | 3 +- .../cuda12.6-llvm14/devcontainer.json | 3 +- .../cuda12.6-llvm15/devcontainer.json | 3 +- .../cuda12.6-llvm16/devcontainer.json | 3 +- .../cuda12.6-llvm17/devcontainer.json | 3 +- .../cuda12.6-llvm18/devcontainer.json | 3 +- .../cuda12.6-llvm9/devcontainer.json | 3 +- .../cuda12.6-oneapi2023.2.0/devcontainer.json | 3 +- .devcontainer/devcontainer.json | 3 +- .devcontainer/launch.sh | 12 +++- .devcontainer/make_devcontainers.sh | 56 +++++++++++++------ .../actions/workflow-build/build-workflow.py | 12 +++- .../actions/workflow-run-job-linux/action.yml | 7 +++ ci/build_common.sh | 1 + 41 files changed, 140 insertions(+), 56 deletions(-) diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index d6f9e0edb0e..ea4ed84efa2 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "6", - "CCCL_BUILD_INFIX": "cuda11.1-gcc6" + "CCCL_BUILD_INFIX": "cuda11.1-gcc6", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index 49cf446835f..6ab67f6e23a 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda11.1-gcc7" + "CCCL_BUILD_INFIX": "cuda11.1-gcc7", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index 94151b34e96..2493e892f3f 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda11.1-gcc8" + "CCCL_BUILD_INFIX": "cuda11.1-gcc8", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index 960042c859f..1bf49147573 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-gcc9" + "CCCL_BUILD_INFIX": "cuda11.1-gcc9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index f3a5ce64a90..907658e37c7 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-llvm9" + "CCCL_BUILD_INFIX": "cuda11.1-llvm9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index a7ec1756b9d..2f2088df7b7 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "11.8", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda11.8-gcc11" + "CCCL_BUILD_INFIX": "cuda11.8-gcc11", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index e2d1b3f85c4..6c875bbd070 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.0-gcc10" + "CCCL_BUILD_INFIX": "cuda12.0-gcc10", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json index bd745fe7e26..59ce4842331 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.0-gcc11" + "CCCL_BUILD_INFIX": "cuda12.0-gcc11", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index eb489e97b13..e38ff48bfe9 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.0-gcc12" + "CCCL_BUILD_INFIX": "cuda12.0-gcc12", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index 1bf2c88c76b..965f0737fea 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.0-gcc9" + "CCCL_BUILD_INFIX": "cuda12.0-gcc9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index b021ed08bdc..4c5f0b7b65b 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.0-llvm10" + "CCCL_BUILD_INFIX": "cuda12.0-llvm10", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index 6a4c9705b28..075223b7a16 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.0-llvm11" + "CCCL_BUILD_INFIX": "cuda12.0-llvm11", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index e052215fb27..f3808af0883 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.0-llvm12" + "CCCL_BUILD_INFIX": "cuda12.0-llvm12", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index 013a6cbad33..a52662083dd 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.0-llvm13" + "CCCL_BUILD_INFIX": "cuda12.0-llvm13", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index a8def359e21..56191fc9417 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "14", - "CCCL_BUILD_INFIX": "cuda12.0-llvm14" + "CCCL_BUILD_INFIX": "cuda12.0-llvm14", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index 10d506db826..500064a9b8b 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.0-llvm9" + "CCCL_BUILD_INFIX": "cuda12.0-llvm9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.5-nvhpc24.7/devcontainer.json b/.devcontainer/cuda12.5-nvhpc24.7/devcontainer.json index 22502249765..a6cd7c9a22a 100644 --- a/.devcontainer/cuda12.5-nvhpc24.7/devcontainer.json +++ b/.devcontainer/cuda12.5-nvhpc24.7/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.5", "CCCL_HOST_COMPILER": "nvhpc", "CCCL_HOST_COMPILER_VERSION": "24.7", - "CCCL_BUILD_INFIX": "cuda12.5-nvhpc24.7" + "CCCL_BUILD_INFIX": "cuda12.5-nvhpc24.7", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc10/devcontainer.json b/.devcontainer/cuda12.6-gcc10/devcontainer.json index 0175d38bb81..2a5ce3b59fd 100644 --- a/.devcontainer/cuda12.6-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc10/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.6-gcc10" + "CCCL_BUILD_INFIX": "cuda12.6-gcc10", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc11/devcontainer.json b/.devcontainer/cuda12.6-gcc11/devcontainer.json index 8064e66e340..46ff02a1282 100644 --- a/.devcontainer/cuda12.6-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc11/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.6-gcc11" + "CCCL_BUILD_INFIX": "cuda12.6-gcc11", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc12/devcontainer.json index 819e665da39..8291cc423ba 100644 --- a/.devcontainer/cuda12.6-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc12/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.6-gcc12" + "CCCL_BUILD_INFIX": "cuda12.6-gcc12", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc13/devcontainer.json b/.devcontainer/cuda12.6-gcc13/devcontainer.json index 28ba1a24099..6066c63518c 100644 --- a/.devcontainer/cuda12.6-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc13/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.6-gcc13" + "CCCL_BUILD_INFIX": "cuda12.6-gcc13", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc7/devcontainer.json b/.devcontainer/cuda12.6-gcc7/devcontainer.json index 0c6a46624cc..522f3127a5d 100644 --- a/.devcontainer/cuda12.6-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc7/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda12.6-gcc7" + "CCCL_BUILD_INFIX": "cuda12.6-gcc7", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc8/devcontainer.json b/.devcontainer/cuda12.6-gcc8/devcontainer.json index 050a27227bd..046ab36bb38 100644 --- a/.devcontainer/cuda12.6-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc8/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda12.6-gcc8" + "CCCL_BUILD_INFIX": "cuda12.6-gcc8", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-gcc9/devcontainer.json b/.devcontainer/cuda12.6-gcc9/devcontainer.json index d162790a6c6..73900438829 100644 --- a/.devcontainer/cuda12.6-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.6-gcc9" + "CCCL_BUILD_INFIX": "cuda12.6-gcc9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json index 92b6d2ae709..e59f34e3132 100644 --- a/.devcontainer/cuda12.6-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm10/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.6-llvm10" + "CCCL_BUILD_INFIX": "cuda12.6-llvm10", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json index fd07354149c..4473bb79e82 100644 --- a/.devcontainer/cuda12.6-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm11/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.6-llvm11" + "CCCL_BUILD_INFIX": "cuda12.6-llvm11", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json index 1a35c4ef2c8..3fc6b35d1ae 100644 --- a/.devcontainer/cuda12.6-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm12/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.6-llvm12" + "CCCL_BUILD_INFIX": "cuda12.6-llvm12", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json index a825b77a030..ad8ec71e9ba 100644 --- a/.devcontainer/cuda12.6-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm13/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.6-llvm13" + "CCCL_BUILD_INFIX": "cuda12.6-llvm13", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm14/devcontainer.json b/.devcontainer/cuda12.6-llvm14/devcontainer.json index 5c5de81c491..9184ec66f62 100644 --- a/.devcontainer/cuda12.6-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm14/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "14", - "CCCL_BUILD_INFIX": "cuda12.6-llvm14" + "CCCL_BUILD_INFIX": "cuda12.6-llvm14", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm15/devcontainer.json b/.devcontainer/cuda12.6-llvm15/devcontainer.json index cbd7ac51bc5..0e8552e4297 100644 --- a/.devcontainer/cuda12.6-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm15/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "15", - "CCCL_BUILD_INFIX": "cuda12.6-llvm15" + "CCCL_BUILD_INFIX": "cuda12.6-llvm15", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm16/devcontainer.json b/.devcontainer/cuda12.6-llvm16/devcontainer.json index 3601c4b8000..66339b7a309 100644 --- a/.devcontainer/cuda12.6-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm16/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "16", - "CCCL_BUILD_INFIX": "cuda12.6-llvm16" + "CCCL_BUILD_INFIX": "cuda12.6-llvm16", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm17/devcontainer.json index e5d5d2b28fa..026d75fd120 100644 --- a/.devcontainer/cuda12.6-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm17/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "17", - "CCCL_BUILD_INFIX": "cuda12.6-llvm17" + "CCCL_BUILD_INFIX": "cuda12.6-llvm17", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm18/devcontainer.json b/.devcontainer/cuda12.6-llvm18/devcontainer.json index f470fe47215..23097840005 100644 --- a/.devcontainer/cuda12.6-llvm18/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm18/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "18", - "CCCL_BUILD_INFIX": "cuda12.6-llvm18" + "CCCL_BUILD_INFIX": "cuda12.6-llvm18", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json index ab9911c0a4b..7fc3f2a197f 100644 --- a/.devcontainer/cuda12.6-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm9/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.6-llvm9" + "CCCL_BUILD_INFIX": "cuda12.6-llvm9", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json index 1e074116f40..3e8df8501af 100644 --- a/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "oneapi", "CCCL_HOST_COMPILER_VERSION": "2023.2.0", - "CCCL_BUILD_INFIX": "cuda12.6-oneapi2023.2.0" + "CCCL_BUILD_INFIX": "cuda12.6-oneapi2023.2.0", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 28ba1a24099..6066c63518c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -19,7 +19,8 @@ "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.6-gcc13" + "CCCL_BUILD_INFIX": "cuda12.6-gcc13", + "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh index d67e1f73993..cb3aecc21fe 100755 --- a/.devcontainer/launch.sh +++ b/.devcontainer/launch.sh @@ -12,6 +12,7 @@ print_help() { echo "" echo "Options:" echo " -c, --cuda Specify the CUDA version. E.g., 12.2" + echo " --cuda-ext Use a docker image with extended CTK libraries." echo " -H, --host Specify the host compiler. E.g., gcc12" echo " -d, --docker Launch the development environment in Docker directly without using VSCode." echo " --gpus gpu-request GPU devices to add to the container ('all' to pass all GPUs)." @@ -46,7 +47,7 @@ parse_options() { set -- "${@:1:$#-1}"; local OPTIONS=c:e:H:dhv: - local LONG_OPTIONS=cuda:,env:,host:,gpus:,volume:,docker,help + local LONG_OPTIONS=cuda:,cuda-ext,env:,host:,gpus:,volume:,docker,help # shellcheck disable=SC2155 local PARSED_OPTIONS="$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@")" @@ -63,6 +64,10 @@ parse_options() { cuda_version="$2" shift 2 ;; + --cuda-ext) + cuda_ext=true + shift + ;; -e|--env) env_vars+=("$1" "$2") shift 2 @@ -288,7 +293,10 @@ main() { if [[ -z ${cuda_version:-} ]] && [[ -z ${host_compiler:-} ]]; then path=".devcontainer" else - path=".devcontainer/cuda${cuda_version}-${host_compiler}" + if ${cuda_ext:-false}; then + cuda_suffix="ext" + fi + path=".devcontainer/cuda${cuda_version}${cuda_suffix:-}-${host_compiler}" if [[ ! -f "${path}/devcontainer.json" ]]; then echo "Unknown CUDA [${cuda_version}] compiler [${host_compiler}] combination" echo "Requested devcontainer ${path}/devcontainer.json does not exist" diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh index 24c2df89ca4..773ef06eb6d 100755 --- a/.devcontainer/make_devcontainers.sh +++ b/.devcontainer/make_devcontainers.sh @@ -25,35 +25,55 @@ update_devcontainer() { local output_file="$2" local name="$3" local cuda_version="$4" - local compiler_name="$5" - local compiler_exe="$6" - local compiler_version="$7" - local devcontainer_version="$8" + local cuda_ext="$5" + local compiler_name="$6" + local compiler_exe="$7" + local compiler_version="$8" + local devcontainer_version="$9" + + local cuda_suffix="" + if $cuda_ext; then + local cuda_suffix="ext" + fi # NVHPC SDK comes with its own bundeled toolkit - local toolkit_name="-cuda${cuda_version}" + local toolkit_name="-cuda${cuda_version}${cuda_suffix}" if [ $compiler_name == "nvhpc" ]; then toolkit_name="" fi local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-" local image="${IMAGE_ROOT}${compiler_name}${compiler_version}${toolkit_name}" - jq --arg image "$image" --arg name "$name" \ - --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \ - --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" \ - '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name | + jq --arg image "$image" \ + --arg name "$name" \ + --arg cuda_version "$cuda_version" \ + --arg cuda_ext "$cuda_ext" \ + --arg compiler_name "$compiler_name" \ + --arg compiler_exe "$compiler_exe" \ + --arg compiler_version "$compiler_version" \ + '.image = $image | + .name = $name | + .containerEnv.DEVCONTAINER_NAME = $name | .containerEnv.CCCL_BUILD_INFIX = $name | - .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name | + .containerEnv.CCCL_CUDA_VERSION = $cuda_version | + .containerEnv.CCCL_CUDA_EXTENDED = $cuda_ext | + .containerEnv.CCCL_HOST_COMPILER = $compiler_name | .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\ "$input_file" > "$output_file" } make_name() { local cuda_version="$1" - local compiler_name="$2" - local compiler_version="$3" + local cuda_ext="$2" + local compiler_name="$3" + local compiler_version="$4" + + local cuda_suffix="" + if $cuda_ext; then + local cuda_suffix="ext" + fi - echo "cuda$cuda_version-$compiler_name$compiler_version" + echo "cuda${cuda_version}${cuda_suffix}-${compiler_name}${compiler_version}" } CLEAN=false @@ -104,12 +124,13 @@ readonly combinations=$(echo "$matrix_json" | jq -c '.combinations[]') readonly base_devcontainer_file="./devcontainer.json" readonly NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]') readonly DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda') +readonly DEFAULT_CUDA_EXT=false readonly DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name') readonly DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe') readonly DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version') -readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION") +readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION") -update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" +update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" mv "./temp_devcontainer.json" ${base_devcontainer_file} # Create an array to keep track of valid subdirectory names @@ -126,15 +147,16 @@ done # For each unique combination for combination in $combinations; do cuda_version=$(echo "$combination" | jq -r '.cuda') + cuda_ext=$(echo "$combination" | jq -r '.cuda_ext') compiler_name=$(echo "$combination" | jq -r '.compiler_name') compiler_exe=$(echo "$combination" | jq -r '.compiler_exe') compiler_version=$(echo "$combination" | jq -r '.compiler_version') - name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version") + name=$(make_name "$cuda_version" "$cuda_ext" "$compiler_name" "$compiler_version") mkdir -p "$name" new_devcontainer_file="$name/devcontainer.json" - update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$DEVCONTAINER_VERSION" + update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$cuda_ext" "$compiler_name" "$compiler_exe" "$compiler_version" "$DEVCONTAINER_VERSION" echo "Created $new_devcontainer_file" # Add the subdirectory name to the valid_subdirs array diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index dd09ac5685d..60269699076 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -1093,8 +1093,18 @@ def print_devcontainer_info(args): for workflow_name in workflow_names: matrix_jobs.extend(parse_workflow_matrix_jobs(args, workflow_name)) + # Check if the extended cuda images are needed: + for matrix_job in matrix_jobs: + cuda_ext = False + for job in matrix_job['jobs']: + job_info = get_job_type_info(job) + if job_info['cuda_ext']: + cuda_ext = True + break + matrix_job['cuda_ext'] = cuda_ext + # Remove all but the following keys from the matrix jobs: - keep_keys = ['ctk', 'cxx'] + keep_keys = ['ctk', 'cxx', 'cuda_ext'] combinations = [{key: job[key] for key in keep_keys} for job in matrix_jobs] # Remove duplicates and filter out windows jobs: diff --git a/.github/actions/workflow-run-job-linux/action.yml b/.github/actions/workflow-run-job-linux/action.yml index 53bcb6c38e5..bc6704ba497 100644 --- a/.github/actions/workflow-run-job-linux/action.yml +++ b/.github/actions/workflow-run-job-linux/action.yml @@ -68,6 +68,7 @@ runs: env: CI: true RUNNER: "${{inputs.runner}}" + IMAGE: "${{inputs.image}}" # Dereferencing the command from an env var instead of a GHA input avoids issues with escaping # semicolons and other special characters (e.g. `-arch "60;70;80"`). COMMAND: "${{inputs.command}}" @@ -139,12 +140,18 @@ runs: sed "s@/__w@$(dirname "$(dirname "${{github.workspace}}")")@" <<< "$1" } + # If the image contains "cudaXX.Yext"... + if [[ "${IMAGE}" =~ cuda[0-9.]+ext ]]; then + cuda_ext_request="--cuda-ext" + fi + # Launch this container using the host's docker daemon set -x ${{github.event.repository.name}}/.devcontainer/launch.sh \ --docker \ --cuda ${{inputs.cuda}} \ --host ${{inputs.host}} \ + "${cuda_ext_request:-}" \ "${gpu_request[@]}" \ --env "CI=$CI" \ --env "AWS_ROLE_ARN=" \ diff --git a/ci/build_common.sh b/ci/build_common.sh index 0ce0de34ed6..277e2a52b93 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -126,6 +126,7 @@ print_environment_details() { NVCC_VERSION \ CMAKE_BUILD_PARALLEL_LEVEL \ CTEST_PARALLEL_LEVEL \ + CCCL_CUDA_EXTENDED \ CCCL_BUILD_INFIX \ GLOBAL_CMAKE_OPTIONS \ TBB_ROOT From 30a3f80c9aff7c44ed26d02579107b507a08723d Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 30 Oct 2024 15:30:02 +0000 Subject: [PATCH 4/8] Always create an extended version of the default devcontainer. --- .../cuda12.6ext-gcc13/devcontainer.json | 54 +++++++++++++++++++ .devcontainer/make_devcontainers.sh | 8 ++- 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/cuda12.6ext-gcc13/devcontainer.json diff --git a/.devcontainer/cuda12.6ext-gcc13/devcontainer.json b/.devcontainer/cuda12.6ext-gcc13/devcontainer.json new file mode 100644 index 00000000000..e8cb406b3d5 --- /dev/null +++ b/.devcontainer/cuda12.6ext-gcc13/devcontainer.json @@ -0,0 +1,54 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.12-cpp-gcc13-cuda12.6ext", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6ext-gcc13", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "13", + "CCCL_BUILD_INFIX": "cuda12.6ext-gcc13", + "CCCL_CUDA_EXTENDED": "true" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6ext-gcc13" +} diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh index 773ef06eb6d..e7f95d59146 100755 --- a/.devcontainer/make_devcontainers.sh +++ b/.devcontainer/make_devcontainers.sh @@ -133,8 +133,14 @@ readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_ update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" mv "./temp_devcontainer.json" ${base_devcontainer_file} +# Always create an extended version of the default devcontainer: +readonly EXT_NAME=$(make_name "$DEFAULT_CUDA" true "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION") +update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$EXT_NAME" "$DEFAULT_CUDA" true "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" +mkdir -p "$EXT_NAME" +mv "./temp_devcontainer.json" "$EXT_NAME/devcontainer.json" + # Create an array to keep track of valid subdirectory names -valid_subdirs=() +valid_subdirs=("$EXT_NAME") # The img folder should not be removed: valid_subdirs+=("img") From a2d6bb4002d31d45a53e0e95c90c683fffd9a237 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 30 Oct 2024 15:22:56 +0000 Subject: [PATCH 5/8] Use extended devcontainers for cudax testing. [skip-vdc][skip-rapids][skip-docs] --- .../cuda12.0ext-gcc12/devcontainer.json | 54 +++++++++++++++++++ .../cuda12.0ext-llvm14/devcontainer.json | 54 +++++++++++++++++++ .../cuda12.6ext-gcc12/devcontainer.json | 54 +++++++++++++++++++ .../cuda12.6ext-llvm18/devcontainer.json | 54 +++++++++++++++++++ ci/build_cudax.sh | 6 +++ ci/matrix.yaml | 19 +++++-- 6 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 .devcontainer/cuda12.0ext-gcc12/devcontainer.json create mode 100644 .devcontainer/cuda12.0ext-llvm14/devcontainer.json create mode 100644 .devcontainer/cuda12.6ext-gcc12/devcontainer.json create mode 100644 .devcontainer/cuda12.6ext-llvm18/devcontainer.json diff --git a/.devcontainer/cuda12.0ext-gcc12/devcontainer.json b/.devcontainer/cuda12.0ext-gcc12/devcontainer.json new file mode 100644 index 00000000000..eac9174d848 --- /dev/null +++ b/.devcontainer/cuda12.0ext-gcc12/devcontainer.json @@ -0,0 +1,54 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.0ext", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.0ext-gcc12", + "CCCL_CUDA_VERSION": "12.0", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.0ext-gcc12", + "CCCL_CUDA_EXTENDED": "true" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.0ext-gcc12" +} diff --git a/.devcontainer/cuda12.0ext-llvm14/devcontainer.json b/.devcontainer/cuda12.0ext-llvm14/devcontainer.json new file mode 100644 index 00000000000..deebd12ad31 --- /dev/null +++ b/.devcontainer/cuda12.0ext-llvm14/devcontainer.json @@ -0,0 +1,54 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.12-cpp-llvm14-cuda12.0ext", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.0ext-llvm14", + "CCCL_CUDA_VERSION": "12.0", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "14", + "CCCL_BUILD_INFIX": "cuda12.0ext-llvm14", + "CCCL_CUDA_EXTENDED": "true" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.0ext-llvm14" +} diff --git a/.devcontainer/cuda12.6ext-gcc12/devcontainer.json b/.devcontainer/cuda12.6ext-gcc12/devcontainer.json new file mode 100644 index 00000000000..f3afa152724 --- /dev/null +++ b/.devcontainer/cuda12.6ext-gcc12/devcontainer.json @@ -0,0 +1,54 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.6ext", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6ext-gcc12", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.6ext-gcc12", + "CCCL_CUDA_EXTENDED": "true" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6ext-gcc12" +} diff --git a/.devcontainer/cuda12.6ext-llvm18/devcontainer.json b/.devcontainer/cuda12.6ext-llvm18/devcontainer.json new file mode 100644 index 00000000000..b19814ba1f7 --- /dev/null +++ b/.devcontainer/cuda12.6ext-llvm18/devcontainer.json @@ -0,0 +1,54 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.12-cpp-llvm18-cuda12.6ext", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6ext-llvm18", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "18", + "CCCL_BUILD_INFIX": "cuda12.6ext-llvm18", + "CCCL_CUDA_EXTENDED": "true" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6ext-llvm18" +} diff --git a/ci/build_cudax.sh b/ci/build_cudax.sh index 2dff2549720..d640951b745 100755 --- a/ci/build_cudax.sh +++ b/ci/build_cudax.sh @@ -10,6 +10,12 @@ PRESET="cudax-cpp$CXX_STANDARD" CMAKE_OPTIONS="" +# Enable extra mathlibs if we're in an extended CUDA image: +if $CCCL_CUDA_EXTENDED; then + echo "Image with extended CUDA libs detected, enabling STF MathLibs." + CMAKE_OPTIONS="$CMAKE_OPTIONS -Dcudax_ENABLE_CUDASTF_MATHLIBS=ON" +fi + configure_and_build_preset "CUDA Experimental" "$PRESET" "$CMAKE_OPTIONS" print_time_summary diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 2586fab09e6..b602061b299 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -8,6 +8,10 @@ workflows: # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'llvm16']} # override: + - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']} + - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']} + - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']} + - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']} pull_request: # Old CTK @@ -46,10 +50,10 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']} + - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']} + - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']} + - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']} + - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} # cccl-infra: @@ -169,8 +173,9 @@ host_compilers: # Jobs support the following properties: # -# - gpu: Whether the job requires a GPU runner. Default is false. # - name: The human-readable name of the job. Default is the capitalized job key. +# - gpu: Whether the job requires a GPU runner. Default is false. +# - cuda_ext: Whether the job requires a devcontainer with extra CUDA libraries. Default is false. # - needs: # - A list of jobs that must be completed before this job can run. Default is an empty list. # - These are automatically added if needed: @@ -188,6 +193,10 @@ jobs: test: { gpu: true, needs: 'build' } test_nobuild: { gpu: true, name: 'Test', invoke: { prefix: 'test' } } + # Use images with extra CUDA libs: + build_ext: { name: "Build (extra CTK libs)", gpu: false, cuda_ext: true, invoke: { prefix: 'build' } } + test_ext: { name: "Test (extra CTK libs)", gpu: true, cuda_ext: true, invoke: { prefix: 'test' }, needs: 'build_ext' } + # CCCL: infra: { gpu: true } # example project launches a kernel From 4fad3b04c3515bfb50abb5ead3d60decee3cf5b6 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 30 Oct 2024 22:16:42 +0100 Subject: [PATCH 6/8] Misc C++ fixes in stf examples using CUDA libraries --- .../examples/stf/linear_algebra/06-pdgemm.cu | 22 ++-- .../stf/linear_algebra/07-cholesky.cu | 60 ++++----- cudax/examples/stf/linear_algebra/07-potri.cu | 116 +++++++++--------- .../stf/linear_algebra/cg_dense_2D.cu | 18 +-- cudax/examples/stf/linear_algebra/strassen.cu | 4 +- .../conjugateGradientMultiDeviceCG_custf.cu | 8 +- .../MonteCarloMultiGPU.cu | 30 ++--- cudax/test/stf/examples/07-cholesky-redux.cu | 48 ++++---- .../test/stf/examples/07-cholesky-unified.cu | 50 ++++---- cudax/test/stf/gnu/06-pdgemm.cpp | 26 ++-- cudax/test/stf/gnu/07-cholesky.cpp | 62 +++++----- 11 files changed, 222 insertions(+), 222 deletions(-) diff --git a/cudax/examples/stf/linear_algebra/06-pdgemm.cu b/cudax/examples/stf/linear_algebra/06-pdgemm.cu index 07835093b71..d68b0249e19 100644 --- a/cudax/examples/stf/linear_algebra/06-pdgemm.cu +++ b/cudax/examples/stf/linear_algebra/06-pdgemm.cu @@ -160,9 +160,9 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -251,14 +251,14 @@ void PDGEMM(stream_ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB) cuda_safe_call(cudaGetDeviceCount(&ndevs)); /* Warm up allocators */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; } /* Initializes CUBLAS on all devices */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { cuda_safe_call(cudaSetDevice(d)); get_cublas_handle(); diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu index 144721ff184..578c9a0b95d 100644 --- a/cudax/examples/stf/linear_algebra/07-cholesky.cu +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -392,24 +392,24 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); cuda_safe_call(cudaSetDevice(0)); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { int dev_akk = A.get_preferred_devid(K, K); cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -450,17 +450,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -473,17 +473,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -540,14 +540,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -559,7 +559,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -570,7 +570,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -584,7 +584,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -595,7 +595,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -637,7 +637,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; @@ -688,9 +688,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu index e3fb3dd55b7..6855e563578 100644 --- a/cudax/examples/stf/linear_algebra/07-potri.cu +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -93,10 +93,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = get_handle(rowb, colb); @@ -173,10 +173,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -804,9 +804,9 @@ void PDNRM2_HOST(matrix* A, double* result) ctx.get_dot()->set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->get_handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -833,21 +833,21 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { cuda_try(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_try(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_try(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -888,17 +888,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -911,17 +911,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -983,16 +983,16 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -1005,7 +1005,7 @@ void PDGEMM(cublasOperation_t transa, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -1016,7 +1016,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -1030,7 +1030,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -1041,7 +1041,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -1062,22 +1062,22 @@ void PDTRTRI(matrix& A, cublasFillMode_t uplo, cublasDiagType_t diag) nvtxRangePushA("SUBMIT_PDTRTRI"); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, -1.0, A, k, k, A, m, k); } - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, m, k, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRSM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, 1.0, A, k, k, A, k, n); @@ -1101,20 +1101,20 @@ void PDLAUUM(matrix& A, cublasFillMode_t uplo) nvtxRangePushA("SUBMIT_PDLAUUM"); - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(n, n))); DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, 1.0, A, k, n, 1.0, A, n, n); - for (int m = n + 1; m < k; m++) + for (size_t m = n + 1; m < k; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, 1.0, A, k, m, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, k, k, A, k, n); @@ -1136,7 +1136,7 @@ void PDSYMM(cublasSideMode_t side, double beta, matrix& C) { - int k, m, n; + size_t k, m, n; double zbeta; double zone = (double) 1.0; @@ -1272,15 +1272,15 @@ void PDTRMM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); } @@ -1292,9 +1292,9 @@ void PDTRMM(cublasSideMode_t side, //================================================ else { - for (int m = B.mt - 1; m > -1; m--) + for (int m = static_cast(B.mt) - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1315,9 +1315,9 @@ void PDTRMM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = B.mt - 1; m > -1; m--) + for (int m = static_cast(B.mt) - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1335,13 +1335,13 @@ void PDTRMM(cublasSideMode_t side, //================================================ else { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); } @@ -1359,9 +1359,9 @@ void PDTRMM(cublasSideMode_t side, //============================================ if (trans == CUBLAS_OP_N) { - for (int n = B.nt - 1; n > -1; n--) + for (int n = static_cast(B.nt) - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1379,15 +1379,15 @@ void PDTRMM(cublasSideMode_t side, //================================================= else { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); } @@ -1402,15 +1402,15 @@ void PDTRMM(cublasSideMode_t side, //============================================ if (trans == CUBLAS_OP_N) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); } @@ -1422,9 +1422,9 @@ void PDTRMM(cublasSideMode_t side, //================================================= else { - for (int n = B.nt - 1; n > -1; n--) + for (int n = static_cast(B.nt) - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1462,7 +1462,7 @@ void run(int N, int NB) int ndevs; cuda_try(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto ldummy = ctx.logical_data(shape_of>(1)); ctx.task(exec_place::device(d), ldummy.write())->*[](cudaStream_t, auto) { diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu index e98f737e524..3f41e4cca04 100644 --- a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu +++ b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu @@ -65,7 +65,7 @@ public: if (is_tmp) { // There is no physical backing for this temporary vector - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -91,7 +91,7 @@ public: { handles.resize(nblocks); - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -107,12 +107,12 @@ public: void fill(const std::function& f) { size_t bs = block_size; - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { ctx.task(exec_place::host, handles[b]->write())->*[&f, b, bs](cudaStream_t stream, auto ds) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_row = 0; local_row < ds.extent(0); local_row++) + for (size_t local_row = 0; local_row < ds.extent(0); local_row++) { ds(local_row) = f(local_row + b * bs); } @@ -234,7 +234,7 @@ class scalar DOT(vector& a, class vector& b) scalar global_res(true); // Loop over all blocks, - for (int bid = 0; bid < a.nblocks; bid++) + for (size_t bid = 0; bid < a.nblocks; bid++) { scalar res(true); @@ -267,7 +267,7 @@ void AXPY(const class scalar& alpha, class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->* @@ -286,7 +286,7 @@ void SCALE_AXPY(const scalar& alpha, const class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->*[](cudaStream_t stream, auto dalpha, auto dx, auto dy) { @@ -315,9 +315,9 @@ void GEMV(double alpha, class matrix& a, class vector& x, double beta, class vec size_t block_size = x.block_size; assert(block_size == y.block_size); - for (int row_y = 0; row_y < y.nblocks; row_y++) + for (size_t row_y = 0; row_y < y.nblocks; row_y++) { - for (int row_x = 0; row_x < x.nblocks; row_x++) + for (size_t row_x = 0; row_x < x.nblocks; row_x++) { double local_beta = (row_x == 0) ? beta : 1.0; diff --git a/cudax/examples/stf/linear_algebra/strassen.cu b/cudax/examples/stf/linear_algebra/strassen.cu index 0b00bd41f48..f06e0a65620 100644 --- a/cudax/examples/stf/linear_algebra/strassen.cu +++ b/cudax/examples/stf/linear_algebra/strassen.cu @@ -417,9 +417,9 @@ void strassen_test(context& ctx, size_t N) cuda_safe_call(cudaHostRegister(B, N * N * sizeof(double), cudaHostRegisterPortable)); cuda_safe_call(cudaHostRegister(C, N * N * sizeof(double), cudaHostRegisterPortable)); - for (int col = 0; col < N; col++) + for (size_t col = 0; col < N; col++) { - for (int row = 0; row < N; row++) + for (size_t row = 0; row < N; row++) { A[row + N * col] = 1.0; B[row + N * col] = -1.0; diff --git a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu index 0d1ec06345b..4c069332e88 100644 --- a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu +++ b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu @@ -75,8 +75,8 @@ __device__ double grid_dot_result = 0.0; void genTridiag(slice I, slice J, slice val, int N, int nz) { I(0) = 0, J(0) = 0, J(1) = 1; - val(0) = (float) rand() / RAND_MAX + 10.0f; - val(1) = (float) rand() / RAND_MAX; + val(0) = (float) rand() / (float)RAND_MAX + 10.0f; + val(1) = (float) rand() / (float)RAND_MAX; int start; for (int i = 1; i < N; i++) @@ -100,11 +100,11 @@ void genTridiag(slice I, slice J, slice val, int N, int nz) } val(start) = val(start - 1); - val(start + 1) = (float) rand() / RAND_MAX + 10.0f; + val(start + 1) = (float) rand() / (float)RAND_MAX + 10.0f; if (i < N - 1) { - val(start + 2) = (float) rand() / RAND_MAX; + val(start + 2) = (float) rand() / (float)RAND_MAX; } } diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu index 3cae7cb6bde..95ca023e11c 100644 --- a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu @@ -289,7 +289,7 @@ int main(int argc, char** argv) int gpuBase, gpuIndex; int i; - double delta, ref, sumDelta, sumRef, sumReserve; + double delta, sumReserve; // printf("MonteCarloMultiGPU\n"); // printf("==================\n"); @@ -369,17 +369,17 @@ int main(int argc, char** argv) } // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; + // sumDelta = 0; + // sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueBS[i]; + // sumDelta += delta; + // sumRef += fabs(ref); if (delta > 1e-6) { @@ -415,17 +415,17 @@ int main(int argc, char** argv) // printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; + // sumDelta = 0; + // sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueBS[i]; + // sumDelta += delta; + // sumRef += fabs(ref); if (delta > 1e-6) { @@ -444,15 +444,15 @@ int main(int argc, char** argv) // printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; - sumRef = 0; + // sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); - ref = callValueCPU.Expected; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueCPU.Expected; + // sumDelta += delta; + // sumRef += fabs(ref); // printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); // printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } diff --git a/cudax/test/stf/examples/07-cholesky-redux.cu b/cudax/test/stf/examples/07-cholesky-redux.cu index 765047c8669..6e70f7a6a1d 100644 --- a/cudax/test/stf/examples/07-cholesky-redux.cu +++ b/cudax/test/stf/examples/07-cholesky-redux.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -367,9 +367,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -454,17 +454,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -477,17 +477,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -544,14 +544,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -563,7 +563,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -574,7 +574,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -588,7 +588,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -599,7 +599,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -641,7 +641,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] __device__(size_t, auto) {}; diff --git a/cudax/test/stf/examples/07-cholesky-unified.cu b/cudax/test/stf/examples/07-cholesky-unified.cu index 7a52ed5ac41..c174e66bd85 100644 --- a/cudax/test/stf/examples/07-cholesky-unified.cu +++ b/cudax/test/stf/examples/07-cholesky-unified.cu @@ -84,10 +84,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -162,17 +162,17 @@ public: void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[=, self = this](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*self, row, col); @@ -348,9 +348,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -435,17 +435,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -458,17 +458,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -525,14 +525,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -544,7 +544,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -555,7 +555,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -569,7 +569,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -580,7 +580,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); diff --git a/cudax/test/stf/gnu/06-pdgemm.cpp b/cudax/test/stf/gnu/06-pdgemm.cpp index 1ae2f363c14..850ad136786 100644 --- a/cudax/test/stf/gnu/06-pdgemm.cpp +++ b/cudax/test/stf/gnu/06-pdgemm.cpp @@ -155,21 +155,21 @@ class matrix void fill(T (*func)(matrix*, int, int)) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); #ifdef TILED // tiles are stored contiguously - int ld = mb; + size_t ld = mb; #else - int ld = m; + size_t ld = m; #endif - for (int lrow = 0; lrow < mb; lrow++) + for (size_t lrow = 0; lrow < mb; lrow++) { - for (int lcol = 0; lcol < nb; lcol++) + for (size_t lcol = 0; lcol < nb; lcol++) { size_t row = lrow + rowb * mb; size_t col = lcol + colb * nb; @@ -257,14 +257,14 @@ void PDGEMM(Ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -277,7 +277,7 @@ void PDGEMM(Ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -288,7 +288,7 @@ void PDGEMM(Ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -302,7 +302,7 @@ void PDGEMM(Ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -313,7 +313,7 @@ void PDGEMM(Ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); diff --git a/cudax/test/stf/gnu/07-cholesky.cpp b/cudax/test/stf/gnu/07-cholesky.cpp index 011de211e5c..e7c5f7fbfd1 100644 --- a/cudax/test/stf/gnu/07-cholesky.cpp +++ b/cudax/test/stf/gnu/07-cholesky.cpp @@ -90,10 +90,10 @@ class matrix handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -168,17 +168,17 @@ class matrix void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[this, fun, rowb, colb](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*this, row, col); @@ -351,9 +351,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -380,23 +380,23 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); cuda_safe_call(cudaSetDevice(0)); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -437,17 +437,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -460,17 +460,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -527,14 +527,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -546,7 +546,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -557,7 +557,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -571,7 +571,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -582,7 +582,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -662,9 +662,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); From ec877667795097216560eabef4f768bed13e4e68 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Thu, 31 Oct 2024 17:12:55 +0000 Subject: [PATCH 7/8] Run clang-format --- .../conjugateGradientMultiDeviceCG_custf.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu index 4c069332e88..130cd2c8b7a 100644 --- a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu +++ b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu @@ -75,8 +75,8 @@ __device__ double grid_dot_result = 0.0; void genTridiag(slice I, slice J, slice val, int N, int nz) { I(0) = 0, J(0) = 0, J(1) = 1; - val(0) = (float) rand() / (float)RAND_MAX + 10.0f; - val(1) = (float) rand() / (float)RAND_MAX; + val(0) = (float) rand() / (float) RAND_MAX + 10.0f; + val(1) = (float) rand() / (float) RAND_MAX; int start; for (int i = 1; i < N; i++) @@ -100,11 +100,11 @@ void genTridiag(slice I, slice J, slice val, int N, int nz) } val(start) = val(start - 1); - val(start + 1) = (float) rand() / (float)RAND_MAX + 10.0f; + val(start + 1) = (float) rand() / (float) RAND_MAX + 10.0f; if (i < N - 1) { - val(start + 2) = (float) rand() / (float)RAND_MAX; + val(start + 2) = (float) rand() / (float) RAND_MAX; } } From 8214c6d5f683f7d2523ba8a95568719777b96443 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 19 Nov 2024 15:25:17 +0000 Subject: [PATCH 8/8] Remove vanilla CUDA sample from STF tests. There remains a STF-specific version of this example. --- cudax/test/stf/CMakeLists.txt | 1 - .../jacobiCudaGraphs/jacobi.cu | 626 ------------------ 2 files changed, 627 deletions(-) delete mode 100644 cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 65082348ae8..56f7036f75f 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -136,7 +136,6 @@ set(stf_test_mathlib_sources cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu # Reduce compilation time by not adding this (useless) example # cuda-samples/0_Introduction/vectorAdd/vectorAdd - cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu diff --git a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu deleted file mode 100644 index 4b8e8be4600..00000000000 --- a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu +++ /dev/null @@ -1,626 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDASTF in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// This sample demonstrates Instantiated CUDA Graph Update -// with Jacobi Iterative Method in 3 different methods: -// 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with -// cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() - -// CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph -// method - -// Jacobi method on a linear system A*x = b, -// where A is diagonally dominant and the exact solution consists -// of all ones. - -#include - -#include - -using cuda::experimental::stf::cuda_safe_call; - -#define N_ROWS 512 - -namespace cg = cooperative_groups; - -// 8 Rows of square-matrix A processed by each CTA. -// This can be max 32 and only power of 2 (i.e., 2/4/8/16/32). -#define ROWS_PER_CTA 8 - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 -#else -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*) address; - unsigned long long int old = *address_as_ull, assumed; - - do - { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != - // NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 -// elsewhere. The elements of the right hand side b all equal 2*n, hence the -// exact solution x to A*x = b is a vector of ones. -void createLinearSystem(float* A, double* b) -{ - int i, j; - for (i = 0; i < N_ROWS; i++) - { - b[i] = 2.0 * N_ROWS; - for (j = 0; j < N_ROWS; j++) - { - A[i * N_ROWS + j] = 1.0; - } - A[i * N_ROWS + i] = N_ROWS + 1.0; - } -} - -static __global__ void -JacobiMethod(const float* A, const double* b, const float conv_threshold, double* x, double* x_new, double* sum) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ double x_shared[N_ROWS]; // N_ROWS == n - __shared__ double b_shared[ROWS_PER_CTA + 1]; - - for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) - { - x_shared[i] = x[i]; - } - - if (threadIdx.x < ROWS_PER_CTA) - { - int k = threadIdx.x; - for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); - k += ROWS_PER_CTA, i += ROWS_PER_CTA) - { - b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; - } - } - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) - { - double rowThreadSum = 0.0; - for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) - { - rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); - } - - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) - { - rowThreadSum += tile32.shfl_down(rowThreadSum, offset); - } - - if (tile32.thread_rank() == 0) - { - atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); - } - } - - cg::sync(cta); - - if (threadIdx.x < ROWS_PER_CTA) - { - cg::thread_block_tile tile8 = cg::tiled_partition(cta); - double temp_sum = 0.0; - - int k = threadIdx.x; - - for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); - k += ROWS_PER_CTA, i += ROWS_PER_CTA) - { - double dx = b_shared[i % (ROWS_PER_CTA + 1)]; - dx /= A[i * N_ROWS + i]; - - x_new[i] = (x_shared[i] + dx); - temp_sum += fabs(dx); - } - - for (int offset = tile8.size() / 2; offset > 0; offset /= 2) - { - temp_sum += tile8.shfl_down(temp_sum, offset); - } - - if (tile8.thread_rank() == 0) - { - atomicAdd(sum, temp_sum); - } - } -} - -// Thread block size for finalError kernel should be multiple of 32 -static __global__ void finalError(double* x, double* g_sum) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - extern __shared__ double warpSum[]; - double sum = 0.0; - - int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) - { - double d = x[i] - 1.0; - sum += fabs(d); - } - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) - { - sum += tile32.shfl_down(sum, offset); - } - - if (tile32.thread_rank() == 0) - { - warpSum[threadIdx.x / warpSize] = sum; - } - - cg::sync(cta); - - double blockSum = 0.0; - if (threadIdx.x < (blockDim.x / warpSize)) - { - blockSum = warpSum[threadIdx.x]; - } - - if (threadIdx.x < 32) - { - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) - { - blockSum += tile32.shfl_down(blockSum, offset); - } - if (tile32.thread_rank() == 0) - { - atomicAdd(g_sum, blockSum); - } - } -} - -// Run the Jacobi method for A*x = b on GPU with CUDA Graph - -// cudaGraphExecKernelNodeSetParams(). -double JacobiMethodGpuCudaGraphExecKernelSetParams( - const float* A, - const double* b, - const float conv_threshold, - const int max_iter, - double* x, - double* x_new, - cudaStream_t stream) -{ - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); - cudaGraph_t graph; - cudaGraphExec_t graphExec = NULL; - - double sum = 0.0; - double* d_sum = NULL; - cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); - - std::vector nodeDependencies; - cudaGraphNode_t memcpyNode, jacobiKernelNode, memsetNode; - cudaMemcpy3DParms memcpyParams; - cudaMemsetParams memsetParams; - - memsetParams.dst = (void*) d_sum; - memsetParams.value = 0; - memsetParams.pitch = 0; - // elementSize can be max 4 bytes, so we take sizeof(float) and width=2 - memsetParams.elementSize = sizeof(float); - memsetParams.width = 2; - memsetParams.height = 1; - - cuda_safe_call(cudaGraphCreate(&graph, 0)); - cuda_safe_call(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - nodeDependencies.push_back(memsetNode); - - cudaKernelNodeParams NodeParams0, NodeParams1; - NodeParams0.func = (void*) JacobiMethod; - NodeParams0.gridDim = nblocks; - NodeParams0.blockDim = nthreads; - NodeParams0.sharedMemBytes = 0; - void* kernelArgs0[6] = {(void*) &A, (void*) &b, (void*) &conv_threshold, (void*) &x, (void*) &x_new, (void*) &d_sum}; - NodeParams0.kernelParams = kernelArgs0; - NodeParams0.extra = NULL; - - cuda_safe_call( - cudaGraphAddKernelNode(&jacobiKernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &NodeParams0)); - - nodeDependencies.clear(); - nodeDependencies.push_back(jacobiKernelNode); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr(d_sum, sizeof(double), 1, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(&sum, sizeof(double), 1, 1); - memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); - memcpyParams.kind = cudaMemcpyDeviceToHost; - - cuda_safe_call( - cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); - - cuda_safe_call(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - - NodeParams1.func = (void*) JacobiMethod; - NodeParams1.gridDim = nblocks; - NodeParams1.blockDim = nthreads; - NodeParams1.sharedMemBytes = 0; - void* kernelArgs1[6] = {(void*) &A, (void*) &b, (void*) &conv_threshold, (void*) &x_new, (void*) &x, (void*) &d_sum}; - NodeParams1.kernelParams = kernelArgs1; - NodeParams1.extra = NULL; - - int k = 0; - for (k = 0; k < max_iter; k++) - { - cuda_safe_call( - cudaGraphExecKernelNodeSetParams(graphExec, jacobiKernelNode, ((k & 1) == 0) ? &NodeParams0 : &NodeParams1)); - cuda_safe_call(cudaGraphLaunch(graphExec, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - - if (sum <= conv_threshold) - { - cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) - { - finalError<<>>(x_new, d_sum); - } - else - { - finalError<<>>(x, d_sum); - } - - cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - // printf("GPU iterations : %d\n", k + 1); - // printf("GPU error: %.3e\n", sum); - break; - } - } - - cuda_safe_call(cudaFree(d_sum)); - return sum; -} - -// Run the Jacobi method for A*x = b on GPU with Instantiated CUDA Graph Update -// API - cudaGraphExecUpdate(). -double JacobiMethodGpuCudaGraphExecUpdate( - const float* A, - const double* b, - const float conv_threshold, - const int max_iter, - double* x, - double* x_new, - cudaStream_t stream) -{ - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); - cudaGraph_t graph; - cudaGraphExec_t graphExec = NULL; - - double sum = 0.0; - double* d_sum; - cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); - - int k = 0; - for (k = 0; k < max_iter; k++) - { - cuda_safe_call(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - if ((k & 1) == 0) - { - JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); - } - else - { - JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); - } - cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); - cuda_safe_call(cudaStreamEndCapture(stream, &graph)); - - if (graphExec == NULL) - { - cuda_safe_call(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - } - else - { - cudaGraphExecUpdateResult updateResult_out; - cuda_safe_call(cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out)); - if (updateResult_out != cudaGraphExecUpdateSuccess) - { - if (graphExec != NULL) - { - cuda_safe_call(cudaGraphExecDestroy(graphExec)); - } - printf("k = %d graph update failed with error - %d\n", k, updateResult_out); - cuda_safe_call(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - } - } - cuda_safe_call(cudaGraphLaunch(graphExec, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - - if (sum <= conv_threshold) - { - cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) - { - finalError<<>>(x_new, d_sum); - } - else - { - finalError<<>>(x, d_sum); - } - - cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - // printf("GPU iterations : %d\n", k + 1); - // printf("GPU error: %.3e\n", sum); - break; - } - } - - cuda_safe_call(cudaFree(d_sum)); - return sum; -} - -// Run the Jacobi method for A*x = b on GPU without CUDA Graph. -double JacobiMethodGpu( - const float* A, - const double* b, - const float conv_threshold, - const int max_iter, - double* x, - double* x_new, - cudaStream_t stream) -{ - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); - - double sum = 0.0; - double* d_sum; - cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); - int k = 0; - - for (k = 0; k < max_iter; k++) - { - cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - if ((k & 1) == 0) - { - JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); - } - else - { - JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); - } - cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - - if (sum <= conv_threshold) - { - cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) - { - finalError<<>>(x_new, d_sum); - } - else - { - finalError<<>>(x, d_sum); - } - - cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); - cuda_safe_call(cudaStreamSynchronize(stream)); - // printf("GPU iterations : %d\n", k + 1); - // printf("GPU error: %.3e\n", sum); - break; - } - } - - cuda_safe_call(cudaFree(d_sum)); - return sum; -} - -// Run the Jacobi method for A*x = b on CPU. -void JacobiMethodCPU(float* A, double* b, float conv_threshold, int max_iter, int* num_iter, double* x) -{ - double* x_new; - x_new = (double*) calloc(N_ROWS, sizeof(double)); - int k; - - for (k = 0; k < max_iter; k++) - { - double sum = 0.0; - for (int i = 0; i < N_ROWS; i++) - { - double temp_dx = b[i]; - for (int j = 0; j < N_ROWS; j++) - { - temp_dx -= A[i * N_ROWS + j] * x[j]; - } - temp_dx /= A[i * N_ROWS + i]; - x_new[i] += temp_dx; - sum += fabs(temp_dx); - } - - for (int i = 0; i < N_ROWS; i++) - { - x[i] = x_new[i]; - } - - if (sum <= conv_threshold) - { - break; - } - } - *num_iter = k + 1; - free(x_new); -} - -int main() -{ - // if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - // printf("Command line: jacobiCudaGraphs [-option]\n"); - // printf("Valid options:\n"); - // printf( - // "-gpumethod=<0,1 or 2> : 0 - [Default] " - // "JacobiMethodGpuCudaGraphExecKernelSetParams\n"); - // printf(" : 1 - JacobiMethodGpuCudaGraphExecUpdate\n"); - // printf(" : 2 - JacobiMethodGpu - Non CUDA Graph\n"); - // printf("-device=device_num : cuda device id"); - // printf("-help : Output a help message\n"); - // exit(EXIT_SUCCESS); - // } - // - int gpumethod = 0; - // if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) { - // gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod"); - // - // if (gpumethod < 0 || gpumethod > 2) { - // printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n", - // gpumethod); - // exit(EXIT_SUCCESS); - // } - // } - - // int dev = findCudaDevice(argc, (const char **)argv); - // int dev = 0; - - double* b = NULL; - float* A = NULL; - cuda_safe_call(cudaMallocHost(&b, N_ROWS * sizeof(double))); - memset(b, 0, N_ROWS * sizeof(double)); - cuda_safe_call(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); - memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); - - createLinearSystem(A, b); - double* x = NULL; - // start with array of all zeroes - x = (double*) calloc(N_ROWS, sizeof(double)); - - float conv_threshold = 1.0e-2; - int max_iter = 4 * N_ROWS * N_ROWS; - int cnt = 0; - - // // create timer - // StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; - // sdkCreateTimer(&timerCPU); - // - // sdkStartTimer(&timerCPU); - JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); - - double sum = 0.0; - // Compute error - for (int i = 0; i < N_ROWS; i++) - { - double d = x[i] - 1.0; - sum += fabs(d); - } - // sdkStopTimer(&timerCPU); - // printf("CPU iterations : %d\n", cnt); - // printf("CPU error: %.3e\n", sum); - // printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); - - float* d_A; - double *d_b, *d_x, *d_x_new; - cudaStream_t stream1; - cuda_safe_call(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)); - cuda_safe_call(cudaMalloc(&d_b, sizeof(double) * N_ROWS)); - cuda_safe_call(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS)); - cuda_safe_call(cudaMalloc(&d_x, sizeof(double) * N_ROWS)); - cuda_safe_call(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS)); - - cuda_safe_call(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1)); - cuda_safe_call(cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1)); - cuda_safe_call(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, cudaMemcpyHostToDevice, stream1)); - cuda_safe_call(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, cudaMemcpyHostToDevice, stream1)); - - // sdkCreateTimer(&timerGpu); - // sdkStartTimer(&timerGpu); - - double sumGPU = 0.0; - if (gpumethod == 0) - { - sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); - } - else if (gpumethod == 1) - { - sumGPU = JacobiMethodGpuCudaGraphExecUpdate(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); - } - else if (gpumethod == 2) - { - sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); - } - - // sdkStopTimer(&timerGpu); - // printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu)); - - cuda_safe_call(cudaFree(d_b)); - cuda_safe_call(cudaFree(d_A)); - cuda_safe_call(cudaFree(d_x)); - cuda_safe_call(cudaFree(d_x_new)); - - cuda_safe_call(cudaFreeHost(A)); - cuda_safe_call(cudaFreeHost(b)); - - // printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); - - return (fabs(sum - sumGPU) < conv_threshold) ? EXIT_SUCCESS : EXIT_FAILURE; -}