Add CI with STF MathLib builds #2651

Open · wants to merge 10 commits into base: main
54 changes: 54 additions & 0 deletions .devcontainer/cuda12.0ext-gcc12/devcontainer.json
@@ -0,0 +1,54 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.0ext",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
"if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0ext-gcc12",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.0ext-gcc12",
"CCCL_CUDA_EXTENDED": "true"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=cccl-build,target=/home/coder/cccl/build"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true
}
}
},
"name": "cuda12.0ext-gcc12"
}
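A note on the initializeCommand above: everything after "-c" must be a single string, since `bash -c` treats the next argument as the script and any further arguments as positional parameters ($0, $1, ...), not as code. Unrolled into a standalone script, the host-side setup looks roughly like this (a sketch; the WSL special case presumably avoids backing the volume with a slow Windows bind mount, though the PR does not say so):

```bash
#!/usr/bin/env bash
# Host-side setup, unrolled from the single-string initializeCommand.
# ${localWorkspaceFolder} is normally substituted by the devcontainer
# tooling; it is treated as a plain shell variable here for illustration.

# Create the dot-directories and build directory that get bind-mounted.
mkdir -m 0755 -p "${localWorkspaceFolder}"/.{aws,cache,config}
mkdir -m 0755 -p "${localWorkspaceFolder}/build"

if [[ -n ${WSLENV+set} ]]; then
  # Under WSL, create a plain named volume.
  docker volume create cccl-build
else
  # Elsewhere, back the named volume with the local build directory.
  docker volume create --driver local \
    --opt type=none \
    --opt device="${localWorkspaceFolder}/build" \
    --opt o=bind \
    cccl-build
fi
```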
54 changes: 54 additions & 0 deletions .devcontainer/cuda12.0ext-llvm14/devcontainer.json
@@ -0,0 +1,54 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.12-cpp-llvm14-cuda12.0ext",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
"if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0ext-llvm14",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.0ext-llvm14",
"CCCL_CUDA_EXTENDED": "true"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=cccl-build,target=/home/coder/cccl/build"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true
}
}
},
"name": "cuda12.0ext-llvm14"
}
54 changes: 54 additions & 0 deletions .devcontainer/cuda12.6ext-gcc12/devcontainer.json
@@ -0,0 +1,54 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.6ext",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
"if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.6ext-gcc12",
"CCCL_CUDA_VERSION": "12.6",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.6ext-gcc12",
"CCCL_CUDA_EXTENDED": "true"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=cccl-build,target=/home/coder/cccl/build"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true
}
}
},
"name": "cuda12.6ext-gcc12"
}
54 changes: 54 additions & 0 deletions .devcontainer/cuda12.6ext-llvm18/devcontainer.json
@@ -0,0 +1,54 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.12-cpp-llvm18-cuda12.6ext",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
"if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.6ext-llvm18",
"CCCL_CUDA_VERSION": "12.6",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "18",
"CCCL_BUILD_INFIX": "cuda12.6ext-llvm18",
"CCCL_CUDA_EXTENDED": "true"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=cccl-build,target=/home/coder/cccl/build"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true
}
}
},
"name": "cuda12.6ext-llvm18"
}
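The four new devcontainer configurations are identical apart from the image tag, the DEVCONTAINER_NAME/CCCL_* environment values, and the name field. A quick reviewer-side check (a sketch, assuming jq is available):

```bash
# Normalize and diff two of the new configs; only the image, name, and
# compiler/CUDA-related containerEnv values should differ.
diff \
  <(jq -S . .devcontainer/cuda12.0ext-gcc12/devcontainer.json) \
  <(jq -S . .devcontainer/cuda12.6ext-llvm18/devcontainer.json)
```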
6 changes: 6 additions & 0 deletions ci/build_cudax.sh
@@ -10,6 +10,12 @@ PRESET="cudax-cpp$CXX_STANDARD"

CMAKE_OPTIONS=""

# Enable extra mathlibs if we're in an extended CUDA image:
if [[ "${CCCL_CUDA_EXTENDED:-false}" == "true" ]]; then
echo "Image with extended CUDA libs detected, enabling STF MathLibs."
CMAKE_OPTIONS="$CMAKE_OPTIONS -Dcudax_ENABLE_CUDASTF_MATHLIBS=ON"
fi

configure_and_build_preset "CUDA Experimental" "$PRESET" "$CMAKE_OPTIONS"

print_time_summary
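The CMake side that consumes cudax_ENABLE_CUDASTF_MATHLIBS is not part of this diff. For context, a hypothetical sketch of how such an option is typically wired up (the option name comes from this PR; the target name and linked libraries are illustrative assumptions):

```cmake
# Hypothetical sketch only; the real wiring lives in cudax's CMakeLists,
# which this diff does not show.
option(cudax_ENABLE_CUDASTF_MATHLIBS
       "Build CUDASTF examples/tests that depend on CUDA math libraries" OFF)

if(cudax_ENABLE_CUDASTF_MATHLIBS)
  find_package(CUDAToolkit REQUIRED)
  # cudax.stf.mathlib_examples is an assumed target name.
  target_link_libraries(cudax.stf.mathlib_examples PRIVATE
    CUDA::cublas    # e.g. the PDGEMM example below uses cuBLAS handles
    CUDA::cusolver) # other STF mathlib examples may need dense solvers
endif()
```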
19 changes: 14 additions & 5 deletions ci/matrix.yaml
@@ -8,6 +8,10 @@ workflows:
# - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'llvm16']}
#
override:
- {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
- {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
- {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}

pull_request:
# Old CTK
@@ -46,10 +50,10 @@ workflows:
- {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
- - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
- - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
- - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
- - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
+ - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
+ - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
+ - {jobs: ['test_ext'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
+ - {jobs: ['test_ext'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
# Python and c/parallel jobs:
- {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'}
# cccl-infra:
@@ -169,8 +173,9 @@ host_compilers:

# Jobs support the following properties:
#
- # - gpu: Whether the job requires a GPU runner. Default is false.
# - name: The human-readable name of the job. Default is the capitalized job key.
+ # - gpu: Whether the job requires a GPU runner. Default is false.
+ # - cuda_ext: Whether the job requires a devcontainer with extra CUDA libraries. Default is false.
# - needs:
# - A list of jobs that must be completed before this job can run. Default is an empty list.
# - These are automatically added if needed:
@@ -188,6 +193,10 @@ jobs:
test: { gpu: true, needs: 'build' }
test_nobuild: { gpu: true, name: 'Test', invoke: { prefix: 'test' } }

# Use images with extra CUDA libs:
build_ext: { name: "Build (extra CTK libs)", gpu: false, cuda_ext: true, invoke: { prefix: 'build' } }
test_ext: { name: "Test (extra CTK libs)", gpu: true, cuda_ext: true, invoke: { prefix: 'test' }, needs: 'build_ext' }

# CCCL:
infra: { gpu: true } # example project launches a kernel

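The new cuda_ext job property presumably steers a job onto the *ext devcontainer images added above. The matrix tooling itself is not shown in this diff; a hypothetical Python sketch of the naming convention it would have to follow:

```python
# Hypothetical sketch: derive a devcontainer name from job properties.
# The real logic lives in CCCL's matrix tooling, not shown in this diff.
def devcontainer_name(ctk: str, host_compiler: str, cuda_ext: bool) -> str:
    suffix = "ext" if cuda_ext else ""
    return f"cuda{ctk}{suffix}-{host_compiler}"

# These match the directories added by this PR:
assert devcontainer_name("12.0", "gcc12", True) == "cuda12.0ext-gcc12"
assert devcontainer_name("12.6", "llvm18", True) == "cuda12.6ext-llvm18"
# And the pre-existing naming scheme:
assert devcontainer_name("12.6", "gcc13", False) == "cuda12.6-gcc13"
```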
22 changes: 11 additions & 11 deletions cudax/examples/stf/linear_algebra/06-pdgemm.cu
@@ -160,9 +160,9 @@
{
nvtxRangePushA("FILL");
// Fill blocks by blocks
- for (int colb = 0; colb < nt; colb++)
+ for (size_t colb = 0; colb < nt; colb++)
{
- for (int rowb = 0; rowb < mt; rowb++)
+ for (size_t rowb = 0; rowb < mt; rowb++)
{
// Each task fills a block
auto& h = get_handle(rowb, colb);
@@ -251,14 +251,14 @@ void PDGEMM(stream_ctx& ctx,
double beta,
matrix<double>& C)
{
- for (int m = 0; m < C.mt; m++)
+ for (size_t m = 0; m < C.mt; m++)
{
- for (int n = 0; n < C.nt; n++)
+ for (size_t n = 0; n < C.nt; n++)
{
//=========================================
// alpha*A*B does not contribute; scale C
//=========================================
- int inner_k = transa == CUBLAS_OP_N ? A.n : A.m;
+ size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m;
if (alpha == 0.0 || inner_k == 0)
{
DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n);
@@ -271,7 +271,7 @@
if (transb == CUBLAS_OP_N)
{
assert(A.nt == B.mt);
- for (int k = 0; k < A.nt; k++)
+ for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n);
@@ -282,7 +282,7 @@
//=====================================
else
{
- for (int k = 0; k < A.nt; k++)
+ for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n);
@@ -296,7 +296,7 @@
//=====================================
if (transb == CUBLAS_OP_N)
{
- for (int k = 0; k < A.mt; k++)
+ for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n);
@@ -307,7 +307,7 @@
//==========================================
else
{
- for (int k = 0; k < A.mt; k++)
+ for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n);
@@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB)
cuda_safe_call(cudaGetDeviceCount(&ndevs));

/* Warm up allocators */
- for (size_t d = 0; d < ndevs; d++)
+ for (int d = 0; d < ndevs; d++)
{
auto lX = ctx.logical_data(shape_of<slice<double>>(1));
ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {};
}

/* Initializes CUBLAS on all devices */
- for (size_t d = 0; d < ndevs; d++)
+ for (int d = 0; d < ndevs; d++)
{
cuda_safe_call(cudaSetDevice(d));
get_cublas_handle();
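The index-type changes in this file are signedness fixes rather than behavior changes: the tile counts mt/nt are size_t, while cudaGetDeviceCount reports an int, so the old loops compared values of mixed signedness. A minimal C++ illustration of the pattern being fixed (assuming warnings like -Wsign-compare are enabled, as is usual for CCCL builds):

```cpp
#include <cstddef>

int main()
{
  std::size_t nt = 4; // tile count, same type as matrix<double>::nt
  int ndevs      = 2; // cudaGetDeviceCount(&ndevs) writes an int

  // Old pattern: int index vs size_t bound.
  // for (int colb = 0; colb < nt; colb++) {}   // warning: signed/unsigned comparison

  // Fixed: the index type matches the bound.
  for (std::size_t colb = 0; colb < nt; colb++) {} // OK

  // The device loops go the other way: the bound is an int,
  // so the index becomes int as well.
  // for (std::size_t d = 0; d < ndevs; d++) {} // warning: signed/unsigned comparison
  for (int d = 0; d < ndevs; d++) {} // OK; also matches cudaSetDevice(int)
  return 0;
}
```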