diff --git a/.gitignore b/.gitignore index 8f17b75..fd32fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ Manifest.toml /LocalPreferences.toml +docs/build diff --git a/README.md b/README.md index 9f0baa3..cf5de13 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,15 @@ pkg> add CUDSS pkg> test CUDSS ``` +## Content + +CUDSS.jl provides a structured approach for leveraging NVIDIA cuDSS functionalities. +It introduces the `CudssSolver` type along with three core routines: `cudss`, `cudss_set`, and `cudss_get`. +Additionally, specialized methods for the `CuSparseMatrixCSR` type have been incorporated for `cholesky`, `ldlt`, `lu` and `\`. +To further enhance performance, in-place variants including `cholesky!`, `ldlt!`, `lu!` and `ldiv!` have been implemented. +These variants optimize performance by reusing the symbolic factorization as well as storage. +This ensures efficient solving of sparse linear systems on GPUs. + ## Examples ### Example 1: Sparse unsymmetric linear system with one right-hand side diff --git a/docs/Project.toml b/docs/Project.toml index 37ab2c5..c9eb349 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,6 +1,8 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [compat] Documenter = "1.0" diff --git a/docs/make.jl b/docs/make.jl index ab2b559..5738ba2 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,6 @@ using Documenter, CUDSS +using LinearAlgebra +using CUDA, CUDA.CUSPARSE makedocs( modules = [CUDSS], diff --git a/docs/src/generic.md b/docs/src/generic.md index 7cabfc9..3474ab0 100644 --- a/docs/src/generic.md +++ b/docs/src/generic.md @@ -1,6 +1,9 @@ -# Examples +## LLᵀ and LLᴴ -## Example 1: Sparse unsymmetric linear system with one right-hand side +```@docs + LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat + LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat +``` ```julia using CUDA, CUDA.CUSPARSE @@ -8,34 +11,51 @@ using CUDSS using LinearAlgebra using SparseArrays -T = Float64 +T = ComplexF64 +R = real(T) n = 100 -A_cpu = sprand(T, n, n, 0.05) + I -b_cpu = rand(T, n) +p = 5 +A_cpu = sprand(T, n, n, 0.01) +A_cpu = A_cpu * A_cpu' + I +B_cpu = rand(T, n, p) -A_gpu = CuSparseMatrixCSR(A_cpu) -b_gpu = CuVector(b_cpu) +A_gpu = CuSparseMatrixCSR(A_cpu |> triu) +B_gpu = CuMatrix(B_cpu) +X_gpu = similar(B_gpu) -F = lu(A_gpu) -x_gpu = F \ b_gpu +F = cholesky(A_gpu, view='U') +X_gpu = F \ B_gpu -r_gpu = b_gpu - A_gpu * x_gpu -norm(r_gpu) +R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu +norm(R_gpu) -# In-place LU -d_gpu = rand(T, n) |> CuVector -B_gpu = A_gpu + Diagonal(d_gpu) -lu!(F, B_gpu) +# In-place LLᴴ +d_gpu = rand(R, n) |> CuVector +A_gpu = A_gpu + Diagonal(d_gpu) +cholesky!(F, A_gpu) -c_cpu = rand(T, n) -c_gpu = CuVector(c_cpu) -ldiv!(x_gpu, F, c_gpu) +C_cpu = rand(T, n, p) +C_gpu = CuMatrix(C_cpu) +ldiv!(X_gpu, F, C_gpu) -r_gpu = c_gpu - B_gpu * x_gpu -norm(r_gpu) +R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu +norm(R_gpu) ``` -## Example 2: Sparse symmetric linear system with multiple right-hand sides +!!! note + If we only store one triangle of `A_gpu`, we can also use the wrappers `Symmetric` and `Hermitian` instead of using the keyword argument `view` in `cholesky`. For real matrices, both wrappers are allowed but only `Hermitian` can be used for complex matrices. + +```julia +H_gpu = Hermitian(A_gpu, :U) +F = cholesky(H_gpu) +``` + +## LDLᵀ and LDLᴴ + +```@docs + LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat + LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat +``` ```julia using CUDA, CUDA.CUSPARSE @@ -63,8 +83,8 @@ norm(R_gpu) # In-place LDLᵀ d_gpu = rand(R, n) |> CuVector -B_gpu = A_gpu + Diagonal(d_gpu) -ldlt!(F, B_gpu) +A_gpu = A_gpu + Diagonal(d_gpu) +ldlt!(F, A_gpu) C_cpu = rand(T, n, p) C_gpu = CuMatrix(C_cpu) @@ -82,7 +102,12 @@ S_gpu = Symmetric(A_gpu, :L) F = ldlt(S_gpu) ``` -## Example 3: Sparse hermitian positive definite linear system with multiple right-hand sides +## LU + +```@docs + LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat + LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat +``` ```julia using CUDA, CUDA.CUSPARSE @@ -90,41 +115,29 @@ using CUDSS using LinearAlgebra using SparseArrays -T = ComplexF64 -R = real(T) +T = Float64 n = 100 -p = 5 -A_cpu = sprand(T, n, n, 0.01) -A_cpu = A_cpu * A_cpu' + I -B_cpu = rand(T, n, p) - -A_gpu = CuSparseMatrixCSR(A_cpu |> triu) -B_gpu = CuMatrix(B_cpu) -X_gpu = similar(B_gpu) - -F = cholesky(A_gpu, view='U') -X_gpu = F \ B_gpu +A_cpu = sprand(T, n, n, 0.05) + I +b_cpu = rand(T, n) -R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu -norm(R_gpu) +A_gpu = CuSparseMatrixCSR(A_cpu) +b_gpu = CuVector(b_cpu) -# In-place LLᴴ -d_gpu = rand(R, n) |> CuVector -B_gpu = A_gpu + Diagonal(d_gpu) -cholesky!(F, B_gpu) +F = lu(A_gpu) +x_gpu = F \ b_gpu -C_cpu = rand(T, n, p) -C_gpu = CuMatrix(C_cpu) -ldiv!(X_gpu, F, C_gpu) +r_gpu = b_gpu - A_gpu * x_gpu +norm(r_gpu) -R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu -norm(R_gpu) -``` +# In-place LU +d_gpu = rand(T, n) |> CuVector +A_gpu = A_gpu + Diagonal(d_gpu) +lu!(F, A_gpu) -!!! note - If we only store one triangle of `A_gpu`, we can also use the wrappers `Symmetric` and `Hermitian` instead of using the keyword argument `view` in `cholesky`. For real matrices, both wrappers are allowed but only `Hermitian` can be used for complex matrices. +c_cpu = rand(T, n) +c_gpu = CuVector(c_cpu) +ldiv!(x_gpu, F, c_gpu) -```julia -H_gpu = Hermitian(A_gpu, :U) -F = cholesky(H_gpu) +r_gpu = c_gpu - A_gpu * x_gpu +norm(r_gpu) ``` diff --git a/src/CUDSS.jl b/src/CUDSS.jl index 6468c8e..d7f62a4 100644 --- a/src/CUDSS.jl +++ b/src/CUDSS.jl @@ -14,7 +14,7 @@ else end import CUDA: @checked, libraryPropertyType, cudaDataType, initialize_context, retry_reclaim, CUstream -import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal +import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal, checksquare import Base: \ include("libcudss.jl") diff --git a/src/generic.jl b/src/generic.jl index 0dc5386..2daea22 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -1,5 +1,19 @@ +""" + solver = lu(A::CuSparseMatrixCSR{T,Cint}) + +Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. + +#### Input argument + +* `A`: a sparse square matrix stored in the `CuSparseMatrixCSR` format. + +#### Output argument + +* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LU decomposition. +""" function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat - n = LinearAlgebra.checksquare(A) + n = checksquare(A) solver = CudssSolver(A, "G", 'F') x = CudssMatrix(T, n) b = CudssMatrix(T, n) @@ -8,8 +22,41 @@ function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat return solver end +""" + solver = lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) + +Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. +""" +function LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat + n = checksquare(A) + cudss_set(solver, A) + x = CudssMatrix(T, n) + b = CudssMatrix(T, n) + cudss("factorization", solver, x, b) + return solver +end + +""" + solver = ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') + +Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. + +#### Input argument + +* `A`: a sparse Hermitian matrix stored in the `CuSparseMatrixCSR` format. + +#### Keyword argument + +*`view`: A character that specifies which triangle of the sparse matrix is provided. Possible options are `L` for the lower triangle, `U` for the upper triangle, and `F` for the full matrix. + +#### Output argument + +* `solver`: Opaque structure [`CudssSolver`](@ref) that stores the factors of the LDLᴴ decomposition. +""" function LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat - n = LinearAlgebra.checksquare(A) + n = checksquare(A) structure = T <: Real ? "S" : "H" solver = CudssSolver(A, structure, view) (T <: Complex) && cudss_set(solver, "pivot_type", 'N') @@ -23,8 +70,41 @@ end LinearAlgebra.ldlt(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.ldlt(A.data, view=A.uplo) LinearAlgebra.ldlt(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.ldlt(A.data, view=A.uplo) +""" + solver = ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) + +Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. +""" +function LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat + n = checksquare(A) + cudss_set(solver, A) + x = CudssMatrix(T, n) + b = CudssMatrix(T, n) + cudss("factorization", solver, x, b) + return solver +end + +""" + solver = cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') + +Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. + +#### Input argument + +* `A`: a sparse Hermitian positive definite matrix stored in the `CuSparseMatrixCSR` format. + +#### Keyword argument + +*`view`: A character that specifies which triangle of the sparse matrix is provided. Possible options are `L` for the lower triangle, `U` for the upper triangle, and `F` for the full matrix. + +#### Output argument + +* `solver`: Opaque structure [`CudssSolver`](@ref) that stores the factors of the LLᴴ decomposition. +""" function LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat - n = LinearAlgebra.checksquare(A) + n = checksquare(A) structure = T <: Real ? "SPD" : "HPD" solver = CudssSolver(A, structure, view) x = CudssMatrix(T, n) @@ -37,17 +117,19 @@ end LinearAlgebra.cholesky(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.cholesky(A.data, view=A.uplo) LinearAlgebra.cholesky(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.cholesky(A.data, view=A.uplo) -for fun in (:lu!, :ldlt!, :cholesky!) - @eval begin - function LinearAlgebra.$fun(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat - n = LinearAlgebra.checksquare(A) - cudss_set(solver.matrix, A) - x = CudssMatrix(T, n) - b = CudssMatrix(T, n) - cudss("factorization", solver, x, b) - return solver - end - end +""" + solver = cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) + +Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`. +The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`. +""" +function LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat + n = checksquare(A) + cudss_set(solver, A) + x = CudssMatrix(T, n) + b = CudssMatrix(T, n) + cudss("factorization", solver, x, b) + return solver end for type in (:CuVector, :CuMatrix)